From d624bce9af25e5bc1141de4dcdb50a15223f106d Mon Sep 17 00:00:00 2001
From: Andy Fiddaman
Date: Mon, 9 Oct 2023 20:02:08 +0100
Subject: [PATCH] Back /var/fm/fmd with a dataset from the boot M.2 (#4212)

`/var/fm/fmd` is where the illumos fault management system records data.
We want to preserve this data across system reboots, and to keep it up to
date in real time rather than via periodic copying, so that the information
is still available should the system panic shortly after an event is
recorded.

Fixes: https://github.com/oxidecomputer/omicron/issues/4211
---
 illumos-utils/src/zfs.rs               |  41 ++++--
 sled-agent/src/backing_fs.rs           | 178 +++++++++++++++++++++++++
 sled-agent/src/bootstrap/pre_server.rs |   1 +
 sled-agent/src/lib.rs                  |   1 +
 sled-agent/src/sled_agent.rs           |  22 ++-
 sled-agent/src/storage_manager.rs      |   1 +
 sled-agent/src/swap_device.rs          |   3 -
 sled-hardware/src/disk.rs              |  15 ++-
 8 files changed, 242 insertions(+), 20 deletions(-)
 create mode 100644 sled-agent/src/backing_fs.rs

diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs
index ba8cd8c84a..9118a9a3cd 100644
--- a/illumos-utils/src/zfs.rs
+++ b/illumos-utils/src/zfs.rs
@@ -61,6 +61,9 @@ enum EnsureFilesystemErrorRaw {
 
     #[error("Failed to mount encrypted filesystem: {0}")]
     MountEncryptedFsFailed(crate::ExecutionError),
+
+    #[error("Failed to mount overlay filesystem: {0}")]
+    MountOverlayFsFailed(crate::ExecutionError),
 }
 
 /// Error returned by [`Zfs::ensure_filesystem`].
@@ -202,6 +205,7 @@ impl Zfs {
     /// Creates a new ZFS filesystem named `name`, unless one already exists.
     ///
     /// Applies an optional quota, provided _in bytes_.
+    #[allow(clippy::too_many_arguments)]
     pub fn ensure_filesystem(
         name: &str,
         mountpoint: Mountpoint,
@@ -209,6 +213,7 @@ impl Zfs {
         do_format: bool,
         encryption_details: Option<EncryptionDetails>,
         size_details: Option<SizeDetails>,
+        additional_options: Option<Vec<String>>,
     ) -> Result<(), EnsureFilesystemError> {
         let (exists, mounted) = Self::dataset_exists(name, &mountpoint)?;
         if exists {
@@ -261,7 +266,14 @@ impl Zfs {
             ]);
         }
 
+        if let Some(opts) = additional_options {
+            for o in &opts {
+                cmd.args(&["-o", &o]);
+            }
+        }
+
         cmd.args(&["-o", &format!("mountpoint={}", mountpoint), name]);
+
         execute(cmd).map_err(|err| EnsureFilesystemError {
             name: name.to_string(),
             mountpoint: mountpoint.clone(),
@@ -322,6 +334,20 @@ impl Zfs {
         Ok(())
     }
 
+    pub fn mount_overlay_dataset(
+        name: &str,
+        mountpoint: &Mountpoint,
+    ) -> Result<(), EnsureFilesystemError> {
+        let mut command = std::process::Command::new(PFEXEC);
+        let cmd = command.args(&[ZFS, "mount", "-O", name]);
+        execute(cmd).map_err(|err| EnsureFilesystemError {
+            name: name.to_string(),
+            mountpoint: mountpoint.clone(),
+            err: EnsureFilesystemErrorRaw::MountOverlayFsFailed(err),
+        })?;
+        Ok(())
+    }
+
     // Return (true, mounted) if the dataset exists, (false, false) otherwise,
     // where mounted is if the dataset is mounted.
     fn dataset_exists(
@@ -385,7 +411,7 @@ impl Zfs {
         Zfs::get_value(filesystem_name, &format!("oxide:{}", name))
     }
 
-    fn get_value(
+    pub fn get_value(
         filesystem_name: &str,
         name: &str,
     ) -> Result<String, GetValueError> {
@@ -422,13 +448,12 @@ pub fn get_all_omicron_datasets_for_delete() -> anyhow::Result<Vec<String>> {
         let internal = pool.kind() == crate::zpool::ZpoolKind::Internal;
         let pool = pool.to_string();
         for dataset in &Zfs::list_datasets(&pool)? {
-            // Avoid erasing crashdump datasets on internal pools
-            if dataset == "crash" && internal {
-                continue;
-            }
-
-            // The swap device might be in use, so don't assert that it can be deleted.
-            if dataset == "swap" && internal {
+            // Avoid erasing crashdump, backing data and swap datasets on
+            // internal pools. The swap device may be in use.
+            if internal
+                && (["crash", "backing", "swap"].contains(&dataset.as_str())
+                    || dataset.starts_with("backing/"))
+            {
                 continue;
             }
 
diff --git a/sled-agent/src/backing_fs.rs b/sled-agent/src/backing_fs.rs
new file mode 100644
index 0000000000..5014ac5999
--- /dev/null
+++ b/sled-agent/src/backing_fs.rs
@@ -0,0 +1,178 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Operations for dealing with persistent backing mounts for OS data
+
+// On Oxide hardware, the root filesystem is backed by a ramdisk and is
+// non-persistent. However, there are several things within the root
+// filesystem which are useful to preserve across reboots, and these are
+// backed by persistent datasets on the boot disk.
+//
+// Each boot disk contains a dataset sled_hardware::disk::M2_BACKING_DATASET
+// and, for each backing mount, a child dataset is created under there that
+// is configured with the desired mountpoint in the root filesystem. Since
+// there are multiple disks which can be used to boot, these datasets are
+// also marked with the "canmount=noauto" attribute so that they do not all
+// try to mount automatically and race -- only one could ever succeed. This
+// allows us to come along later and specifically mount the one that we want
+// (the one from the current boot disk) and also perform an overlay mount so
+// that it succeeds even if there is content from the ramdisk image or early
+// boot services present underneath. The overlay mount action is optionally
+// bracketed with a service stop/start.
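+//
+// As an illustrative sketch (the pool name here is hypothetical), the steps
+// performed for each backing mount are roughly equivalent to running:
+//
+//   zfs create -o canmount=noauto -o mountpoint=/var/fm/fmd \
+//       oxi_<pool-uuid>/backing/fmd
+//   svcadm disable -st svc:/system/fmd:default   # stop the linked service
+//   zfs mount -O oxi_<pool-uuid>/backing/fmd     # overlay mount
+//   svcadm enable -s svc:/system/fmd:default     # restart the service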
+
+use camino::Utf8PathBuf;
+use illumos_utils::zfs::{
+    EnsureFilesystemError, GetValueError, Mountpoint, SizeDetails, Zfs,
+};
+
+#[derive(Debug, thiserror::Error)]
+pub enum BackingFsError {
+    #[error("Error administering service: {0}")]
+    Adm(#[from] smf::AdmError),
+
+    #[error("Error retrieving dataset property: {0}")]
+    DatasetProperty(#[from] GetValueError),
+
+    #[error("Error initializing dataset: {0}")]
+    Mount(#[from] EnsureFilesystemError),
+}
+
+struct BackingFs {
+    // Dataset name
+    name: &'static str,
+    // Mountpoint
+    mountpoint: &'static str,
+    // Optional quota, in _bytes_
+    quota: Option<usize>,
+    // Optional compression mode
+    compression: Option<&'static str>,
+    // Linked service
+    service: Option<&'static str>,
+}
+
+impl BackingFs {
+    const fn new(name: &'static str) -> Self {
+        Self {
+            name,
+            mountpoint: "legacy",
+            quota: None,
+            compression: None,
+            service: None,
+        }
+    }
+
+    const fn mountpoint(mut self, mountpoint: &'static str) -> Self {
+        self.mountpoint = mountpoint;
+        self
+    }
+
+    const fn quota(mut self, quota: usize) -> Self {
+        self.quota = Some(quota);
+        self
+    }
+
+    const fn compression(mut self, compression: &'static str) -> Self {
+        self.compression = Some(compression);
+        self
+    }
+
+    const fn service(mut self, service: &'static str) -> Self {
+        self.service = Some(service);
+        self
+    }
+}
+
+const BACKING_FMD_DATASET: &'static str = "fmd";
+const BACKING_FMD_MOUNTPOINT: &'static str = "/var/fm/fmd";
+const BACKING_FMD_SERVICE: &'static str = "svc:/system/fmd:default";
+const BACKING_FMD_QUOTA: usize = 500 * (1 << 20); // 500 MiB
+
+const BACKING_COMPRESSION: &'static str = "on";
+
+const BACKINGFS_COUNT: usize = 1;
+static BACKINGFS: [BackingFs; BACKINGFS_COUNT] =
+    [BackingFs::new(BACKING_FMD_DATASET)
+        .mountpoint(BACKING_FMD_MOUNTPOINT)
+        .quota(BACKING_FMD_QUOTA)
+        .compression(BACKING_COMPRESSION)
+        .service(BACKING_FMD_SERVICE)];
+
+/// Ensure that the backing filesystems are mounted.
+/// If the underlying dataset for a backing fs does not exist on the specified
+/// boot disk then it will be created.
+pub(crate) fn ensure_backing_fs(
+    log: &slog::Logger,
+    boot_zpool_name: &illumos_utils::zpool::ZpoolName,
+) -> Result<(), BackingFsError> {
+    let log = log.new(o!(
+        "component" => "BackingFs",
+    ));
+    for bfs in BACKINGFS.iter() {
+        info!(log, "Processing {}", bfs.name);
+
+        let dataset = format!(
+            "{}/{}/{}",
+            boot_zpool_name,
+            sled_hardware::disk::M2_BACKING_DATASET,
+            bfs.name
+        );
+        let mountpoint = Mountpoint::Path(Utf8PathBuf::from(bfs.mountpoint));
+
+        info!(log, "Ensuring dataset {}", dataset);
+
+        let size_details = Some(SizeDetails {
+            quota: bfs.quota,
+            compression: bfs.compression,
+        });
+
+        Zfs::ensure_filesystem(
+            &dataset,
+            mountpoint.clone(),
+            false, // zoned
+            true,  // do_format
+            None,  // encryption_details,
+            size_details,
+            Some(vec!["canmount=noauto".to_string()]), // options
+        )?;
+
+        // Check if a ZFS filesystem is already mounted on bfs.mountpoint by
+        // retrieving the ZFS `mountpoint` property and comparing it. This
+        // might seem counter-intuitive but if there is a filesystem mounted
+        // there, its mountpoint will match, and if not then we will retrieve
+        // the mountpoint of a higher level filesystem, such as '/'. If we
+        // can't retrieve the property at all, then there is definitely no ZFS
+        // filesystem mounted there - most likely we are running with a
+        // non-ZFS root, such as when net booted during CI.
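+        //
+        // For example (values illustrative): `zfs get -H -o value mountpoint
+        // /var/fm/fmd` prints "/var/fm/fmd" when the backing dataset is
+        // already mounted there, prints the mountpoint of an enclosing
+        // filesystem (such as "/") when it is not, and fails outright when
+        // the root filesystem is not ZFS at all.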
+        if Zfs::get_value(&bfs.mountpoint, "mountpoint")
+            .unwrap_or("not-zfs".to_string())
+            == bfs.mountpoint
+        {
+            info!(log, "{} is already mounted", bfs.mountpoint);
+            continue;
+        }
+
+        if let Some(service) = bfs.service {
+            info!(log, "Stopping service {}", service);
+            smf::Adm::new()
+                .disable()
+                .temporary()
+                .synchronous()
+                .run(smf::AdmSelection::ByPattern(&[service]))?;
+        }
+
+        info!(log, "Mounting {} on {}", dataset, mountpoint);
+
+        Zfs::mount_overlay_dataset(&dataset, &mountpoint)?;
+
+        if let Some(service) = bfs.service {
+            info!(log, "Starting service {}", service);
+            smf::Adm::new()
+                .enable()
+                .synchronous()
+                .run(smf::AdmSelection::ByPattern(&[service]))?;
+        }
+    }
+
+    Ok(())
+}
diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs
index 0899bdd82f..71325fef3d 100644
--- a/sled-agent/src/bootstrap/pre_server.rs
+++ b/sled-agent/src/bootstrap/pre_server.rs
@@ -381,6 +381,7 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> {
         do_format,
         encryption_details,
         quota,
+        None,
     )
     .map_err(StartError::EnsureZfsRamdiskDataset)
 }
diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs
index 5c4dbd8310..4e7921c605 100644
--- a/sled-agent/src/lib.rs
+++ b/sled-agent/src/lib.rs
@@ -17,6 +17,7 @@ pub mod sim;
 pub mod common;
 
 // Modules for the non-simulated sled agent.
+mod backing_fs;
 pub mod bootstrap;
 pub mod config;
 mod http_entrypoints;
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 7e62f6a8a7..5574edca55 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -59,9 +59,15 @@ use illumos_utils::{dladm::MockDladm as Dladm, zone::MockZones as Zones};
 
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
+    #[error("Could not find boot disk")]
+    BootDiskNotFound,
+
     #[error("Configuration error: {0}")]
     Config(#[from] crate::config::ConfigError),
 
+    #[error("Error setting up backing filesystems: {0}")]
+    BackingFs(#[from] crate::backing_fs::BackingFsError),
+
     #[error("Error setting up swap device: {0}")]
     SwapDevice(#[from] crate::swap_device::SwapDeviceError),
 
@@ -268,14 +274,17 @@ impl SledAgent {
         ));
         info!(&log, "SledAgent::new(..) starting");
 
-        // Configure a swap device of the configured size before other system setup.
+        let boot_disk = storage
+            .resources()
+            .boot_disk()
+            .await
+            .ok_or_else(|| Error::BootDiskNotFound)?;
+
+        // Configure a swap device of the configured size before other system
+        // setup.
         match config.swap_device_size_gb {
             Some(sz) if sz > 0 => {
                 info!(log, "Requested swap device of size {} GiB", sz);
-                let boot_disk =
-                    storage.resources().boot_disk().await.ok_or_else(|| {
-                        crate::swap_device::SwapDeviceError::BootDiskNotFound
-                    })?;
                 crate::swap_device::ensure_swap_device(
                     &parent_log,
                     &boot_disk.1,
@@ -290,6 +299,9 @@ impl SledAgent {
             }
         }
 
+        info!(log, "Mounting backing filesystems");
+        crate::backing_fs::ensure_backing_fs(&parent_log, &boot_disk.1)?;
+
         // Ensure we have a thread that automatically reaps process contracts
         // when they become empty. See the comments in
         // illumos-utils/src/running_zone.rs for more detail.
diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs
index bd71371396..c31a4dc0bc 100644
--- a/sled-agent/src/storage_manager.rs
+++ b/sled-agent/src/storage_manager.rs
@@ -417,6 +417,7 @@ impl StorageWorker {
             do_format,
             encryption_details,
             size_details,
+            None,
         )?;
 
         // Ensure the dataset has a usable UUID.
         if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") {
diff --git a/sled-agent/src/swap_device.rs b/sled-agent/src/swap_device.rs
index 5a8f40adbd..6a00b42672 100644
--- a/sled-agent/src/swap_device.rs
+++ b/sled-agent/src/swap_device.rs
@@ -9,9 +9,6 @@ use zeroize::Zeroize;
 
 #[derive(Debug, thiserror::Error)]
 pub enum SwapDeviceError {
-    #[error("Could not find boot disk")]
-    BootDiskNotFound,
-
     #[error("Error running ZFS command: {0}")]
     Zfs(illumos_utils::ExecutionError),
 
diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs
index aec99ae3f8..e3078cbeea 100644
--- a/sled-hardware/src/disk.rs
+++ b/sled-hardware/src/disk.rs
@@ -256,6 +256,7 @@ pub const CRASH_DATASET: &'static str = "crash";
 pub const CLUSTER_DATASET: &'static str = "cluster";
 pub const CONFIG_DATASET: &'static str = "config";
 pub const M2_DEBUG_DATASET: &'static str = "debug";
+pub const M2_BACKING_DATASET: &'static str = "backing";
 // TODO-correctness: This value of 100GiB is a pretty wild guess, and should be
 // tuned as needed.
 pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30);
@@ -282,7 +283,7 @@ static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [
         .compression(DUMP_DATASET_COMPRESSION),
 ];
 
-const M2_EXPECTED_DATASET_COUNT: usize = 5;
+const M2_EXPECTED_DATASET_COUNT: usize = 6;
 static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [
     // Stores software images.
     //
@@ -290,7 +291,11 @@ static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [
     ExpectedDataset::new(INSTALL_DATASET),
     // Stores crash dumps.
     ExpectedDataset::new(CRASH_DATASET),
-    // Stores cluter configuration information.
+    // Backing store for OS data that should be persisted across reboots.
+    // Its children are selectively overlay mounted onto parts of the ramdisk
+    // root.
+    ExpectedDataset::new(M2_BACKING_DATASET),
+    // Stores cluster configuration information.
     //
     // Should be duplicated to both M.2s.
     ExpectedDataset::new(CLUSTER_DATASET),
@@ -524,6 +529,7 @@ impl Disk {
             do_format,
             Some(encryption_details),
             None,
+            None,
         );
 
         keyfile.zero_and_unlink().await.map_err(|error| {
@@ -562,8 +568,8 @@ impl Disk {
                             "Automatically destroying dataset: {}", name
                         );
                         Zfs::destroy_dataset(name).or_else(|err| {
-                            // If we can't find the dataset, that's fine -- it might
-                            // not have been formatted yet.
+                            // If we can't find the dataset, that's fine -- it
+                            // might not have been formatted yet.
                             if let DestroyDatasetErrorVariant::NotFound =
                                 err.err
                             {
@@ -588,6 +594,7 @@ impl Disk {
             do_format,
             encryption_details,
             size_details,
+            None,
         )?;
 
         if dataset.wipe {