diff --git a/Cargo.lock b/Cargo.lock
index bdf2d44ea4..a448600863 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5277,6 +5277,8 @@ dependencies = [
  "key-manager",
  "nexus-client 0.1.0",
  "omicron-common 0.1.0",
+ "omicron-test-utils",
+ "rand 0.8.5",
  "schemars",
  "serde",
  "serde_json",
diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs
index e43f2d841d..3d3e544573 100644
--- a/sled-agent/src/storage_manager.rs
+++ b/sled-agent/src/storage_manager.rs
@@ -50,7 +50,7 @@ static KEY_MANAGER_READY: OnceLock<()> = OnceLock::new();
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     #[error(transparent)]
-    DiskError(#[from] sled_hardware::DiskError),
+    DiskError(#[from] sled_hardware::PooledDiskError),
 
     // TODO: We could add the context of "why are we doing this op", maybe?
     #[error(transparent)]
@@ -610,7 +610,7 @@
         &mut self,
         unparsed_disk: UnparsedDisk,
         queued_u2_drives: &mut Option<HashSet<QueuedDiskCreate>>,
-    ) -> Result<Disk, sled_hardware::DiskError> {
+    ) -> Result<Disk, sled_hardware::PooledDiskError> {
         match sled_hardware::Disk::new(
             &self.log,
             unparsed_disk.clone(),
@@ -619,7 +619,7 @@
         .await
         {
             Ok(disk) => Ok(disk),
-            Err(sled_hardware::DiskError::KeyManager(err)) => {
+            Err(sled_hardware::PooledDiskError::KeyManager(err)) => {
                 warn!(
                     self.log,
                     "Transient error: {err} - queuing disk {:?}", unparsed_disk
@@ -630,7 +630,7 @@
                     *queued_u2_drives = Some(HashSet::from([unparsed_disk.into()]));
                 }
-                Err(sled_hardware::DiskError::KeyManager(err))
+                Err(sled_hardware::PooledDiskError::KeyManager(err))
             }
             Err(err) => {
                 error!(
@@ -651,7 +651,7 @@
         &mut self,
         zpool_name: ZpoolName,
         queued_u2_drives: &mut Option<HashSet<QueuedDiskCreate>>,
-    ) -> Result<(), sled_hardware::DiskError> {
+    ) -> Result<(), sled_hardware::PooledDiskError> {
         let synthetic_id = DiskIdentity {
             vendor: "fake_vendor".to_string(),
             serial: "fake_serial".to_string(),
@@ -666,7 +666,7 @@
         .await
         {
             Ok(()) => Ok(()),
-            Err(sled_hardware::DiskError::KeyManager(err)) => {
+            Err(sled_hardware::PooledDiskError::KeyManager(err)) => {
                 warn!(
                     self.log,
                     "Transient error: {err} - queuing synthetic disk: {:?}",
@@ -678,7 +678,7 @@
                     *queued_u2_drives = Some(HashSet::from([zpool_name.into()]));
                 }
-                Err(sled_hardware::DiskError::KeyManager(err))
+                Err(sled_hardware::PooledDiskError::KeyManager(err))
             }
             Err(err) => {
                 error!(
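Note on the storage-manager hunks above: they only rename the error type, but the control flow they touch is worth spelling out. A `KeyManager` failure is treated as transient, the U.2 is parked in `queued_u2_drives` for a later retry, and the error is still surfaced to the caller. A minimal, self-contained sketch of that queue-then-fail pattern (the names `add_disk`, `DiskId`, and `AddDiskError` are illustrative, not the sled-agent's real API):

```rust
use std::collections::HashSet;

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct DiskId(String);

#[derive(Debug)]
enum AddDiskError {
    // The key manager is not ready yet; the caller may retry later.
    Transient(String),
    // Anything else is permanent and must not be queued.
    Permanent(String),
}

/// Try to add a disk; on a transient error, remember it for a later retry.
fn add_disk(
    disk: DiskId,
    queued_u2_drives: &mut Option<HashSet<DiskId>>,
    key_manager_ready: bool,
) -> Result<(), AddDiskError> {
    if key_manager_ready {
        // The real code would format/import the pool here.
        return Ok(());
    }
    // Queue the disk the way the sled-agent does: extend an existing queue,
    // or create a new one holding just this disk.
    match queued_u2_drives {
        Some(queue) => {
            queue.insert(disk);
        }
        None => *queued_u2_drives = Some(HashSet::from([disk])),
    }
    Err(AddDiskError::Transient("key manager not ready".to_string()))
}

fn main() {
    let mut queued = None;
    assert!(add_disk(DiskId("u2-0".into()), &mut queued, false).is_err());
    assert_eq!(queued.as_ref().map(|q| q.len()), Some(1));
}
```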
diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs
index aec99ae3f8..bea7e23c73 100644
--- a/sled-hardware/src/disk.rs
+++ b/sled-hardware/src/disk.rs
@@ -4,34 +4,14 @@
 use camino::{Utf8Path, Utf8PathBuf};
 use illumos_utils::fstyp::Fstyp;
-use illumos_utils::zfs;
-use illumos_utils::zfs::DestroyDatasetErrorVariant;
-use illumos_utils::zfs::EncryptionDetails;
-use illumos_utils::zfs::Keypath;
-use illumos_utils::zfs::Mountpoint;
-use illumos_utils::zfs::SizeDetails;
-use illumos_utils::zfs::Zfs;
 use illumos_utils::zpool::Zpool;
 use illumos_utils::zpool::ZpoolKind;
 use illumos_utils::zpool::ZpoolName;
-use key_manager::StorageKeyRequester;
 use omicron_common::disk::DiskIdentity;
-use rand::distributions::{Alphanumeric, DistString};
 use slog::Logger;
 use slog::{info, warn};
-use std::sync::OnceLock;
-use tokio::fs::{remove_file, File};
-use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom};
 use uuid::Uuid;
 
-/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior
-/// and to ensure it goes away on power off.
-///
-/// We want to minimize the time the key files are in memory, and so we rederive
-/// the keys and recreate the files on demand when creating and mounting
-/// encrypted filesystems. We then zero them and unlink them.
-pub const KEYPATH_ROOT: &str = "/var/run/oxide/";
-
 cfg_if::cfg_if! {
     if #[cfg(target_os = "illumos")] {
         use crate::illumos::*;
@@ -41,7 +21,7 @@ cfg_if::cfg_if! {
 }
 
 #[derive(Debug, thiserror::Error)]
-pub enum DiskError {
+pub enum PooledDiskError {
     #[error("Cannot open {path} due to {error}")]
     IoError { path: Utf8PathBuf, error: std::io::Error },
     #[error("Failed to open partition at {path} due to {error}")]
@@ -51,10 +31,6 @@
     #[error("Requested partition {partition:?} not found on device {path}")]
     NotFound { path: Utf8PathBuf, partition: Partition },
     #[error(transparent)]
-    DestroyFilesystem(#[from] illumos_utils::zfs::DestroyDatasetError),
-    #[error(transparent)]
-    EnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError),
-    #[error(transparent)]
     ZpoolCreate(#[from] illumos_utils::zpool::CreateError),
     #[error("Cannot import zpool: {0}")]
     ZpoolImport(illumos_utils::zpool::Error),
@@ -62,18 +38,6 @@
     CannotFormatMissingDevPath { path: Utf8PathBuf },
     #[error("Formatting M.2 devices is not yet implemented")]
     CannotFormatM2NotImplemented,
-    #[error("KeyManager error: {0}")]
-    KeyManager(#[from] key_manager::Error),
-    #[error("Missing StorageKeyRequester when creating U.2 disk")]
-    MissingStorageKeyRequester,
-    #[error("Encrypted filesystem '{0}' missing 'oxide:epoch' property")]
-    CannotParseEpochProperty(String),
-    #[error("Encrypted dataset '{dataset}' cannot set 'oxide:agent' property: {err}")]
-    CannotSetAgentProperty {
-        dataset: String,
-        #[source]
-        err: Box<zfs::SetValueError>,
-    },
 }
 
 /// A partition (or 'slice') of a disk.
@@ -126,17 +90,17 @@ impl DiskPaths {
     }
 
     // Finds the first 'variant' partition, and returns the path to it.
-    fn partition_device_path(
+    pub fn partition_device_path(
         &self,
         partitions: &[Partition],
         expected_partition: Partition,
         raw: bool,
-    ) -> Result<Utf8PathBuf, DiskError> {
+    ) -> Result<Utf8PathBuf, PooledDiskError> {
         for (index, partition) in partitions.iter().enumerate() {
             if &expected_partition == partition {
                 let path = self.partition_path(index, raw).ok_or_else(|| {
-                    DiskError::NotFound {
+                    PooledDiskError::NotFound {
                         path: self.devfs_path.clone(),
                         partition: expected_partition,
                     }
@@ -144,7 +108,7 @@
                 return Ok(path);
             }
         }
-        Err(DiskError::NotFound {
+        Err(PooledDiskError::NotFound {
             path: self.devfs_path.clone(),
             partition: expected_partition,
         })
@@ -202,122 +166,33 @@ impl UnparsedDisk {
     }
 }
 
-/// A physical disk conforming to the expected partition layout.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct Disk {
-    paths: DiskPaths,
-    slot: i64,
-    variant: DiskVariant,
-    identity: DiskIdentity,
-    is_boot_disk: bool,
-    partitions: Vec<Partition>,
-
+/// A physical disk that is partitioned to contain exactly one zpool
+///
+/// A `PooledDisk` relies on hardware-specific information to be constructed
+/// and is the highest-level disk structure in the `sled-hardware` package.
+/// The `sled-storage` package contains `Disk`s whose zpool and datasets can be
+/// manipulated. This separation exists to remove the hardware-dependent logic
+/// from the ZFS-related logic, which can also operate on file-backed zpools.
+/// Doing things this way allows us to not put higher-level concepts like
+/// storage keys into this hardware-related package.
+pub struct PooledDisk {
+    pub paths: DiskPaths,
+    pub slot: i64,
+    pub variant: DiskVariant,
+    pub identity: DiskIdentity,
+    pub is_boot_disk: bool,
+    pub partitions: Vec<Partition>,
     // This embeds the assumption that there is exactly one parsed zpool per
     // disk.
-    zpool_name: ZpoolName,
+    pub zpool_name: ZpoolName,
 }
 
-// Helper type for describing expected datasets and their optional quota.
-#[derive(Clone, Copy, Debug)]
-struct ExpectedDataset {
-    // Name for the dataset
-    name: &'static str,
-    // Optional quota, in _bytes_
-    quota: Option<usize>,
-    // Identifies if the dataset should be deleted on boot
-    wipe: bool,
-    // Optional compression mode
-    compression: Option<&'static str>,
-}
-
-impl ExpectedDataset {
-    const fn new(name: &'static str) -> Self {
-        ExpectedDataset { name, quota: None, wipe: false, compression: None }
-    }
-
-    const fn quota(mut self, quota: usize) -> Self {
-        self.quota = Some(quota);
-        self
-    }
-
-    const fn wipe(mut self) -> Self {
-        self.wipe = true;
-        self
-    }
-
-    const fn compression(mut self, compression: &'static str) -> Self {
-        self.compression = Some(compression);
-        self
-    }
-}
-
-pub const INSTALL_DATASET: &'static str = "install";
-pub const CRASH_DATASET: &'static str = "crash";
-pub const CLUSTER_DATASET: &'static str = "cluster";
-pub const CONFIG_DATASET: &'static str = "config";
-pub const M2_DEBUG_DATASET: &'static str = "debug";
-// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be
-// tuned as needed.
-pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30);
-// ditto.
-pub const DUMP_DATASET_QUOTA: usize = 100 * (1 << 30);
-// passed to zfs create -o compression=
-pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9";
-
-// U.2 datasets live under the encrypted dataset and inherit encryption
-pub const ZONE_DATASET: &'static str = "crypt/zone";
-pub const DUMP_DATASET: &'static str = "crypt/debug";
-pub const U2_DEBUG_DATASET: &'static str = "crypt/debug";
-
-// This is the root dataset for all U.2 drives. Encryption is inherited.
-pub const CRYPT_DATASET: &'static str = "crypt";
-
-const U2_EXPECTED_DATASET_COUNT: usize = 2;
-static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [
-    // Stores filesystems for zones
-    ExpectedDataset::new(ZONE_DATASET).wipe(),
-    // For storing full kernel RAM dumps
-    ExpectedDataset::new(DUMP_DATASET)
-        .quota(DUMP_DATASET_QUOTA)
-        .compression(DUMP_DATASET_COMPRESSION),
-];
-
-const M2_EXPECTED_DATASET_COUNT: usize = 5;
-static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [
-    // Stores software images.
-    //
-    // Should be duplicated to both M.2s.
-    ExpectedDataset::new(INSTALL_DATASET),
-    // Stores crash dumps.
-    ExpectedDataset::new(CRASH_DATASET),
-    // Stores cluster configuration information.
-    //
-    // Should be duplicated to both M.2s.
-    ExpectedDataset::new(CLUSTER_DATASET),
-    // Stores configuration data, including:
-    // - What services should be launched on this sled
-    // - Information about how to initialize the Sled Agent
-    // - (For scrimlets) RSS setup information
-    //
-    // Should be duplicated to both M.2s.
-    ExpectedDataset::new(CONFIG_DATASET),
-    // Stores debugging data, such as service bundles.
-    ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA),
-];
-
-impl Disk {
-    /// Create a new Disk
-    ///
-    /// WARNING: In all cases where a U.2 is a possible `DiskVariant`, a
-    /// `StorageKeyRequester` must be passed so that disk encryption can
-    /// be used. The `StorageManager` for the sled-agent always has a
-    /// `StorageKeyRequester` available, and so the only place we should pass
-    /// `None` is for the M.2s touched by the Installinator.
-    pub async fn new(
+impl PooledDisk {
+    /// Create a new PooledDisk
+    pub fn new(
         log: &Logger,
         unparsed_disk: UnparsedDisk,
-        key_requester: Option<&StorageKeyRequester>,
-    ) -> Result<Self, DiskError> {
+    ) -> Result<Self, PooledDiskError> {
         let paths = &unparsed_disk.paths;
         let variant = unparsed_disk.variant;
         // Ensure the GPT has the right format. This does not necessarily
@@ -335,13 +210,8 @@
         )?;
         let zpool_name = Self::ensure_zpool_exists(log, variant, &zpool_path)?;
-        Self::ensure_zpool_ready(
-            log,
-            &zpool_name,
-            &unparsed_disk.identity,
-            key_requester,
-        )
-        .await?;
+        Self::ensure_zpool_imported(log, &zpool_name)?;
+        Self::ensure_zpool_failmode_is_continue(log, &zpool_name)?;
 
         Ok(Self {
             paths: unparsed_disk.paths,
@@ -354,29 +224,11 @@
         })
     }
 
-    pub async fn ensure_zpool_ready(
-        log: &Logger,
-        zpool_name: &ZpoolName,
-        disk_identity: &DiskIdentity,
-        key_requester: Option<&StorageKeyRequester>,
-    ) -> Result<(), DiskError> {
-        Self::ensure_zpool_imported(log, &zpool_name)?;
-        Self::ensure_zpool_failmode_is_continue(log, &zpool_name)?;
-        Self::ensure_zpool_has_datasets(
-            log,
-            &zpool_name,
-            disk_identity,
-            key_requester,
-        )
-        .await?;
-        Ok(())
-    }
-
     fn ensure_zpool_exists(
         log: &Logger,
         variant: DiskVariant,
         zpool_path: &Utf8Path,
-    ) -> Result<ZpoolName, DiskError> {
+    ) -> Result<ZpoolName, PooledDiskError> {
         let zpool_name = match Fstyp::get_zpool(&zpool_path) {
             Ok(zpool_name) => zpool_name,
             Err(_) => {
@@ -407,7 +259,7 @@
         };
         Zpool::import(zpool_name.clone()).map_err(|e| {
             warn!(log, "Failed to import zpool {zpool_name}: {e}");
-            DiskError::ZpoolImport(e)
+            PooledDiskError::ZpoolImport(e)
         })?;
 
         Ok(zpool_name)
@@ -416,10 +268,10 @@
     fn ensure_zpool_imported(
         log: &Logger,
         zpool_name: &ZpoolName,
-    ) -> Result<(), DiskError> {
+    ) -> Result<(), PooledDiskError> {
         Zpool::import(zpool_name.clone()).map_err(|e| {
             warn!(log, "Failed to import zpool {zpool_name}: {e}");
-            DiskError::ZpoolImport(e)
+            PooledDiskError::ZpoolImport(e)
         })?;
         Ok(())
     }
 
@@ -427,7 +279,7 @@
     fn ensure_zpool_failmode_is_continue(
         log: &Logger,
         zpool_name: &ZpoolName,
-    ) -> Result<(), DiskError> {
+    ) -> Result<(), PooledDiskError> {
         // Ensure failmode is set to `continue`. See
         // https://github.com/oxidecomputer/omicron/issues/2766 for details. The
         // short version is, each pool is only backed by one vdev. There is no
@@ -440,212 +292,10 @@
                 log,
                 "Failed to set failmode=continue on zpool {zpool_name}: {e}"
             );
-            DiskError::ZpoolImport(e)
+            PooledDiskError::ZpoolImport(e)
         })?;
         Ok(())
     }
-
-    // Ensure that the zpool contains all the datasets we would like it to
-    // contain.
-    async fn ensure_zpool_has_datasets(
-        log: &Logger,
-        zpool_name: &ZpoolName,
-        disk_identity: &DiskIdentity,
-        key_requester: Option<&StorageKeyRequester>,
-    ) -> Result<(), DiskError> {
-        let (root, datasets) = match zpool_name.kind().into() {
-            DiskVariant::M2 => (None, M2_EXPECTED_DATASETS.iter()),
-            DiskVariant::U2 => {
-                (Some(CRYPT_DATASET), U2_EXPECTED_DATASETS.iter())
-            }
-        };
-
-        let zoned = false;
-        let do_format = true;
-
-        // Ensure the root encrypted filesystem exists
-        // Datasets below this in the hierarchy will inherit encryption
-        if let Some(dataset) = root {
-            let Some(key_requester) = key_requester else {
-                return Err(DiskError::MissingStorageKeyRequester);
-            };
-            let mountpoint = zpool_name.dataset_mountpoint(dataset);
-            let keypath: Keypath = disk_identity.into();
-
-            let epoch =
-                if let Ok(epoch_str) = Zfs::get_oxide_value(dataset, "epoch") {
-                    if let Ok(epoch) = epoch_str.parse::<u64>() {
-                        epoch
-                    } else {
-                        return Err(DiskError::CannotParseEpochProperty(
-                            dataset.to_string(),
-                        ));
-                    }
-                } else {
-                    // We got an error trying to call `Zfs::get_oxide_value`
-                    // which indicates that the dataset doesn't exist or there
-                    // was a problem running the command.
-                    //
-                    // Note that `Zfs::get_oxide_value` will succeed even if
-                    // the epoch is missing. `epoch_str` will show up as a dash
-                    // (`-`) and will not parse into a `u64`. So we don't have
-                    // to worry about that case here as it is handled above.
-                    //
-                    // If the error indicated that the command failed for some
-                    // other reason, but the dataset actually existed, we will
-                    // try to create the dataset below and that will fail. So
-                    // there is no harm in just loading the latest secret here.
-                    key_requester.load_latest_secret().await?
-                };
-
-            let key =
-                key_requester.get_key(epoch, disk_identity.clone()).await?;
-
-            let mut keyfile =
-                KeyFile::create(keypath.clone(), key.expose_secret(), log)
-                    .await
-                    .map_err(|error| DiskError::IoError {
-                        path: keypath.0.clone(),
-                        error,
-                    })?;
-
-            let encryption_details = EncryptionDetails { keypath, epoch };
-
-            info!(
-                log,
-                "Ensuring encrypted filesystem: {} for epoch {}",
-                dataset,
-                epoch
-            );
-            let result = Zfs::ensure_filesystem(
-                &format!("{}/{}", zpool_name, dataset),
-                Mountpoint::Path(mountpoint),
-                zoned,
-                do_format,
-                Some(encryption_details),
-                None,
-            );
-
-            keyfile.zero_and_unlink().await.map_err(|error| {
-                DiskError::IoError { path: keyfile.path().0.clone(), error }
-            })?;
-
-            result?;
-        };
-
-        for dataset in datasets.into_iter() {
-            let mountpoint = zpool_name.dataset_mountpoint(dataset.name);
-            let name = &format!("{}/{}", zpool_name, dataset.name);
-
-            // Use a value that's alive for the duration of this sled agent
-            // to answer the question: should we wipe this disk, or have
-            // we seen it before?
-            //
-            // If this value comes from a prior iteration of the sled agent,
-            // we opt to remove the corresponding dataset.
-            static AGENT_LOCAL_VALUE: OnceLock<String> = OnceLock::new();
-            let agent_local_value = AGENT_LOCAL_VALUE.get_or_init(|| {
-                Alphanumeric.sample_string(&mut rand::thread_rng(), 20)
-            });
-
-            if dataset.wipe {
-                match Zfs::get_oxide_value(name, "agent") {
-                    Ok(v) if &v == agent_local_value => {
-                        info!(
-                            log,
-                            "Skipping automatic wipe for dataset: {}", name
-                        );
-                    }
-                    Ok(_) | Err(_) => {
-                        info!(
-                            log,
-                            "Automatically destroying dataset: {}", name
-                        );
-                        Zfs::destroy_dataset(name).or_else(|err| {
-                            // If we can't find the dataset, that's fine -- it might
-                            // not have been formatted yet.
-                            if let DestroyDatasetErrorVariant::NotFound =
-                                err.err
-                            {
-                                Ok(())
-                            } else {
-                                Err(err)
-                            }
-                        })?;
-                    }
-                }
-            }
-
-            let encryption_details = None;
-            let size_details = Some(SizeDetails {
-                quota: dataset.quota,
-                compression: dataset.compression,
-            });
-            Zfs::ensure_filesystem(
-                name,
-                Mountpoint::Path(mountpoint),
-                zoned,
-                do_format,
-                encryption_details,
-                size_details,
-            )?;
-
-            if dataset.wipe {
-                Zfs::set_oxide_value(name, "agent", agent_local_value)
-                    .map_err(|err| DiskError::CannotSetAgentProperty {
-                        dataset: name.clone(),
-                        err: Box::new(err),
-                    })?;
-            }
-        }
-        Ok(())
-    }
-
-    pub fn is_boot_disk(&self) -> bool {
-        self.is_boot_disk
-    }
-
-    pub fn identity(&self) -> &DiskIdentity {
-        &self.identity
-    }
-
-    pub fn variant(&self) -> DiskVariant {
-        self.variant
-    }
-
-    pub fn devfs_path(&self) -> &Utf8PathBuf {
-        &self.paths.devfs_path
-    }
-
-    pub fn zpool_name(&self) -> &ZpoolName {
-        &self.zpool_name
-    }
-
-    pub fn boot_image_devfs_path(
-        &self,
-        raw: bool,
-    ) -> Result<Utf8PathBuf, DiskError> {
-        self.paths.partition_device_path(
-            &self.partitions,
-            Partition::BootImage,
-            raw,
-        )
-    }
-
-    pub fn dump_device_devfs_path(
-        &self,
-        raw: bool,
-    ) -> Result<Utf8PathBuf, DiskError> {
-        self.paths.partition_device_path(
-            &self.partitions,
-            Partition::DumpDevice,
-            raw,
-        )
-    }
-
-    pub fn slot(&self) -> i64 {
-        self.slot
-    }
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -664,56 +314,6 @@ impl From<ZpoolKind> for DiskVariant {
     }
 }
 
-/// A file that wraps a zfs encryption key.
-///
-/// We put this in a RAM-backed filesystem and zero and delete it when we are
-/// done with it. Unfortunately we cannot do this inside `Drop` because there is no
-/// equivalent async drop.
-pub struct KeyFile {
-    path: Keypath,
-    file: File,
-    log: Logger,
-}
-
-impl KeyFile {
-    pub async fn create(
-        path: Keypath,
-        key: &[u8; 32],
-        log: &Logger,
-    ) -> std::io::Result<KeyFile> {
-        // TODO: fix this to not truncate
-        // We want to overwrite any existing contents.
-        // If we truncate we may leave dirty pages around
-        // containing secrets.
-        let mut file = tokio::fs::OpenOptions::new()
-            .create(true)
-            .write(true)
-            .open(&path.0)
-            .await?;
-        file.write_all(key).await?;
-        info!(log, "Created keyfile {}", path);
-        Ok(KeyFile { path, file, log: log.clone() })
-    }
-
-    /// These keyfiles live on a tmpfs and we zero the file so the data doesn't
-    /// linger on the page in memory.
-    ///
-    /// It'd be nice to `impl Drop for KeyFile` and then call `zero`
-    /// from within the drop handler, but async `Drop` isn't supported.
-    pub async fn zero_and_unlink(&mut self) -> std::io::Result<()> {
-        let zeroes = [0u8; 32];
-        let _ = self.file.seek(SeekFrom::Start(0)).await?;
-        self.file.write_all(&zeroes).await?;
-        info!(self.log, "Zeroed and unlinked keyfile {}", self.path);
-        remove_file(&self.path().0).await?;
-        Ok(())
-    }
-
-    pub fn path(&self) -> &Keypath {
-        &self.path
-    }
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
@@ -825,7 +425,7 @@
             paths
                 .partition_device_path(&[], Partition::ZfsPool, false)
                 .expect_err("Should not have found partition"),
-            DiskError::NotFound { .. },
+            PooledDiskError::NotFound { .. },
         ));
     }
 }
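The net effect of the `sled-hardware/src/disk.rs` changes: `PooledDisk::new` is now a synchronous constructor that needs no `StorageKeyRequester`, since all encryption and dataset work moved to `sled-storage`. A hedged sketch of a caller (the logger setup is illustrative, and `UnparsedDisk` values normally come from the hardware monitor rather than being built by hand):

```rust
use sled_hardware::{PooledDisk, PooledDiskError, UnparsedDisk};
use slog::{o, Discard, Logger};

// Adopt a disk: verify the GPT layout, then create/import its single zpool.
fn adopt(unparsed: UnparsedDisk) -> Result<PooledDisk, PooledDiskError> {
    let log = Logger::root(Discard, o!());
    // No `.await` and no key material: partition checks, zpool import, and
    // failmode setup are all local, blocking work now.
    let disk = PooledDisk::new(&log, unparsed)?;
    println!("adopted zpool {}", disk.zpool_name);
    Ok(disk)
}
```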
diff --git a/sled-hardware/src/illumos/partitions.rs b/sled-hardware/src/illumos/partitions.rs
index 950074bd3a..ee745fc78b 100644
--- a/sled-hardware/src/illumos/partitions.rs
+++ b/sled-hardware/src/illumos/partitions.rs
@@ -5,7 +5,7 @@
 //! illumos-specific mechanisms for parsing disk info.
 
 use crate::illumos::gpt;
-use crate::{DiskError, DiskPaths, DiskVariant, Partition};
+use crate::{DiskPaths, DiskVariant, Partition, PooledDiskError};
 use camino::Utf8Path;
 use illumos_utils::zpool::ZpoolName;
 use slog::info;
@@ -41,9 +41,9 @@ fn parse_partition_types<const N: usize>(
     path: &Utf8Path,
     partitions: &Vec<impl gpt::LibEfiPartition>,
     expected_partitions: &[Partition; N],
-) -> Result<Vec<Partition>, DiskError> {
+) -> Result<Vec<Partition>, PooledDiskError> {
     if partitions.len() != N {
-        return Err(DiskError::BadPartitionLayout {
+        return Err(PooledDiskError::BadPartitionLayout {
             path: path.to_path_buf(),
             why: format!(
                 "Expected {} partitions, only saw {}",
@@ -54,7 +54,7 @@
     }
     for i in 0..N {
         if partitions[i].index() != i {
-            return Err(DiskError::BadPartitionLayout {
+            return Err(PooledDiskError::BadPartitionLayout {
                 path: path.to_path_buf(),
                 why: format!(
                     "The {i}-th partition has index {}",
@@ -80,7 +80,7 @@
 pub fn ensure_partition_layout(
     log: &Logger,
     paths: &DiskPaths,
     variant: DiskVariant,
-) -> Result<Vec<Partition>, DiskError> {
+) -> Result<Vec<Partition>, PooledDiskError> {
     internal_ensure_partition_layout::<libefi_illumos::Gpt>(log, paths, variant)
 }
 
@@ -90,7 +90,7 @@ fn internal_ensure_partition_layout<GPT: gpt::LibEfiGpt>(
     log: &Logger,
     paths: &DiskPaths,
     variant: DiskVariant,
-) -> Result<Vec<Partition>, DiskError> {
+) -> Result<Vec<Partition>, PooledDiskError> {
     // Open the "Whole Disk" as a raw device to be parsed by the
     // libefi-illumos library. This lets us peek at the GPT before
     // making too many assumptions about it.
@@ -114,7 +116,9 @@
             let dev_path = if let Some(dev_path) = &paths.dev_path {
                 dev_path
             } else {
-                return Err(DiskError::CannotFormatMissingDevPath { path });
+                return Err(PooledDiskError::CannotFormatMissingDevPath {
+                    path,
+                });
             };
             match variant {
                 DiskVariant::U2 => {
@@ -129,12 +131,12 @@
                     // the expected partitions? Or would it be wiser to infer
                     // that this indicates an unexpected error condition that
                     // needs mitigation?
-                    return Err(DiskError::CannotFormatM2NotImplemented);
+                    return Err(PooledDiskError::CannotFormatM2NotImplemented);
                 }
             }
         }
         Err(err) => {
-            return Err(DiskError::Gpt {
+            return Err(PooledDiskError::Gpt {
                 path,
                 error: anyhow::Error::new(err),
             });
@@ -197,7 +199,7 @@
             DiskVariant::U2,
         );
         match result {
-            Err(DiskError::CannotFormatMissingDevPath { .. }) => {}
+            Err(PooledDiskError::CannotFormatMissingDevPath { .. }) => {}
             _ => panic!("Should have failed with a missing dev path error"),
         }
 
@@ -373,7 +375,7 @@
                 DiskVariant::M2,
             )
             .expect_err("Should have failed parsing empty GPT"),
-            DiskError::BadPartitionLayout { .. }
+            PooledDiskError::BadPartitionLayout { .. }
         ));
 
         logctx.cleanup_successful();
@@ -398,7 +400,7 @@
                 DiskVariant::U2,
             )
            .expect_err("Should have failed parsing empty GPT"),
-            DiskError::BadPartitionLayout { .. }
+            PooledDiskError::BadPartitionLayout { .. }
         ));
 
         logctx.cleanup_successful();
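Behaviorally, `parse_partition_types` is untouched by the rename: it still requires the GPT to contain exactly `N` partitions, densely indexed `0..N` in order. A standalone sketch of that invariant (plain labels stand in for the libefi partition handles the real code inspects):

```rust
fn check_layout<const N: usize>(
    found: &[(usize, &str)], // (index, label) pairs read from the GPT
    expected: &[&str; N],
) -> Result<(), String> {
    if found.len() != N {
        return Err(format!("expected {} partitions, saw {}", N, found.len()));
    }
    for (i, (index, label)) in found.iter().enumerate() {
        // Partitions must be densely numbered 0..N, in order.
        if *index != i {
            return Err(format!("the {i}-th partition has index {index}"));
        }
        if label != &expected[i] {
            return Err(format!("slot {i}: expected {}, saw {label}", expected[i]));
        }
    }
    Ok(())
}

fn main() {
    let found = [(0, "boot"), (1, "dump"), (2, "zfs")];
    assert!(check_layout(&found, &["boot", "dump", "zfs"]).is_ok());
}
```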
diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml
index 03f0f608de..ae9718382d 100644
--- a/sled-storage/Cargo.toml
+++ b/sled-storage/Cargo.toml
@@ -14,6 +14,7 @@ key-manager.workspace = true
 # We could put this in the nexus-client instead
 nexus-client.workspace = true
 omicron-common.workspace = true
+rand.workspace = true
 schemars = { workspace = true, features = [ "chrono", "uuid1" ] }
 serde.workspace = true
 serde_json.workspace = true
@@ -25,3 +26,7 @@ slog.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 uuid.workspace = true
+
+[dev-dependencies]
+illumos-utils = { workspace = true, features = ["testing"] }
+omicron-test-utils.workspace = true
diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs
index e521dd963a..3c40dc10f0 100644
--- a/sled-storage/src/dataset.rs
+++ b/sled-storage/src/dataset.rs
@@ -2,10 +2,112 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
+//! ZFS dataset related functionality
+
+use crate::keyfile::KeyFile;
+use camino::Utf8PathBuf;
+use illumos_utils::zfs::{
+    self, DestroyDatasetErrorVariant, EncryptionDetails, Keypath, Mountpoint,
+    SizeDetails, Zfs,
+};
 use illumos_utils::zpool::ZpoolName;
+use key_manager::StorageKeyRequester;
+use omicron_common::disk::DiskIdentity;
+use rand::distributions::{Alphanumeric, DistString};
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
+use sled_hardware::DiskVariant;
+use slog::{info, Logger};
 use std::str::FromStr;
+use std::sync::OnceLock;
+
+pub const INSTALL_DATASET: &'static str = "install";
+pub const CRASH_DATASET: &'static str = "crash";
+pub const CLUSTER_DATASET: &'static str = "cluster";
+pub const CONFIG_DATASET: &'static str = "config";
+pub const M2_DEBUG_DATASET: &'static str = "debug";
+// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be
+// tuned as needed.
+pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30);
+// ditto.
+pub const DUMP_DATASET_QUOTA: usize = 100 * (1 << 30);
+// passed to zfs create -o compression=
+pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9";
+
+// U.2 datasets live under the encrypted dataset and inherit encryption
+pub const ZONE_DATASET: &'static str = "crypt/zone";
+pub const DUMP_DATASET: &'static str = "crypt/debug";
+pub const U2_DEBUG_DATASET: &'static str = "crypt/debug";
+
+// This is the root dataset for all U.2 drives. Encryption is inherited.
+pub const CRYPT_DATASET: &'static str = "crypt";
+
+const U2_EXPECTED_DATASET_COUNT: usize = 2;
+static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [
+    // Stores filesystems for zones
+    ExpectedDataset::new(ZONE_DATASET).wipe(),
+    // For storing full kernel RAM dumps
+    ExpectedDataset::new(DUMP_DATASET)
+        .quota(DUMP_DATASET_QUOTA)
+        .compression(DUMP_DATASET_COMPRESSION),
+];
+
+const M2_EXPECTED_DATASET_COUNT: usize = 5;
+static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [
+    // Stores software images.
+    //
+    // Should be duplicated to both M.2s.
+    ExpectedDataset::new(INSTALL_DATASET),
+    // Stores crash dumps.
+    ExpectedDataset::new(CRASH_DATASET),
+    // Stores cluster configuration information.
+    //
+    // Should be duplicated to both M.2s.
+    ExpectedDataset::new(CLUSTER_DATASET),
+    // Stores configuration data, including:
+    // - What services should be launched on this sled
+    // - Information about how to initialize the Sled Agent
+    // - (For scrimlets) RSS setup information
+    //
+    // Should be duplicated to both M.2s.
+    ExpectedDataset::new(CONFIG_DATASET),
+    // Stores debugging data, such as service bundles.
+    ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA),
+];
+
+// Helper type for describing expected datasets and their optional quota.
+#[derive(Clone, Copy, Debug)]
+struct ExpectedDataset {
+    // Name for the dataset
+    name: &'static str,
+    // Optional quota, in _bytes_
+    quota: Option<usize>,
+    // Identifies if the dataset should be deleted on boot
+    wipe: bool,
+    // Optional compression mode
+    compression: Option<&'static str>,
+}
+
+impl ExpectedDataset {
+    const fn new(name: &'static str) -> Self {
+        ExpectedDataset { name, quota: None, wipe: false, compression: None }
+    }
+
+    const fn quota(mut self, quota: usize) -> Self {
+        self.quota = Some(quota);
+        self
+    }
+
+    const fn wipe(mut self) -> Self {
+        self.wipe = true;
+        self
+    }
+
+    const fn compression(mut self, compression: &'static str) -> Self {
+        self.compression = Some(compression);
+        self
+    }
+}
 
 /// The type of a dataset, and any auxiliary information necessary
 /// to successfully launch a zone managing the associated data.
@@ -105,6 +207,178 @@ impl From<DatasetName> for sled_agent_client::types::DatasetName {
     }
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum DatasetError {
+    #[error("Cannot open {path} due to {error}")]
+    IoError { path: Utf8PathBuf, error: std::io::Error },
+    #[error(transparent)]
+    DestroyFilesystem(#[from] illumos_utils::zfs::DestroyDatasetError),
+    #[error(transparent)]
+    EnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError),
+    #[error("KeyManager error: {0}")]
+    KeyManager(#[from] key_manager::Error),
+    #[error("Missing StorageKeyRequester when creating U.2 disk")]
+    MissingStorageKeyRequester,
+    #[error("Encrypted filesystem '{0}' missing 'oxide:epoch' property")]
+    CannotParseEpochProperty(String),
+    #[error("Encrypted dataset '{dataset}' cannot set 'oxide:agent' property: {err}")]
+    CannotSetAgentProperty {
+        dataset: String,
+        #[source]
+        err: Box<zfs::SetValueError>,
+    },
+}
+
+/// Ensure that the zpool contains all the datasets we would like it to
+/// contain.
+///
+/// WARNING: In all cases where a U.2 is a possible `DiskVariant`, a
+/// `StorageKeyRequester` must be passed so that disk encryption can
+/// be used. The `StorageManager` for the sled-agent always has a
+/// `StorageKeyRequester` available, and so the only place we should pass
+/// `None` is for the M.2s touched by the Installinator.
+pub async fn ensure_zpool_has_datasets(
+    log: &Logger,
+    zpool_name: &ZpoolName,
+    disk_identity: &DiskIdentity,
+    key_requester: Option<&StorageKeyRequester>,
+) -> Result<(), DatasetError> {
+    let (root, datasets) = match zpool_name.kind().into() {
+        DiskVariant::M2 => (None, M2_EXPECTED_DATASETS.iter()),
+        DiskVariant::U2 => (Some(CRYPT_DATASET), U2_EXPECTED_DATASETS.iter()),
+    };
+
+    let zoned = false;
+    let do_format = true;
+
+    // Ensure the root encrypted filesystem exists
+    // Datasets below this in the hierarchy will inherit encryption
+    if let Some(dataset) = root {
+        let Some(key_requester) = key_requester else {
+            return Err(DatasetError::MissingStorageKeyRequester);
+        };
+        let mountpoint = zpool_name.dataset_mountpoint(dataset);
+        let keypath: Keypath = disk_identity.into();
+
+        let epoch =
+            if let Ok(epoch_str) = Zfs::get_oxide_value(dataset, "epoch") {
+                if let Ok(epoch) = epoch_str.parse::<u64>() {
+                    epoch
+                } else {
+                    return Err(DatasetError::CannotParseEpochProperty(
+                        dataset.to_string(),
+                    ));
+                }
+            } else {
+                // We got an error trying to call `Zfs::get_oxide_value`
+                // which indicates that the dataset doesn't exist or there
+                // was a problem running the command.
+                //
+                // Note that `Zfs::get_oxide_value` will succeed even if
+                // the epoch is missing. `epoch_str` will show up as a dash
+                // (`-`) and will not parse into a `u64`. So we don't have
+                // to worry about that case here as it is handled above.
+                //
+                // If the error indicated that the command failed for some
+                // other reason, but the dataset actually existed, we will
+                // try to create the dataset below and that will fail. So
+                // there is no harm in just loading the latest secret here.
+                key_requester.load_latest_secret().await?
+            };
+
+        let key = key_requester.get_key(epoch, disk_identity.clone()).await?;
+
+        let mut keyfile =
+            KeyFile::create(keypath.clone(), key.expose_secret(), log)
+                .await
+                .map_err(|error| DatasetError::IoError {
+                    path: keypath.0.clone(),
+                    error,
+                })?;
+
+        let encryption_details = EncryptionDetails { keypath, epoch };
+
+        info!(
+            log,
+            "Ensuring encrypted filesystem: {} for epoch {}", dataset, epoch
+        );
+        let result = Zfs::ensure_filesystem(
+            &format!("{}/{}", zpool_name, dataset),
+            Mountpoint::Path(mountpoint),
+            zoned,
+            do_format,
+            Some(encryption_details),
+            None,
+        );
+
+        keyfile.zero_and_unlink().await.map_err(|error| {
+            DatasetError::IoError { path: keyfile.path().0.clone(), error }
+        })?;
+
+        result?;
+    };
+
+    for dataset in datasets.into_iter() {
+        let mountpoint = zpool_name.dataset_mountpoint(dataset.name);
+        let name = &format!("{}/{}", zpool_name, dataset.name);
+
+        // Use a value that's alive for the duration of this sled agent
+        // to answer the question: should we wipe this disk, or have
+        // we seen it before?
+        //
+        // If this value comes from a prior iteration of the sled agent,
+        // we opt to remove the corresponding dataset.
+        static AGENT_LOCAL_VALUE: OnceLock<String> = OnceLock::new();
+        let agent_local_value = AGENT_LOCAL_VALUE.get_or_init(|| {
+            Alphanumeric.sample_string(&mut rand::thread_rng(), 20)
+        });
+
+        if dataset.wipe {
+            match Zfs::get_oxide_value(name, "agent") {
+                Ok(v) if &v == agent_local_value => {
+                    info!(log, "Skipping automatic wipe for dataset: {}", name);
+                }
+                Ok(_) | Err(_) => {
+                    info!(log, "Automatically destroying dataset: {}", name);
+                    Zfs::destroy_dataset(name).or_else(|err| {
+                        // If we can't find the dataset, that's fine -- it might
+                        // not have been formatted yet.
+                        if let DestroyDatasetErrorVariant::NotFound = err.err {
+                            Ok(())
+                        } else {
+                            Err(err)
+                        }
+                    })?;
+                }
+            }
+        }
+
+        let encryption_details = None;
+        let size_details = Some(SizeDetails {
+            quota: dataset.quota,
+            compression: dataset.compression,
+        });
+        Zfs::ensure_filesystem(
+            name,
+            Mountpoint::Path(mountpoint),
+            zoned,
+            do_format,
+            encryption_details,
+            size_details,
+        )?;
+
+        if dataset.wipe {
+            Zfs::set_oxide_value(name, "agent", agent_local_value).map_err(
+                |err| DatasetError::CannotSetAgentProperty {
+                    dataset: name.clone(),
+                    err: Box::new(err),
+                },
+            )?;
+        }
+    }
+    Ok(())
+}
 
 #[cfg(test)]
 mod test {
     use super::*;
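The function body above is a verbatim move from `sled-hardware`; only the error type narrows to `DatasetError`. A sketch of the call shape (crate-internal, since `dataset` is `pub(crate)`; this mirrors how `Disk::new` in `sled-storage/src/disk.rs` uses it rather than an external API):

```rust
// Crate-internal sketch: `dataset` is pub(crate), so this is how code inside
// sled-storage calls it, not a public entry point.
use crate::dataset::{self, DatasetError};
use illumos_utils::zpool::ZpoolName;
use key_manager::StorageKeyRequester;
use omicron_common::disk::DiskIdentity;
use slog::Logger;

async fn provision(
    log: &Logger,
    zpool_name: &ZpoolName,
    identity: &DiskIdentity,
    key_requester: Option<&StorageKeyRequester>,
) -> Result<(), DatasetError> {
    // U.2 pools need `Some(key_requester)` so the encrypted `crypt` root
    // dataset can be created; M.2 pools (e.g. the installinator path) pass
    // `None` and only get unencrypted datasets.
    dataset::ensure_zpool_has_datasets(log, zpool_name, identity, key_requester)
        .await
}
```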
diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs
index aef68528bf..d7e02d8c97 100644
--- a/sled-storage/src/disk.rs
+++ b/sled-storage/src/disk.rs
@@ -6,8 +6,15 @@
 
 use camino::Utf8PathBuf;
 use illumos_utils::zpool::{ZpoolKind, ZpoolName};
+use key_manager::StorageKeyRequester;
 use omicron_common::disk::DiskIdentity;
-use sled_hardware::{Disk, DiskVariant};
+use sled_hardware::{
+    DiskPaths, DiskVariant, Partition, PooledDisk, PooledDiskError,
+    UnparsedDisk,
+};
+use slog::Logger;
+
+use crate::dataset;
 
 /// A wrapper around real disks or synthetic disks backed by a file
 #[derive(Debug, PartialEq, Eq, Clone)]
@@ -55,3 +62,105 @@ impl DiskWrapper {
         }
     }
 }
+
+#[derive(Debug, thiserror::Error)]
+pub enum DiskError {
+    #[error(transparent)]
+    Dataset(#[from] crate::dataset::DatasetError),
+    #[error(transparent)]
+    PooledDisk(#[from] sled_hardware::PooledDiskError),
+}
+
+/// A physical disk conforming to the expected partition layout
+/// and which contains provisioned zpools and datasets. This disk
+/// is ready for usage by higher-level software.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Disk {
+    paths: DiskPaths,
+    slot: i64,
+    variant: DiskVariant,
+    identity: DiskIdentity,
+    is_boot_disk: bool,
+    partitions: Vec<Partition>,
+
+    // This embeds the assumption that there is exactly one parsed zpool per
+    // disk.
+    zpool_name: ZpoolName,
+}
+
+impl Disk {
+    pub async fn new(
+        log: &Logger,
+        unparsed_disk: UnparsedDisk,
+        key_requester: Option<&StorageKeyRequester>,
+    ) -> Result<Self, DiskError> {
+        let disk = PooledDisk::new(log, unparsed_disk)?;
+        dataset::ensure_zpool_has_datasets(
+            log,
+            &disk.zpool_name,
+            &disk.identity,
+            key_requester,
+        )
+        .await?;
+        Ok(disk.into())
+    }
+
+    pub fn is_boot_disk(&self) -> bool {
+        self.is_boot_disk
+    }
+
+    pub fn identity(&self) -> &DiskIdentity {
+        &self.identity
+    }
+
+    pub fn variant(&self) -> DiskVariant {
+        self.variant
+    }
+
+    pub fn devfs_path(&self) -> &Utf8PathBuf {
+        &self.paths.devfs_path
+    }
+
+    pub fn zpool_name(&self) -> &ZpoolName {
+        &self.zpool_name
+    }
+
+    pub fn boot_image_devfs_path(
+        &self,
+        raw: bool,
+    ) -> Result<Utf8PathBuf, PooledDiskError> {
+        self.paths.partition_device_path(
+            &self.partitions,
+            Partition::BootImage,
+            raw,
+        )
+    }
+
+    pub fn dump_device_devfs_path(
+        &self,
+        raw: bool,
+    ) -> Result<Utf8PathBuf, PooledDiskError> {
+        self.paths.partition_device_path(
+            &self.partitions,
+            Partition::DumpDevice,
+            raw,
+        )
+    }
+
+    pub fn slot(&self) -> i64 {
+        self.slot
+    }
+}
+
+impl From<PooledDisk> for Disk {
+    fn from(pd: PooledDisk) -> Self {
+        Self {
+            paths: pd.paths,
+            slot: pd.slot,
+            variant: pd.variant,
+            identity: pd.identity,
+            is_boot_disk: pd.is_boot_disk,
+            partitions: pd.partitions,
+            zpool_name: pd.zpool_name,
+        }
+    }
+}
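`Disk::new` now composes the two layers: hardware adoption via `PooledDisk::new`, then dataset provisioning via `ensure_zpool_has_datasets`, with `From<PooledDisk>` carrying the fields across. The split error enum lets callers see which layer failed. A crate-internal sketch (the logging is illustrative):

```rust
use crate::disk::{Disk, DiskError};
use key_manager::StorageKeyRequester;
use sled_hardware::UnparsedDisk;
use slog::Logger;

async fn try_adopt(
    log: &Logger,
    unparsed: UnparsedDisk,
    key_requester: Option<&StorageKeyRequester>,
) {
    match Disk::new(log, unparsed, key_requester).await {
        Ok(disk) => {
            slog::info!(log, "disk ready"; "zpool" => %disk.zpool_name());
        }
        // Partitioning and zpool problems surface from sled-hardware...
        Err(DiskError::PooledDisk(e)) => {
            slog::warn!(log, "hardware layer failed: {}", e);
        }
        // ...while dataset and key-management problems stay in sled-storage.
        Err(DiskError::Dataset(e)) => {
            slog::warn!(log, "dataset layer failed: {}", e);
        }
    }
}
```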
diff --git a/sled-storage/src/dump_setup.rs b/sled-storage/src/dump_setup.rs
index ea51251f84..5befa8e8c8 100644
--- a/sled-storage/src/dump_setup.rs
+++ b/sled-storage/src/dump_setup.rs
@@ -1,3 +1,10 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Dump dataset setup
+
+use crate::dataset::{CRASH_DATASET, DUMP_DATASET};
 use crate::disk::DiskWrapper;
 use camino::Utf8PathBuf;
 use derive_more::{AsRef, Deref, From};
@@ -70,11 +77,11 @@ trait GetMountpoint: std::ops::Deref {
 }
 impl GetMountpoint for DebugZpool {
     type NewType = DebugDataset;
-    const MOUNTPOINT: &'static str = sled_hardware::disk::DUMP_DATASET;
+    const MOUNTPOINT: &'static str = DUMP_DATASET;
 }
 impl GetMountpoint for CoreZpool {
     type NewType = CoreDataset;
-    const MOUNTPOINT: &'static str = sled_hardware::disk::CRASH_DATASET;
+    const MOUNTPOINT: &'static str = CRASH_DATASET;
 }
 
 struct DumpSetupWorker {
diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs
index d2a2a473b1..04c4f7ec07 100644
--- a/sled-storage/src/error.rs
+++ b/sled-storage/src/error.rs
@@ -12,7 +12,7 @@ use uuid::Uuid;
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     #[error(transparent)]
-    DiskError(#[from] sled_hardware::DiskError),
+    DiskError(#[from] sled_hardware::PooledDiskError),
 
     // TODO: We could add the context of "why are we doing this op", maybe?
     #[error(transparent)]
diff --git a/sled-storage/src/keyfile.rs b/sled-storage/src/keyfile.rs
new file mode 100644
index 0000000000..396c860fc5
--- /dev/null
+++ b/sled-storage/src/keyfile.rs
@@ -0,0 +1,68 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Key file support for ZFS dataset encryption
+
+use illumos_utils::zfs::Keypath;
+use slog::{info, Logger};
+use tokio::fs::{remove_file, File};
+use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom};
+
+/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior
+/// and to ensure it goes away on power off.
+///
+/// We want to minimize the time the key files are in memory, and so we rederive
+/// the keys and recreate the files on demand when creating and mounting
+/// encrypted filesystems. We then zero them and unlink them.
+pub const KEYPATH_ROOT: &str = "/var/run/oxide/";
+
+/// A file that wraps a zfs encryption key.
+///
+/// We put this in a RAM-backed filesystem and zero and delete it when we are
+/// done with it. Unfortunately we cannot do this inside `Drop` because there is no
+/// equivalent async drop.
+pub struct KeyFile {
+    path: Keypath,
+    file: File,
+    log: Logger,
+}
+
+impl KeyFile {
+    pub async fn create(
+        path: Keypath,
+        key: &[u8; 32],
+        log: &Logger,
+    ) -> std::io::Result<KeyFile> {
+        // TODO: fix this to not truncate
+        // We want to overwrite any existing contents.
+        // If we truncate we may leave dirty pages around
+        // containing secrets.
+        let mut file = tokio::fs::OpenOptions::new()
+            .create(true)
+            .write(true)
+            .open(&path.0)
+            .await?;
+        file.write_all(key).await?;
+        info!(log, "Created keyfile {}", path);
+        Ok(KeyFile { path, file, log: log.clone() })
+    }
+
+    /// These keyfiles live on a tmpfs and we zero the file so the data doesn't
+    /// linger on the page in memory.
+    ///
+    /// It'd be nice to `impl Drop for KeyFile` and then call `zero`
+    /// from within the drop handler, but async `Drop` isn't supported.
+    pub async fn zero_and_unlink(&mut self) -> std::io::Result<()> {
+        let zeroes = [0u8; 32];
+        let _ = self.file.seek(SeekFrom::Start(0)).await?;
+        self.file.write_all(&zeroes).await?;
+        info!(self.log, "Zeroed and unlinked keyfile {}", self.path);
+        remove_file(&self.path().0).await?;
+        Ok(())
+    }
+
+    pub fn path(&self) -> &Keypath {
+        &self.path
+    }
+}
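`KeyFile` moved to `sled-storage` verbatim. Its contract is create, use, then `zero_and_unlink`; the zeroing cannot live in `Drop` because it requires async writes. A sketch of the intended lifecycle (crate-internal; the keypath normally derives from the disk identity under `KEYPATH_ROOT` rather than being passed in):

```rust
// Crate-internal sketch of the intended KeyFile lifecycle.
use crate::keyfile::KeyFile;
use illumos_utils::zfs::Keypath;
use slog::Logger;

async fn with_key(
    log: &Logger,
    keypath: Keypath,
    key: &[u8; 32],
) -> std::io::Result<()> {
    let mut keyfile = KeyFile::create(keypath, key, log).await?;
    // ... mount the encrypted dataset while the key file exists ...

    // Explicit cleanup: zero the bytes on the tmpfs page, then unlink.
    // There is no async `Drop` to do this for us.
    keyfile.zero_and_unlink().await?;
    Ok(())
}
```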
diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs
index a1bd4eecfb..783eaf6642 100644
--- a/sled-storage/src/lib.rs
+++ b/sled-storage/src/lib.rs
@@ -3,10 +3,15 @@
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
 //! Local storage abstraction for use by sled-agent
+//!
+//! This abstraction operates at the ZFS level and relies on zpool setup on
+//! hardware partitions from the `sled-hardware` crate. It utilizes the
+//! `illumos-utils` crate to actually perform ZFS-related OS calls.
 
 pub(crate) mod dataset;
 pub(crate) mod disk;
 pub(crate) mod dump_setup;
 pub mod error;
+pub(crate) mod keyfile;
 pub(crate) mod pool;
 pub mod state;
diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs
index 4a9960da4c..1abf43c1de 100644
--- a/sled-storage/src/pool.rs
+++ b/sled-storage/src/pool.rs
@@ -9,9 +9,9 @@
 use illumos_utils::zpool::{ZpoolInfo, ZpoolName};
 use omicron_common::disk::DiskIdentity;
 
 #[cfg(test)]
-use illumos_utils::{zfs::MockZfs as Zfs, zpool::MockZpool as Zpool};
+use illumos_utils::zpool::MockZpool as Zpool;
 #[cfg(not(test))]
-use illumos_utils::{zfs::Zfs, zpool::Zpool};
+use illumos_utils::zpool::Zpool;
 
 /// A ZFS storage pool
 #[derive(Debug, Clone)]
@@ -25,12 +25,12 @@ impl Pool {
     /// Queries for an existing Zpool by name.
     ///
     /// Returns Ok if the pool exists.
-    fn new(name: ZpoolName, parent: DiskIdentity) -> Result<Pool, Error> {
+    pub fn new(name: ZpoolName, parent: DiskIdentity) -> Result<Pool, Error> {
         let info = Zpool::get_info(&name.to_string())?;
         Ok(Pool { name, info, parent })
     }
 
-    fn parent(&self) -> &DiskIdentity {
+    pub fn parent(&self) -> &DiskIdentity {
         &self.parent
     }
 }
diff --git a/sled-storage/src/state.rs b/sled-storage/src/state.rs
index a7de70999e..8a0be34f63 100644
--- a/sled-storage/src/state.rs
+++ b/sled-storage/src/state.rs
@@ -4,6 +4,7 @@
 
 //! The internal state of the storage manager task
 
+use crate::dataset::M2_DEBUG_DATASET;
 use crate::disk::DiskWrapper;
 use crate::pool::Pool;
 use camino::Utf8PathBuf;
@@ -110,7 +111,7 @@
 
     /// Return the directories for storing zone service bundles.
     pub fn all_zone_bundle_directories(&self) -> Vec<Utf8PathBuf> {
-        self.all_m2_mountpoints(sled_hardware::disk::M2_DEBUG_DATASET)
+        self.all_m2_mountpoints(M2_DEBUG_DATASET)
             .into_iter()
             .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY))
             .collect()
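`Pool::new` and `Pool::parent` turn `pub` so state-tracking code (and tests, via `MockZpool`) can construct `Pool`s directly. A sketch of the call under stated assumptions: the identity values are placeholders, and the error type is assumed to be the crate's `Error` as reconstructed above:

```rust
// Crate-internal sketch; assumes the crate's `Error` type from src/error.rs.
use crate::error::Error;
use crate::pool::Pool;
use illumos_utils::zpool::ZpoolName;
use omicron_common::disk::DiskIdentity;
use uuid::Uuid;

fn track_pool() -> Result<(), Error> {
    let name = ZpoolName::new_external(Uuid::new_v4());
    let identity = DiskIdentity {
        vendor: "fake_vendor".to_string(),
        serial: "fake_serial".to_string(),
        model: "fake_model".to_string(),
    };
    // Runs `zpool get` under the hood; fails if the pool doesn't exist.
    let pool = Pool::new(name, identity)?;
    assert_eq!(pool.parent().serial, "fake_serial");
    Ok(())
}
```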