diff --git a/Cargo.lock b/Cargo.lock index c5a1b61122..c0e623d968 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3480,6 +3480,7 @@ dependencies = [ "reqwest", "sha2", "sled-hardware", + "sled-storage", "slog", "slog-async", "slog-envlogger", @@ -4293,6 +4294,8 @@ dependencies = [ "schemars", "serde", "serde_json", + "sled-hardware", + "sled-storage", "slog", "uuid", ] @@ -5297,6 +5300,7 @@ dependencies = [ "sha3", "sled-agent-client", "sled-hardware", + "sled-storage", "slog", "slog-async", "slog-dtrace", @@ -8181,6 +8185,7 @@ dependencies = [ "reqwest", "schemars", "serde", + "sled-storage", "slog", "uuid", ] @@ -8195,11 +8200,9 @@ dependencies = [ "futures", "illumos-devinfo", "illumos-utils", - "key-manager", "libc", "libefi-illumos", "macaddr", - "nexus-client 0.1.0", "omicron-common 0.1.0", "omicron-test-utils", "omicron-workspace-hack", @@ -8214,6 +8217,32 @@ dependencies = [ "uuid", ] +[[package]] +name = "sled-storage" +version = "0.1.0" +dependencies = [ + "async-trait", + "camino", + "camino-tempfile", + "cfg-if 1.0.0", + "derive_more", + "glob", + "illumos-utils", + "key-manager", + "omicron-common 0.1.0", + "omicron-test-utils", + "omicron-workspace-hack", + "rand 0.8.5", + "schemars", + "serde", + "serde_json", + "sled-hardware", + "slog", + "thiserror", + "tokio", + "uuid", +] + [[package]] name = "slog" version = "2.7.0" diff --git a/Cargo.toml b/Cargo.toml index 0e13946533..dfc6fe9c76 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ members = [ "rpaths", "sled-agent", "sled-hardware", + "sled-storage", "sp-sim", "test-utils", "tufaceous-lib", @@ -122,6 +123,7 @@ default-members = [ "rpaths", "sled-agent", "sled-hardware", + "sled-storage", "sp-sim", "test-utils", "tufaceous-lib", @@ -329,6 +331,7 @@ similar-asserts = "1.5.0" sled = "0.34" sled-agent-client = { path = "clients/sled-agent-client" } sled-hardware = { path = "sled-hardware" } +sled-storage = { path = "sled-storage" } slog = { version = "2.7", features = [ "dynamic-keys", 
"max_level_trace", "release_max_level_debug" ] } slog-async = "2.8" slog-dtrace = "0.2" diff --git a/clients/nexus-client/Cargo.toml b/clients/nexus-client/Cargo.toml index 2734142f9f..239cb77789 100644 --- a/clients/nexus-client/Cargo.toml +++ b/clients/nexus-client/Cargo.toml @@ -10,6 +10,8 @@ futures.workspace = true ipnetwork.workspace = true omicron-common.workspace = true omicron-passwords.workspace = true +sled-hardware.workspace = true +sled-storage.workspace = true progenitor.workspace = true regress.workspace = true reqwest = { workspace = true, features = ["rustls-tls", "stream"] } diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 23ceb114fc..9f81492d10 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -388,3 +388,36 @@ impl From } } } + +impl From for types::PhysicalDiskKind { + fn from(value: sled_hardware::DiskVariant) -> Self { + match value { + sled_hardware::DiskVariant::U2 => types::PhysicalDiskKind::U2, + sled_hardware::DiskVariant::M2 => types::PhysicalDiskKind::M2, + } + } +} + +impl From for types::Baseboard { + fn from(b: sled_hardware::Baseboard) -> types::Baseboard { + types::Baseboard { + serial_number: b.identifier().to_string(), + part_number: b.model().to_string(), + revision: b.revision(), + } + } +} + +impl From for types::DatasetKind { + fn from(k: sled_storage::dataset::DatasetKind) -> Self { + use sled_storage::dataset::DatasetKind::*; + match k { + CockroachDb => Self::Cockroach, + Crucible => Self::Crucible, + Clickhouse => Self::Clickhouse, + ClickhouseKeeper => Self::ClickhouseKeeper, + ExternalDns => Self::ExternalDns, + InternalDns => Self::InternalDns, + } + } +} diff --git a/clients/sled-agent-client/Cargo.toml b/clients/sled-agent-client/Cargo.toml index 8630030b24..18ca342a2b 100644 --- a/clients/sled-agent-client/Cargo.toml +++ b/clients/sled-agent-client/Cargo.toml @@ -15,5 +15,6 @@ reqwest = { workspace = true, features = [ "json", "rustls-tls", 
"stream" ] } schemars.workspace = true serde.workspace = true slog.workspace = true +sled-storage.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true diff --git a/common/src/disk.rs b/common/src/disk.rs index 3ea8091326..3ae9c31e01 100644 --- a/common/src/disk.rs +++ b/common/src/disk.rs @@ -5,7 +5,7 @@ //! Disk related types shared among crates /// Uniquely identifies a disk. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] pub struct DiskIdentity { pub vendor: String, pub serial: String, diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index a291a15e78..497454e047 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -44,3 +44,6 @@ toml.workspace = true [features] # Enable to generate MockZones testing = ["mockall"] +# Useful for tests that want real functionality and ability to run without +# pfexec +tmp_keypath = [] diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index 345f097ae2..1faa4c5c37 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -4,6 +4,9 @@ //! Wrappers around illumos-specific commands. +#[allow(unused)] +use std::sync::atomic::{AtomicBool, Ordering}; + use cfg_if::cfg_if; pub mod addrobj; @@ -93,7 +96,7 @@ mod inner { // Helper function for starting the process and checking the // exit code result. - pub fn execute( + pub fn execute_helper( command: &mut std::process::Command, ) -> Result { let output = command.output().map_err(|err| { @@ -108,6 +111,34 @@ mod inner { } } +// Due to feature unification, the `testing` feature is enabled when some tests +// don't actually want to use it. We allow them to opt out of the use of the +// free function here. We also explicitly opt-in where mocks are used. +// +// Note that this only works if the tests that use mocks and those that don't +// are run sequentially. 
However, this is how we do things in CI with nextest, +// so there is no problem currently. +// +// We can remove all this when we get rid of the mocks. +#[cfg(any(test, feature = "testing"))] +pub static USE_MOCKS: AtomicBool = AtomicBool::new(false); + +pub fn execute( + command: &mut std::process::Command, +) -> Result { + cfg_if! { + if #[cfg(any(test, feature = "testing"))] { + if USE_MOCKS.load(Ordering::SeqCst) { + mock_inner::execute_helper(command) + } else { + inner::execute_helper(command) + } + } else { + inner::execute_helper(command) + } + } +} + cfg_if! { if #[cfg(any(test, feature = "testing"))] { pub use mock_inner::*; diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index a6af997619..e9554100af 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -20,7 +20,16 @@ pub const ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT: &str = "/zone"; pub const ZONE_ZFS_RAMDISK_DATASET: &str = "rpool/zone"; pub const ZFS: &str = "/usr/sbin/zfs"; + +/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior +/// and to ensure it goes away on power off. +/// +/// We want to minimize the time the key files are in memory, and so we rederive +/// the keys and recreate the files on demand when creating and mounting +/// encrypted filesystems. We then zero them and unlink them. pub const KEYPATH_ROOT: &str = "/var/run/oxide/"; +// Use /tmp so we don't have to worry about running tests with pfexec +pub const TEST_KEYPATH_ROOT: &str = "/tmp"; /// Error returned by [`Zfs::list_datasets`]. 
#[derive(thiserror::Error, Debug)] @@ -158,19 +167,27 @@ impl fmt::Display for Keypath { } } +#[cfg(not(feature = "tmp_keypath"))] +impl From<&DiskIdentity> for Keypath { + fn from(id: &DiskIdentity) -> Self { + build_keypath(id, KEYPATH_ROOT) + } +} + +#[cfg(feature = "tmp_keypath")] impl From<&DiskIdentity> for Keypath { fn from(id: &DiskIdentity) -> Self { - let filename = format!( - "{}-{}-{}-zfs-aes-256-gcm.key", - id.vendor, id.serial, id.model - ); - let mut path = Utf8PathBuf::new(); - path.push(KEYPATH_ROOT); - path.push(filename); - Keypath(path) + build_keypath(id, TEST_KEYPATH_ROOT) } } +fn build_keypath(id: &DiskIdentity, root: &str) -> Keypath { + let filename = + format!("{}-{}-{}-zfs-aes-256-gcm.key", id.vendor, id.serial, id.model); + let path: Utf8PathBuf = [root, &filename].iter().collect(); + Keypath(path) +} + #[derive(Debug)] pub struct EncryptionDetails { pub keypath: Keypath, diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index 81ded2655e..f2c395e22b 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -39,6 +39,13 @@ pub struct CreateError { err: Error, } +#[derive(thiserror::Error, Debug)] +#[error("Failed to destroy zpool: {err}")] +pub struct DestroyError { + #[from] + err: Error, +} + #[derive(thiserror::Error, Debug)] #[error("Failed to list zpools: {err}")] pub struct ListError { @@ -89,7 +96,7 @@ impl FromStr for ZpoolHealth { } /// Describes a Zpool. 
-#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct ZpoolInfo { name: String, size: u64, @@ -121,6 +128,17 @@ impl ZpoolInfo { pub fn health(&self) -> ZpoolHealth { self.health } + + #[cfg(any(test, feature = "testing"))] + pub fn new_hardcoded(name: String) -> ZpoolInfo { + ZpoolInfo { + name, + size: 1024 * 1024 * 64, + allocated: 1024, + free: 1024 * 1023 * 64, + health: ZpoolHealth::Online, + } + } } impl FromStr for ZpoolInfo { @@ -167,7 +185,10 @@ pub struct Zpool {} #[cfg_attr(any(test, feature = "testing"), mockall::automock, allow(dead_code))] impl Zpool { - pub fn create(name: ZpoolName, vdev: &Utf8Path) -> Result<(), CreateError> { + pub fn create( + name: &ZpoolName, + vdev: &Utf8Path, + ) -> Result<(), CreateError> { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear(); cmd.env("LC_ALL", "C.UTF-8"); @@ -189,7 +210,17 @@ impl Zpool { Ok(()) } - pub fn import(name: ZpoolName) -> Result<(), Error> { + pub fn destroy(name: &ZpoolName) -> Result<(), DestroyError> { + let mut cmd = std::process::Command::new(PFEXEC); + cmd.env_clear(); + cmd.env("LC_ALL", "C.UTF-8"); + cmd.arg(ZPOOL).arg("destroy"); + cmd.arg(&name.to_string()); + execute(&mut cmd).map_err(Error::from)?; + Ok(()) + } + + pub fn import(name: &ZpoolName) -> Result<(), Error> { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear(); cmd.env("LC_ALL", "C.UTF-8"); diff --git a/installinator/Cargo.toml b/installinator/Cargo.toml index 5a7c6bd0e5..d489e73ec1 100644 --- a/installinator/Cargo.toml +++ b/installinator/Cargo.toml @@ -27,6 +27,7 @@ omicron-common.workspace = true reqwest.workspace = true sha2.workspace = true sled-hardware.workspace = true +sled-storage.workspace = true slog.workspace = true slog-async.workspace = true slog-envlogger.workspace = true diff --git a/installinator/src/hardware.rs b/installinator/src/hardware.rs index ffa0b74739..b037384cbe 100644 --- a/installinator/src/hardware.rs +++ b/installinator/src/hardware.rs @@ -6,10 
+6,11 @@ use anyhow::anyhow; use anyhow::ensure; use anyhow::Context; use anyhow::Result; -use sled_hardware::Disk; use sled_hardware::DiskVariant; use sled_hardware::HardwareManager; use sled_hardware::SledMode; +use sled_storage::disk::Disk; +use sled_storage::disk::RawDisk; use slog::info; use slog::Logger; @@ -28,7 +29,8 @@ impl Hardware { anyhow!("failed to create HardwareManager: {err}") })?; - let disks = hardware.disks(); + let disks: Vec = + hardware.disks().into_iter().map(|disk| disk.into()).collect(); info!( log, "found gimlet hardware"; diff --git a/installinator/src/write.rs b/installinator/src/write.rs index 6c0c1f63c7..22dd2adbf6 100644 --- a/installinator/src/write.rs +++ b/installinator/src/write.rs @@ -122,8 +122,9 @@ impl WriteDestination { ); let zpool_name = disk.zpool_name().clone(); - let control_plane_dir = zpool_name - .dataset_mountpoint(sled_hardware::INSTALL_DATASET); + let control_plane_dir = zpool_name.dataset_mountpoint( + sled_storage::dataset::INSTALL_DATASET, + ); match drives.entry(slot) { Entry::Vacant(entry) => { diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 827cb131cb..46148304f8 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -58,6 +58,7 @@ serde_json = {workspace = true, features = ["raw_value"]} sha3.workspace = true sled-agent-client.workspace = true sled-hardware.workspace = true +sled-storage.workspace = true slog.workspace = true slog-async.workspace = true slog-dtrace.workspace = true @@ -94,7 +95,8 @@ slog-async.workspace = true slog-term.workspace = true tempfile.workspace = true -illumos-utils = { workspace = true, features = ["testing"] } +illumos-utils = { workspace = true, features = ["testing", "tmp_keypath"] } +sled-storage = { workspace = true, features = ["testing"] } # # Disable doc builds by default for our binaries to work around issue diff --git a/sled-agent/src/backing_fs.rs b/sled-agent/src/backing_fs.rs index 6ecb9dac43..2e9ea4c8d9 100644 --- 
a/sled-agent/src/backing_fs.rs +++ b/sled-agent/src/backing_fs.rs @@ -128,7 +128,7 @@ pub(crate) fn ensure_backing_fs( let dataset = format!( "{}/{}/{}", boot_zpool_name, - sled_hardware::disk::M2_BACKING_DATASET, + sled_storage::dataset::M2_BACKING_DATASET, bfs.name ); let mountpoint = Mountpoint::Path(Utf8PathBuf::from(bfs.mountpoint)); diff --git a/sled-agent/src/bootstrap/bootstore.rs b/sled-agent/src/bootstrap/bootstore_setup.rs similarity index 55% rename from sled-agent/src/bootstrap/bootstore.rs rename to sled-agent/src/bootstrap/bootstore_setup.rs index 17267bef55..9eb0a87c03 100644 --- a/sled-agent/src/bootstrap/bootstore.rs +++ b/sled-agent/src/bootstrap/bootstore_setup.rs @@ -5,124 +5,78 @@ //! Helpers for configuring and starting the bootstore during bootstrap agent //! startup. +#![allow(clippy::result_large_err)] + use super::config::BOOTSTORE_PORT; use super::server::StartError; -use crate::storage_manager::StorageResources; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use ddm_admin_client::Client as DdmAdminClient; use sled_hardware::underlay::BootstrapInterface; use sled_hardware::Baseboard; +use sled_storage::dataset::CLUSTER_DATASET; +use sled_storage::resources::StorageResources; use slog::Logger; use std::collections::BTreeSet; use std::net::Ipv6Addr; use std::net::SocketAddrV6; use std::time::Duration; -use tokio::task::JoinHandle; const BOOTSTORE_FSM_STATE_FILE: &str = "bootstore-fsm-state.json"; const BOOTSTORE_NETWORK_CONFIG_FILE: &str = "bootstore-network-config.json"; -pub(super) struct BootstoreHandles { - pub(super) node_handle: bootstore::NodeHandle, - - // These two are never used; we keep them to show ownership of the spawned - // tasks. 
- _node_task_handle: JoinHandle<()>, - _peer_update_task_handle: JoinHandle<()>, -} - -impl BootstoreHandles { - pub(super) async fn spawn( - storage_resources: &StorageResources, - ddm_admin_client: DdmAdminClient, - baseboard: Baseboard, - global_zone_bootstrap_ip: Ipv6Addr, - base_log: &Logger, - ) -> Result { - let config = bootstore::Config { - id: baseboard, - addr: SocketAddrV6::new( - global_zone_bootstrap_ip, - BOOTSTORE_PORT, - 0, - 0, - ), - time_per_tick: Duration::from_millis(250), - learn_timeout: Duration::from_secs(5), - rack_init_timeout: Duration::from_secs(300), - rack_secret_request_timeout: Duration::from_secs(5), - fsm_state_ledger_paths: bootstore_fsm_state_paths( - &storage_resources, - ) - .await?, - network_config_ledger_paths: bootstore_network_config_paths( - &storage_resources, - ) - .await?, - }; - - let (mut node, node_handle) = - bootstore::Node::new(config, base_log).await; - - let join_handle = tokio::spawn(async move { node.run().await }); - - // Spawn a task for polling DDMD and updating bootstore - let peer_update_handle = - tokio::spawn(poll_ddmd_for_bootstore_peer_update( - base_log.new(o!("component" => "bootstore_ddmd_poller")), - node_handle.clone(), - ddm_admin_client, - )); - - Ok(Self { - node_handle, - _node_task_handle: join_handle, - _peer_update_task_handle: peer_update_handle, - }) - } +pub fn new_bootstore_config( + storage_resources: &StorageResources, + baseboard: Baseboard, + global_zone_bootstrap_ip: Ipv6Addr, +) -> Result { + Ok(bootstore::Config { + id: baseboard, + addr: SocketAddrV6::new(global_zone_bootstrap_ip, BOOTSTORE_PORT, 0, 0), + time_per_tick: Duration::from_millis(250), + learn_timeout: Duration::from_secs(5), + rack_init_timeout: Duration::from_secs(300), + rack_secret_request_timeout: Duration::from_secs(5), + fsm_state_ledger_paths: bootstore_fsm_state_paths(&storage_resources)?, + network_config_ledger_paths: bootstore_network_config_paths( + &storage_resources, + )?, + }) } -async fn 
bootstore_fsm_state_paths( +fn bootstore_fsm_state_paths( storage: &StorageResources, ) -> Result, StartError> { let paths: Vec<_> = storage - .all_m2_mountpoints(sled_hardware::disk::CLUSTER_DATASET) - .await + .all_m2_mountpoints(CLUSTER_DATASET) .into_iter() .map(|p| p.join(BOOTSTORE_FSM_STATE_FILE)) .collect(); if paths.is_empty() { - return Err(StartError::MissingM2Paths( - sled_hardware::disk::CLUSTER_DATASET, - )); + return Err(StartError::MissingM2Paths(CLUSTER_DATASET)); } Ok(paths) } -async fn bootstore_network_config_paths( +fn bootstore_network_config_paths( storage: &StorageResources, ) -> Result, StartError> { let paths: Vec<_> = storage - .all_m2_mountpoints(sled_hardware::disk::CLUSTER_DATASET) - .await + .all_m2_mountpoints(CLUSTER_DATASET) .into_iter() .map(|p| p.join(BOOTSTORE_NETWORK_CONFIG_FILE)) .collect(); if paths.is_empty() { - return Err(StartError::MissingM2Paths( - sled_hardware::disk::CLUSTER_DATASET, - )); + return Err(StartError::MissingM2Paths(CLUSTER_DATASET)); } Ok(paths) } -async fn poll_ddmd_for_bootstore_peer_update( +pub async fn poll_ddmd_for_bootstore_peer_update( log: Logger, bootstore_node_handle: bootstore::NodeHandle, - ddmd_client: DdmAdminClient, ) { let mut current_peers: BTreeSet = BTreeSet::new(); // We're talking to a service's admin interface on localhost and @@ -132,7 +86,7 @@ async fn poll_ddmd_for_bootstore_peer_update( // We also use this timeout in the case of spurious ddmd failures // that require a reconnection from the ddmd_client. const RETRY: tokio::time::Duration = tokio::time::Duration::from_secs(5); - + let ddmd_client = DdmAdminClient::localhost(&log).unwrap(); loop { match ddmd_client .derive_bootstrap_addrs_from_prefixes(&[ @@ -154,7 +108,7 @@ async fn poll_ddmd_for_bootstore_peer_update( log, concat!( "Bootstore comms error: {}. 
", - "bootstore::Node task must have paniced", + "bootstore::Node task must have panicked", ), e ); diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index c69bdeb0ce..7c32bf48a5 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -12,7 +12,6 @@ use super::BootstrapError; use super::RssAccessError; use crate::bootstrap::params::RackInitializeRequest; use crate::bootstrap::rack_ops::{RackInitId, RackResetId}; -use crate::storage_manager::StorageResources; use crate::updates::ConfigUpdates; use crate::updates::{Component, UpdateManager}; use bootstore::schemes::v0 as bootstore; @@ -25,6 +24,7 @@ use omicron_common::api::external::Error; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_hardware::Baseboard; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::Ipv6Addr; use tokio::sync::mpsc::error::TrySendError; @@ -33,7 +33,7 @@ use tokio::sync::{mpsc, oneshot}; pub(crate) struct BootstrapServerContext { pub(crate) base_log: Logger, pub(crate) global_zone_bootstrap_ip: Ipv6Addr, - pub(crate) storage_resources: StorageResources, + pub(crate) storage_manager: StorageHandle, pub(crate) bootstore_node_handle: bootstore::NodeHandle, pub(crate) baseboard: Baseboard, pub(crate) rss_access: RssAccess, @@ -50,7 +50,7 @@ impl BootstrapServerContext { self.rss_access.start_initializing( &self.base_log, self.global_zone_bootstrap_ip, - &self.storage_resources, + &self.storage_manager, &self.bootstore_node_handle, request, ) diff --git a/sled-agent/src/bootstrap/mod.rs b/sled-agent/src/bootstrap/mod.rs index 96e674acf3..590e13c891 100644 --- a/sled-agent/src/bootstrap/mod.rs +++ b/sled-agent/src/bootstrap/mod.rs @@ -4,7 +4,7 @@ //! 
Bootstrap-related utilities -mod bootstore; +pub(crate) mod bootstore_setup; pub mod client; pub mod config; pub mod early_networking; @@ -14,7 +14,7 @@ pub(crate) mod params; mod pre_server; mod rack_ops; pub(crate) mod rss_handle; -mod secret_retriever; +pub mod secret_retriever; pub mod server; mod sprockets_server; mod views; diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 05493f5aa3..02710ff583 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -11,13 +11,15 @@ #![allow(clippy::result_large_err)] use super::maghemite; -use super::secret_retriever::LrtqOrHardcodedSecretRetriever; use super::server::StartError; use crate::config::Config; use crate::config::SidecarRevision; +use crate::long_running_tasks::{ + spawn_all_longrunning_tasks, LongRunningTaskHandles, +}; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use crate::storage_manager::StorageManager; +use crate::storage_monitor::UnderlayAccess; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; @@ -30,115 +32,16 @@ use illumos_utils::zfs; use illumos_utils::zfs::Zfs; use illumos_utils::zone; use illumos_utils::zone::Zones; -use key_manager::KeyManager; -use key_manager::StorageKeyRequester; use omicron_common::address::Ipv6Subnet; use omicron_common::FileKv; use sled_hardware::underlay; use sled_hardware::DendriteAsic; -use sled_hardware::HardwareManager; -use sled_hardware::HardwareUpdate; use sled_hardware::SledMode; use slog::Drain; use slog::Logger; use std::net::IpAddr; use std::net::Ipv6Addr; -use tokio::sync::broadcast; -use tokio::task::JoinHandle; - -pub(super) struct BootstrapManagers { - pub(super) hardware: HardwareManager, - pub(super) storage: StorageManager, - pub(super) service: ServiceManager, -} - -impl BootstrapManagers { - pub(super) async fn handle_hardware_update( - &self, - update: Result, - sled_agent: 
Option<&SledAgent>, - log: &Logger, - ) { - match update { - Ok(update) => match update { - HardwareUpdate::TofinoLoaded => { - let baseboard = self.hardware.baseboard(); - if let Err(e) = self - .service - .activate_switch( - sled_agent.map(|sa| sa.switch_zone_underlay_info()), - baseboard, - ) - .await - { - warn!(log, "Failed to activate switch: {e}"); - } - } - HardwareUpdate::TofinoUnloaded => { - if let Err(e) = self.service.deactivate_switch().await { - warn!(log, "Failed to deactivate switch: {e}"); - } - } - HardwareUpdate::TofinoDeviceChange => { - if let Some(sled_agent) = sled_agent { - sled_agent.notify_nexus_about_self(log); - } - } - HardwareUpdate::DiskAdded(disk) => { - self.storage.upsert_disk(disk).await; - } - HardwareUpdate::DiskRemoved(disk) => { - self.storage.delete_disk(disk).await; - } - }, - Err(broadcast::error::RecvError::Lagged(count)) => { - warn!(log, "Hardware monitor missed {count} messages"); - self.check_latest_hardware_snapshot(sled_agent, log).await; - } - Err(broadcast::error::RecvError::Closed) => { - // The `HardwareManager` monitoring task is an infinite loop - - // the only way for us to get `Closed` here is if it panicked, - // so we will propagate such a panic. - panic!("Hardware manager monitor task panicked"); - } - } - } - - // Observe the current hardware state manually. - // - // We use this when we're monitoring hardware for the first - // time, and if we miss notifications. 
- pub(super) async fn check_latest_hardware_snapshot( - &self, - sled_agent: Option<&SledAgent>, - log: &Logger, - ) { - let underlay_network = sled_agent.map(|sled_agent| { - sled_agent.notify_nexus_about_self(log); - sled_agent.switch_zone_underlay_info() - }); - info!( - log, "Checking current full hardware snapshot"; - "underlay_network_info" => ?underlay_network, - ); - if self.hardware.is_scrimlet_driver_loaded() { - let baseboard = self.hardware.baseboard(); - if let Err(e) = - self.service.activate_switch(underlay_network, baseboard).await - { - warn!(log, "Failed to activate switch: {e}"); - } - } else { - if let Err(e) = self.service.deactivate_switch().await { - warn!(log, "Failed to deactivate switch: {e}"); - } - } - - self.storage - .ensure_using_exactly_these_disks(self.hardware.disks()) - .await; - } -} +use tokio::sync::oneshot; pub(super) struct BootstrapAgentStartup { pub(super) config: Config, @@ -146,8 +49,10 @@ pub(super) struct BootstrapAgentStartup { pub(super) ddm_admin_localhost_client: DdmAdminClient, pub(super) base_log: Logger, pub(super) startup_log: Logger, - pub(super) managers: BootstrapManagers, - pub(super) key_manager_handle: JoinHandle<()>, + pub(super) service_manager: ServiceManager, + pub(super) long_running_task_handles: LongRunningTaskHandles, + pub(super) sled_agent_started_tx: oneshot::Sender, + pub(super) underlay_available_tx: oneshot::Sender, } impl BootstrapAgentStartup { @@ -201,36 +106,23 @@ impl BootstrapAgentStartup { // This should be a no-op if already enabled. BootstrapNetworking::enable_ipv6_forwarding().await?; - // Spawn the `KeyManager` which is needed by the the StorageManager to - // retrieve encryption keys. - let (storage_key_requester, key_manager_handle) = - spawn_key_manager_task(&base_log); - + // Are we a gimlet or scrimlet? let sled_mode = sled_mode_from_config(&config)?; - // Start monitoring hardware. 
This is blocking so we use - // `spawn_blocking`; similar to above, we move some things in and (on - // success) it gives them back. - let (base_log, log, hardware_manager) = { - tokio::task::spawn_blocking(move || { - info!( - log, "Starting hardware monitor"; - "sled_mode" => ?sled_mode, - ); - let hardware_manager = - HardwareManager::new(&base_log, sled_mode) - .map_err(StartError::StartHardwareManager)?; - Ok::<_, StartError>((base_log, log, hardware_manager)) - }) - .await - .unwrap()? - }; - - // Create a `StorageManager` and (possibly) synthetic disks. - let storage_manager = - StorageManager::new(&base_log, storage_key_requester).await; - upsert_synthetic_zpools_if_needed(&log, &storage_manager, &config) - .await; + // Spawn all important long running tasks that live for the lifetime of + // the process and are used by both the bootstrap agent and sled agent + let ( + long_running_task_handles, + sled_agent_started_tx, + service_manager_ready_tx, + underlay_available_tx, + ) = spawn_all_longrunning_tasks( + &base_log, + sled_mode, + startup_networking.global_zone_bootstrap_ip, + &config, + ) + .await; let global_zone_bootstrap_ip = startup_networking.global_zone_bootstrap_ip; @@ -243,22 +135,27 @@ impl BootstrapAgentStartup { config.skip_timesync, config.sidecar_revision.clone(), config.switch_zone_maghemite_links.clone(), - storage_manager.resources().clone(), - storage_manager.zone_bundler().clone(), + long_running_task_handles.storage_manager.clone(), + long_running_task_handles.zone_bundler.clone(), ); + // Inform the hardware monitor that the service manager is ready + // This is a onetime operation, and so we use a oneshot channel + service_manager_ready_tx + .send(service_manager.clone()) + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); + Ok(Self { config, global_zone_bootstrap_ip, ddm_admin_localhost_client, base_log, startup_log: log, - managers: BootstrapManagers { - hardware: hardware_manager, - storage: storage_manager, - 
service: service_manager, - }, - key_manager_handle, + service_manager, + long_running_task_handles, + sled_agent_started_tx, + underlay_available_tx, }) } } @@ -359,13 +256,10 @@ fn ensure_zfs_key_directory_exists(log: &Logger) -> Result<(), StartError> { // to create and mount encrypted datasets. info!( log, "Ensuring zfs key directory exists"; - "path" => sled_hardware::disk::KEYPATH_ROOT, + "path" => zfs::KEYPATH_ROOT, ); - std::fs::create_dir_all(sled_hardware::disk::KEYPATH_ROOT).map_err(|err| { - StartError::CreateZfsKeyDirectory { - dir: sled_hardware::disk::KEYPATH_ROOT, - err, - } + std::fs::create_dir_all(zfs::KEYPATH_ROOT).map_err(|err| { + StartError::CreateZfsKeyDirectory { dir: zfs::KEYPATH_ROOT, err } }) } @@ -388,23 +282,6 @@ fn ensure_zfs_ramdisk_dataset() -> Result<(), StartError> { .map_err(StartError::EnsureZfsRamdiskDataset) } -async fn upsert_synthetic_zpools_if_needed( - log: &Logger, - storage_manager: &StorageManager, - config: &Config, -) { - if let Some(pools) = &config.zpools { - for pool in pools { - info!( - log, - "Upserting synthetic zpool to Storage Manager: {}", - pool.to_string() - ); - storage_manager.upsert_synthetic_disk(pool.clone()).await; - } - } -} - // Combine the `sled_mode` config with the build-time switch type to determine // the actual sled mode. 
fn sled_mode_from_config(config: &Config) -> Result { @@ -447,19 +324,6 @@ fn sled_mode_from_config(config: &Config) -> Result { Ok(sled_mode) } -fn spawn_key_manager_task( - log: &Logger, -) -> (StorageKeyRequester, JoinHandle<()>) { - let secret_retriever = LrtqOrHardcodedSecretRetriever::new(); - let (mut key_manager, storage_key_requester) = - KeyManager::new(log, secret_retriever); - - let key_manager_handle = - tokio::spawn(async move { key_manager.run().await }); - - (storage_key_requester, key_manager_handle) -} - #[derive(Debug, Clone)] pub(crate) struct BootstrapNetworking { pub(crate) bootstrap_etherstub: dladm::Etherstub, diff --git a/sled-agent/src/bootstrap/rack_ops.rs b/sled-agent/src/bootstrap/rack_ops.rs index b8721f8332..5cfd0b074a 100644 --- a/sled-agent/src/bootstrap/rack_ops.rs +++ b/sled-agent/src/bootstrap/rack_ops.rs @@ -8,11 +8,11 @@ use crate::bootstrap::http_entrypoints::RackOperationStatus; use crate::bootstrap::params::RackInitializeRequest; use crate::bootstrap::rss_handle::RssHandle; use crate::rack_setup::service::SetupServiceError; -use crate::storage_manager::StorageResources; use bootstore::schemes::v0 as bootstore; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::mem; use std::net::Ipv6Addr; @@ -171,7 +171,7 @@ impl RssAccess { &self, parent_log: &Logger, global_zone_bootstrap_ip: Ipv6Addr, - storage_resources: &StorageResources, + storage_manager: &StorageHandle, bootstore_node_handle: &bootstore::NodeHandle, request: RackInitializeRequest, ) -> Result { @@ -207,14 +207,14 @@ impl RssAccess { mem::drop(status); let parent_log = parent_log.clone(); - let storage_resources = storage_resources.clone(); + let storage_manager = storage_manager.clone(); let bootstore_node_handle = bootstore_node_handle.clone(); let status = Arc::clone(&self.status); tokio::spawn(async move { let result = rack_initialize( &parent_log, global_zone_bootstrap_ip, 
- storage_resources, + storage_manager, bootstore_node_handle, request, ) @@ -342,7 +342,7 @@ enum RssStatus { async fn rack_initialize( parent_log: &Logger, global_zone_bootstrap_ip: Ipv6Addr, - storage_resources: StorageResources, + storage_manager: StorageHandle, bootstore_node_handle: bootstore::NodeHandle, request: RackInitializeRequest, ) -> Result<(), SetupServiceError> { @@ -350,7 +350,7 @@ async fn rack_initialize( parent_log, request, global_zone_bootstrap_ip, - storage_resources, + storage_manager, bootstore_node_handle, ) .await diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index c82873d91d..5d9c01e7f2 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -9,7 +9,6 @@ use super::params::StartSledAgentRequest; use crate::rack_setup::config::SetupServiceConfig; use crate::rack_setup::service::RackSetupService; use crate::rack_setup::service::SetupServiceError; -use crate::storage_manager::StorageResources; use ::bootstrap_agent_client::Client as BootstrapAgentClient; use bootstore::schemes::v0 as bootstore; use futures::stream::FuturesUnordered; @@ -17,6 +16,7 @@ use futures::StreamExt; use omicron_common::backoff::retry_notify; use omicron_common::backoff::retry_policy_local; use omicron_common::backoff::BackoffError; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::Ipv6Addr; use std::net::SocketAddrV6; @@ -46,7 +46,7 @@ impl RssHandle { log: &Logger, config: SetupServiceConfig, our_bootstrap_address: Ipv6Addr, - storage_resources: StorageResources, + storage_manager: StorageHandle, bootstore: bootstore::NodeHandle, ) -> Result<(), SetupServiceError> { let (tx, rx) = rss_channel(our_bootstrap_address); @@ -54,7 +54,7 @@ impl RssHandle { let rss = RackSetupService::new( log.new(o!("component" => "RSS")), config, - storage_resources, + storage_manager, tx, bootstore, ); diff --git a/sled-agent/src/bootstrap/server.rs 
b/sled-agent/src/bootstrap/server.rs index e53fab8ffa..999e4cc0c8 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -8,12 +8,10 @@ use super::config::BOOTSTRAP_AGENT_HTTP_PORT; use super::http_entrypoints; use super::params::RackInitializeRequest; use super::params::StartSledAgentRequest; -use super::pre_server::BootstrapManagers; use super::rack_ops::RackInitId; use super::views::SledAgentResponse; use super::BootstrapError; use super::RssAccessError; -use crate::bootstrap::bootstore::BootstoreHandles; use crate::bootstrap::config::BOOTSTRAP_AGENT_RACK_INIT_PORT; use crate::bootstrap::http_entrypoints::api as http_api; use crate::bootstrap::http_entrypoints::BootstrapServerContext; @@ -24,16 +22,17 @@ use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; use crate::bootstrap::sprockets_server::SprocketsServer; use crate::config::Config as SledConfig; use crate::config::ConfigError; +use crate::long_running_tasks::LongRunningTaskHandles; use crate::server::Server as SledAgentServer; +use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use crate::storage_manager::StorageResources; +use crate::storage_monitor::UnderlayAccess; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use ddm_admin_client::Client as DdmAdminClient; use ddm_admin_client::DdmError; use dropshot::HttpServer; -use futures::Future; use futures::StreamExt; use illumos_utils::dladm; use illumos_utils::zfs; @@ -42,12 +41,12 @@ use illumos_utils::zone::Zones; use omicron_common::ledger; use omicron_common::ledger::Ledger; use sled_hardware::underlay; -use sled_hardware::HardwareUpdate; +use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::io; use std::net::SocketAddr; use std::net::SocketAddrV6; -use tokio::sync::broadcast; use tokio::sync::mpsc; use tokio::sync::oneshot; use tokio::task::JoinHandle; @@ -175,65 
+174,18 @@ impl Server { ddm_admin_localhost_client, base_log, startup_log, - managers, - key_manager_handle, + service_manager, + long_running_task_handles, + sled_agent_started_tx, + underlay_available_tx, } = BootstrapAgentStartup::run(config).await?; - // From this point on we will listen for hardware notifications and - // potentially start the switch zone and be notified of new disks; we - // are responsible for responding to updates from this point on. - let mut hardware_monitor = managers.hardware.monitor(); - let storage_resources = managers.storage.resources(); - - // Check the latest hardware snapshot; we could have missed events - // between the creation of the hardware manager and our subscription of - // its monitor. - managers.check_latest_hardware_snapshot(None, &startup_log).await; - - // Wait for our boot M.2 to show up. - wait_while_handling_hardware_updates( - wait_for_boot_m2(storage_resources, &startup_log), - &mut hardware_monitor, - &managers, - None, // No underlay network yet - &startup_log, - "waiting for boot M.2", - ) - .await; - - // Wait for the bootstore to start. - let bootstore_handles = wait_while_handling_hardware_updates( - BootstoreHandles::spawn( - storage_resources, - ddm_admin_localhost_client.clone(), - managers.hardware.baseboard(), - global_zone_bootstrap_ip, - &base_log, - ), - &mut hardware_monitor, - &managers, - None, // No underlay network yet - &startup_log, - "initializing bootstore", - ) - .await?; - // Do we have a StartSledAgentRequest stored in the ledger? 
- let maybe_ledger = wait_while_handling_hardware_updates( - async { - let paths = sled_config_paths(storage_resources).await?; - let maybe_ledger = - Ledger::::new(&startup_log, paths) - .await; - Ok::<_, StartError>(maybe_ledger) - }, - &mut hardware_monitor, - &managers, - None, // No underlay network yet - &startup_log, - "loading sled-agent request from ledger", - ) - .await?; + let paths = + sled_config_paths(&long_running_task_handles.storage_manager) + .await?; + let maybe_ledger = + Ledger::::new(&startup_log, paths).await; // We don't yet _act_ on the `StartSledAgentRequest` if we have one, but // if we have one we init our `RssAccess` noting that we're already @@ -250,9 +202,9 @@ impl Server { let bootstrap_context = BootstrapServerContext { base_log: base_log.clone(), global_zone_bootstrap_ip, - storage_resources: storage_resources.clone(), - bootstore_node_handle: bootstore_handles.node_handle.clone(), - baseboard: managers.hardware.baseboard(), + storage_manager: long_running_task_handles.storage_manager.clone(), + bootstore_node_handle: long_running_task_handles.bootstore.clone(), + baseboard: long_running_task_handles.hardware_manager.baseboard(), rss_access, updates: config.updates.clone(), sled_reset_tx, @@ -284,55 +236,36 @@ impl Server { // Do we have a persistent sled-agent request that we need to restore? 
let state = if let Some(ledger) = maybe_ledger { let start_sled_agent_request = ledger.into_inner(); - let sled_agent_server = wait_while_handling_hardware_updates( - start_sled_agent( - &config, - start_sled_agent_request, - &bootstore_handles.node_handle, - &managers, - &ddm_admin_localhost_client, - &base_log, - &startup_log, - ), - &mut hardware_monitor, - &managers, - None, // No underlay network yet + let sled_agent_server = start_sled_agent( + &config, + start_sled_agent_request, + long_running_task_handles.clone(), + underlay_available_tx, + service_manager.clone(), + &ddm_admin_localhost_client, + &base_log, &startup_log, - "restoring sled-agent (cold boot)", ) .await?; + // Give the HardwareMonitor access to the `SledAgent` let sled_agent = sled_agent_server.sled_agent(); - - // We've created sled-agent; we need to (possibly) reconfigure the - // switch zone, if we're a scrimlet, to give it our underlay network - // information. - let underlay_network_info = sled_agent.switch_zone_underlay_info(); - info!( - startup_log, "Sled Agent started; rescanning hardware"; - "underlay_network_info" => ?underlay_network_info, - ); - managers - .check_latest_hardware_snapshot(Some(&sled_agent), &startup_log) - .await; + sled_agent_started_tx + .send(sled_agent.clone()) + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); // For cold boot specifically, we now need to load the services // we're responsible for, while continuing to handle hardware // notifications. This cannot fail: we retry indefinitely until // we're done loading services.
- wait_while_handling_hardware_updates( - sled_agent.load_services(), - &mut hardware_monitor, - &managers, - Some(&sled_agent), - &startup_log, - "restoring sled-agent services (cold boot)", - ) - .await; - + sled_agent.load_services().await; SledAgentState::ServerStarted(sled_agent_server) } else { - SledAgentState::Bootstrapping + SledAgentState::Bootstrapping( + Some(sled_agent_started_tx), + Some(underlay_available_tx), + ) }; // Spawn our inner task that handles any future hardware updates and any @@ -340,15 +273,13 @@ impl Server { // agent state. let inner = Inner { config, - hardware_monitor, state, sled_init_rx, sled_reset_rx, - managers, ddm_admin_localhost_client, - bootstore_handles, + long_running_task_handles, + service_manager, _sprockets_server_handle: sprockets_server_handle, - _key_manager_handle: key_manager_handle, base_log, }; let inner_task = tokio::spawn(inner.run()); @@ -377,20 +308,14 @@ impl Server { // bootstrap server). enum SledAgentState { // We're still in the bootstrapping phase, waiting for a sled-agent request. - Bootstrapping, + Bootstrapping( + Option>, + Option>, + ), // ... or the sled agent server is running. 
ServerStarted(SledAgentServer), } -impl SledAgentState { - fn sled_agent(&self) -> Option<&SledAgent> { - match self { - SledAgentState::Bootstrapping => None, - SledAgentState::ServerStarted(server) => Some(server.sled_agent()), - } - } -} - #[derive(thiserror::Error, Debug)] pub enum SledAgentServerStartError { #[error("Failed to start sled-agent server: {0}")] @@ -425,11 +350,13 @@ impl From for StartError { } } +#[allow(clippy::too_many_arguments)] async fn start_sled_agent( config: &SledConfig, request: StartSledAgentRequest, - bootstore: &bootstore::NodeHandle, - managers: &BootstrapManagers, + long_running_task_handles: LongRunningTaskHandles, + underlay_available_tx: oneshot::Sender, + service_manager: ServiceManager, ddmd_client: &DdmAdminClient, base_log: &Logger, log: &Logger, @@ -444,7 +371,10 @@ async fn start_sled_agent( if request.body.use_trust_quorum { info!(log, "KeyManager: using lrtq secret retriever"); let salt = request.hash_rack_id(); - LrtqOrHardcodedSecretRetriever::init_lrtq(salt, bootstore.clone()) + LrtqOrHardcodedSecretRetriever::init_lrtq( + salt, + long_running_task_handles.bootstore.clone(), + ) } else { info!(log, "KeyManager: using hardcoded secret retriever"); LrtqOrHardcodedSecretRetriever::init_hardcoded(); @@ -452,7 +382,7 @@ async fn start_sled_agent( if request.body.use_trust_quorum && request.body.is_lrtq_learner { info!(log, "Initializing sled as learner"); - match bootstore.init_learner().await { + match long_running_task_handles.bootstore.init_learner().await { Err(bootstore::NodeRequestError::Fsm( bootstore::ApiError::AlreadyInitialized, )) => { @@ -464,7 +394,7 @@ async fn start_sled_agent( } // Inform the storage service that the key manager is available - managers.storage.key_manager_ready().await; + long_running_task_handles.storage_manager.key_manager_ready().await; // Start trying to notify ddmd of our sled prefix so it can // advertise it to other sleds. 
@@ -484,9 +414,9 @@ async fn start_sled_agent( config, base_log.clone(), request.clone(), - managers.service.clone(), - managers.storage.clone(), - bootstore.clone(), + long_running_task_handles.clone(), + service_manager, + underlay_available_tx, ) .await .map_err(SledAgentServerStartError::FailedStartingServer)?; @@ -495,7 +425,8 @@ async fn start_sled_agent( // Record this request so the sled agent can be automatically // initialized on the next boot. - let paths = sled_config_paths(managers.storage.resources()).await?; + let paths = + sled_config_paths(&long_running_task_handles.storage_manager).await?; let mut ledger = Ledger::new_with(&log, paths, request); ledger.commit().await?; @@ -534,28 +465,6 @@ fn start_dropshot_server( Ok(http_server) } -/// Wait for at least the M.2 we booted from to show up. -/// -/// TODO-correctness Subsequent steps may assume all M.2s that will ever be -/// present are present once we return from this function; see -/// . -async fn wait_for_boot_m2(storage_resources: &StorageResources, log: &Logger) { - // Wait for at least the M.2 we booted from to show up. 
- loop { - match storage_resources.boot_disk().await { - Some(disk) => { - info!(log, "Found boot disk M.2: {disk:?}"); - break; - } - None => { - info!(log, "Waiting for boot disk M.2..."); - tokio::time::sleep(core::time::Duration::from_millis(250)) - .await; - } - } - } -} - struct MissingM2Paths(&'static str); impl From for StartError { @@ -571,56 +480,21 @@ impl From for SledAgentServerStartError { } async fn sled_config_paths( - storage: &StorageResources, + storage: &StorageHandle, ) -> Result, MissingM2Paths> { - let paths: Vec<_> = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) - .await + let resources = storage.get_latest_resources().await; + let paths: Vec<_> = resources + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(SLED_AGENT_REQUEST_FILE)) .collect(); if paths.is_empty() { - return Err(MissingM2Paths(sled_hardware::disk::CONFIG_DATASET)); + return Err(MissingM2Paths(CONFIG_DATASET)); } Ok(paths) } -// Helper function to wait for `fut` while handling any updates about hardware. -async fn wait_while_handling_hardware_updates, T>( - fut: F, - hardware_monitor: &mut broadcast::Receiver, - managers: &BootstrapManagers, - sled_agent: Option<&SledAgent>, - log: &Logger, - log_phase: &str, -) -> T { - tokio::pin!(fut); - loop { - tokio::select! { - // Cancel-safe per the docs on `broadcast::Receiver::recv()`. - hardware_update = hardware_monitor.recv() => { - info!( - log, - "Handling hardware update message"; - "phase" => log_phase, - "update" => ?hardware_update, - ); - - managers.handle_hardware_update( - hardware_update, - sled_agent, - log, - ).await; - } - - // Cancel-safe: we're using a `&mut Future`; dropping the - // reference does not cancel the underlying future. - result = &mut fut => return result, - } - } -} - /// Runs the OpenAPI generator, emitting the spec to stdout. 
pub fn run_openapi() -> Result<(), String> { http_api() @@ -634,18 +508,16 @@ pub fn run_openapi() -> Result<(), String> { struct Inner { config: SledConfig, - hardware_monitor: broadcast::Receiver, state: SledAgentState, sled_init_rx: mpsc::Receiver<( StartSledAgentRequest, oneshot::Sender>, )>, sled_reset_rx: mpsc::Receiver>>, - managers: BootstrapManagers, ddm_admin_localhost_client: DdmAdminClient, - bootstore_handles: BootstoreHandles, + long_running_task_handles: LongRunningTaskHandles, + service_manager: ServiceManager, _sprockets_server_handle: JoinHandle<()>, - _key_manager_handle: JoinHandle<()>, base_log: Logger, } @@ -653,14 +525,7 @@ impl Inner { async fn run(mut self) { let log = self.base_log.new(o!("component" => "SledAgentMain")); loop { - // TODO-correctness We pause handling hardware update messages while - // we handle sled init/reset requests - is that okay? tokio::select! { - // Cancel-safe per the docs on `broadcast::Receiver::recv()`. - hardware_update = self.hardware_monitor.recv() => { - self.handle_hardware_update(hardware_update, &log).await; - } - // Cancel-safe per the docs on `mpsc::Receiver::recv()`. 
Some((request, response_tx)) = self.sled_init_rx.recv() => { self.handle_start_sled_agent_request( @@ -688,41 +553,36 @@ impl Inner { } } - async fn handle_hardware_update( - &self, - hardware_update: Result, - log: &Logger, - ) { - info!( - log, - "Handling hardware update message"; - "phase" => "bootstore-steady-state", - "update" => ?hardware_update, - ); - - self.managers - .handle_hardware_update( - hardware_update, - self.state.sled_agent(), - &log, - ) - .await; - } - async fn handle_start_sled_agent_request( &mut self, request: StartSledAgentRequest, response_tx: oneshot::Sender>, log: &Logger, ) { - let request_id = request.body.id; - match &self.state { - SledAgentState::Bootstrapping => { + match &mut self.state { + SledAgentState::Bootstrapping( + sled_agent_started_tx, + underlay_available_tx, + ) => { + let request_id = request.body.id; + + // Extract from options to satisfy the borrow checker. + // It is not possible for `start_sled_agent` to be cancelled + // or fail in a safe, restartable manner. Therefore, for now, + // we explicitly unwrap here, and panic on error below. + // + // See https://github.com/oxidecomputer/omicron/issues/4494 + let sled_agent_started_tx = + sled_agent_started_tx.take().unwrap(); + let underlay_available_tx = + underlay_available_tx.take().unwrap(); + let response = match start_sled_agent( &self.config, request, - &self.bootstore_handles.node_handle, - &self.managers, + self.long_running_task_handles.clone(), + underlay_available_tx, + self.service_manager.clone(), &self.ddm_admin_localhost_client, &self.base_log, &log, @@ -733,17 +593,19 @@ impl Inner { // We've created sled-agent; we need to (possibly) // reconfigure the switch zone, if we're a scrimlet, to // give it our underlay network information. 
- self.managers - .check_latest_hardware_snapshot( - Some(server.sled_agent()), - log, - ) - .await; - + sled_agent_started_tx + .send(server.sled_agent().clone()) + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); self.state = SledAgentState::ServerStarted(server); Ok(SledAgentResponse { id: request_id }) } - Err(err) => Err(format!("{err:#}")), + Err(err) => { + // This error is unrecoverable, and if returned we'd + // end up in maintenance mode anyway. + error!(log, "Failed to start sled agent: {err:#}"); + panic!("Failed to start sled agent"); + } }; _ = response_tx.send(response); } @@ -787,11 +649,11 @@ impl Inner { async fn uninstall_sled_local_config(&self) -> Result<(), BootstrapError> { let config_dirs = self - .managers - .storage - .resources() - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .long_running_task_handles + .storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter(); for dir in config_dirs { diff --git a/sled-agent/src/storage/dump_setup.rs b/sled-agent/src/dump_setup.rs similarity index 93% rename from sled-agent/src/storage/dump_setup.rs rename to sled-agent/src/dump_setup.rs index 9b5edc0a7e..e675e6e12d 100644 --- a/sled-agent/src/storage/dump_setup.rs +++ b/sled-agent/src/dump_setup.rs @@ -1,4 +1,3 @@ -use crate::storage_manager::DiskWrapper; use camino::Utf8PathBuf; use derive_more::{AsRef, Deref, From}; use illumos_utils::dumpadm::DumpAdmError; @@ -6,13 +5,15 @@ use illumos_utils::zone::{AdmError, Zones}; use illumos_utils::zpool::{ZpoolHealth, ZpoolName}; use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; +use sled_storage::dataset::{CRASH_DATASET, DUMP_DATASET}; +use sled_storage::disk::Disk; +use sled_storage::pool::Pool; use slog::Logger; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashSet}; use std::ffi::OsString; use std::path::{Path, PathBuf}; use std::sync::{Arc, Weak}; use std::time::{Duration, 
SystemTime, SystemTimeError, UNIX_EPOCH}; -use tokio::sync::MutexGuard; pub struct DumpSetup { worker: Arc>, @@ -70,11 +71,11 @@ trait GetMountpoint: std::ops::Deref { } impl GetMountpoint for DebugZpool { type NewType = DebugDataset; - const MOUNTPOINT: &'static str = sled_hardware::disk::DUMP_DATASET; + const MOUNTPOINT: &'static str = DUMP_DATASET; } impl GetMountpoint for CoreZpool { type NewType = CoreDataset; - const MOUNTPOINT: &'static str = sled_hardware::disk::CRASH_DATASET; + const MOUNTPOINT: &'static str = CRASH_DATASET; } struct DumpSetupWorker { @@ -99,50 +100,51 @@ const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); impl DumpSetup { pub(crate) async fn update_dumpdev_setup( &self, - disks: &mut MutexGuard<'_, HashMap>, + disks: &BTreeMap, ) { let log = &self.log; let mut m2_dump_slices = Vec::new(); let mut u2_debug_datasets = Vec::new(); let mut m2_core_datasets = Vec::new(); - for (_id, disk_wrapper) in disks.iter() { - match disk_wrapper { - DiskWrapper::Real { disk, .. 
} => match disk.variant() { - DiskVariant::M2 => { - match disk.dump_device_devfs_path(false) { - Ok(path) => { - m2_dump_slices.push(DumpSlicePath(path)) - } - Err(err) => { - warn!(log, "Error getting dump device devfs path: {err:?}"); - } + for (_id, (disk, _)) in disks.iter() { + if disk.is_synthetic() { + // We only setup dump devices on real disks + continue; + } + match disk.variant() { + DiskVariant::M2 => { + match disk.dump_device_devfs_path(false) { + Ok(path) => m2_dump_slices.push(DumpSlicePath(path)), + Err(err) => { + warn!( + log, + "Error getting dump device devfs path: {err:?}" + ); } - let name = disk.zpool_name(); - if let Ok(info) = illumos_utils::zpool::Zpool::get_info( - &name.to_string(), - ) { - if info.health() == ZpoolHealth::Online { - m2_core_datasets.push(CoreZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); - } + } + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + { + if info.health() == ZpoolHealth::Online { + m2_core_datasets.push(CoreZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); } } - DiskVariant::U2 => { - let name = disk.zpool_name(); - if let Ok(info) = illumos_utils::zpool::Zpool::get_info( - &name.to_string(), - ) { - if info.health() == ZpoolHealth::Online { - u2_debug_datasets - .push(DebugZpool(name.clone())); - } else { - warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); - } + } + DiskVariant::U2 => { + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + { + if info.health() == ZpoolHealth::Online { + u2_debug_datasets.push(DebugZpool(name.clone())); + } else { + warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); } } - }, - DiskWrapper::Synthetic { .. 
} => {} + } } } diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs new file mode 100644 index 0000000000..698d2d4608 --- /dev/null +++ b/sled-agent/src/hardware_monitor.rs @@ -0,0 +1,257 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A task that listens for hardware events from the +//! [`sled_hardware::HardwareManager`] and dispatches them to other parts +//! of the bootstrap agent and sled-agent code. + +use crate::services::ServiceManager; +use crate::sled_agent::SledAgent; +use sled_hardware::{Baseboard, HardwareManager, HardwareUpdate}; +use sled_storage::disk::RawDisk; +use sled_storage::manager::StorageHandle; +use slog::Logger; +use tokio::sync::broadcast::error::RecvError; +use tokio::sync::{broadcast, oneshot}; + +// A thin wrapper around the [`ServiceManager`] that caches the state +// whether or not the tofino is loaded if the [`ServiceManager`] doesn't exist +// yet. +enum TofinoManager { + Ready(ServiceManager), + NotReady { tofino_loaded: bool }, +} + +impl TofinoManager { + pub fn new() -> TofinoManager { + TofinoManager::NotReady { tofino_loaded: false } + } + + // Must only be called once on the transition from `NotReady` to `Ready`. + // Panics otherwise.
+ // + // Returns whether the tofino was loaded or not + pub fn become_ready(&mut self, service_manager: ServiceManager) -> bool { + let tofino_loaded = match self { + Self::Ready(_) => panic!("ServiceManager is already available"), + Self::NotReady { tofino_loaded } => *tofino_loaded, + }; + *self = Self::Ready(service_manager); + tofino_loaded + } + + pub fn is_ready(&self) -> bool { + match self { + TofinoManager::Ready(_) => true, + _ => false, + } + } +} + +// A monitor for hardware events +pub struct HardwareMonitor { + log: Logger, + + baseboard: Baseboard, + + // Receive a onetime notification that the SledAgent has started + sled_agent_started_rx: oneshot::Receiver, + + // Receive a onetime notification that the ServiceManager is ready + service_manager_ready_rx: oneshot::Receiver, + + // Receive messages from the [`HardwareManager`] + hardware_rx: broadcast::Receiver, + + // A reference to the hardware manager + hardware_manager: HardwareManager, + + // A handle to [`sled_storage::manager::StorageManager`] + storage_manager: StorageHandle, + + // A handle to the sled-agent + // + // This will go away once Nexus updates are polled: + // See: + // * https://github.com/oxidecomputer/omicron/issues/1917 + // * https://rfd.shared.oxide.computer/rfd/0433 + sled_agent: Option, + + // The [`ServiceManager`] is instantiated after we start the [`HardwareMonitor`] + // task. However, it is only used to load and unload the switch zone when the + // state of the tofino changes. We keep track of the tofino state so that we + // can properly load the tofino when the [`ServiceManager`] becomes + // available.
+ tofino_manager: TofinoManager, +} + +impl HardwareMonitor { + pub fn new( + log: &Logger, + hardware_manager: &HardwareManager, + storage_manager: &StorageHandle, + ) -> ( + HardwareMonitor, + oneshot::Sender, + oneshot::Sender, + ) { + let (sled_agent_started_tx, sled_agent_started_rx) = oneshot::channel(); + let (service_manager_ready_tx, service_manager_ready_rx) = + oneshot::channel(); + let baseboard = hardware_manager.baseboard(); + let hardware_rx = hardware_manager.monitor(); + let log = log.new(o!("component" => "HardwareMonitor")); + let tofino_manager = TofinoManager::new(); + ( + HardwareMonitor { + log, + baseboard, + sled_agent_started_rx, + service_manager_ready_rx, + hardware_rx, + hardware_manager: hardware_manager.clone(), + storage_manager: storage_manager.clone(), + sled_agent: None, + tofino_manager, + }, + sled_agent_started_tx, + service_manager_ready_tx, + ) + } + + /// Run the main receive loop of the `HardwareMonitor` + /// + /// This should be spawned into a tokio task + pub async fn run(&mut self) { + // Check the latest hardware snapshot; we could have missed events + // between the creation of the hardware manager and our subscription of + // its monitor. + self.check_latest_hardware_snapshot().await; + + loop { + tokio::select! 
{ + Ok(sled_agent) = &mut self.sled_agent_started_rx, + if self.sled_agent.is_none() => + { + info!(self.log, "Sled Agent Started"); + self.sled_agent = Some(sled_agent); + self.check_latest_hardware_snapshot().await; + } + Ok(service_manager) = &mut self.service_manager_ready_rx, + if !self.tofino_manager.is_ready() => + { + let tofino_loaded = + self.tofino_manager.become_ready(service_manager); + if tofino_loaded { + self.activate_switch().await; + } + } + update = self.hardware_rx.recv() => { + info!( + self.log, + "Received hardware update message"; + "update" => ?update, + ); + self.handle_hardware_update(update).await; + } + } + } + } + + // Handle an update from the [`HardwareManager`] + async fn handle_hardware_update( + &mut self, + update: Result, + ) { + match update { + Ok(update) => match update { + HardwareUpdate::TofinoLoaded => self.activate_switch().await, + HardwareUpdate::TofinoUnloaded => { + self.deactivate_switch().await + } + HardwareUpdate::TofinoDeviceChange => { + if let Some(sled_agent) = &mut self.sled_agent { + sled_agent.notify_nexus_about_self(&self.log); + } + } + HardwareUpdate::DiskAdded(disk) => { + self.storage_manager.upsert_disk(disk.into()).await; + } + HardwareUpdate::DiskRemoved(disk) => { + self.storage_manager.delete_disk(disk.into()).await; + } + }, + Err(broadcast::error::RecvError::Lagged(count)) => { + warn!(self.log, "Hardware monitor missed {count} messages"); + self.check_latest_hardware_snapshot().await; + } + Err(broadcast::error::RecvError::Closed) => { + // The `HardwareManager` monitoring task is an infinite loop - + // the only way for us to get `Closed` here is if it panicked, + // so we will propagate such a panic.
+ panic!("Hardware manager monitor task panicked"); + } + } + } + + async fn activate_switch(&mut self) { + match &mut self.tofino_manager { + TofinoManager::Ready(service_manager) => { + if let Err(e) = service_manager + .activate_switch( + self.sled_agent + .as_ref() + .map(|sa| sa.switch_zone_underlay_info()), + self.baseboard.clone(), + ) + .await + { + warn!(self.log, "Failed to activate switch: {e}"); + } + } + TofinoManager::NotReady { tofino_loaded } => { + *tofino_loaded = true; + } + } + } + + async fn deactivate_switch(&mut self) { + match &mut self.tofino_manager { + TofinoManager::Ready(service_manager) => { + if let Err(e) = service_manager.deactivate_switch().await { + warn!(self.log, "Failed to deactivate switch: {e}"); + } + } + TofinoManager::NotReady { tofino_loaded } => { + *tofino_loaded = false; + } + } + } + + // Observe the current hardware state manually. + // + // We use this when we're monitoring hardware for the first + // time, and if we miss notifications. + async fn check_latest_hardware_snapshot(&mut self) { + let underlay_network = self.sled_agent.as_ref().map(|sled_agent| { + sled_agent.notify_nexus_about_self(&self.log); + sled_agent.switch_zone_underlay_info() + }); + info!( + self.log, "Checking current full hardware snapshot"; + "underlay_network_info" => ?underlay_network, + ); + if self.hardware_manager.is_scrimlet_driver_loaded() { + self.activate_switch().await; + } else { + self.deactivate_switch().await; + } + + self.storage_manager + .ensure_using_exactly_these_disks( + self.hardware_manager.disks().into_iter().map(RawDisk::from), + ) + .await; + } +} diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 836790e190..9c3a079dac 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -349,7 +349,7 @@ async fn zpools_get( rqctx: RequestContext, ) -> Result>, HttpError> { let sa = rqctx.context(); - Ok(HttpResponseOk(sa.zpools_get().await.map_err(|e| 
Error::from(e))?)) + Ok(HttpResponseOk(sa.zpools_get().await)) } #[endpoint { diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 6db3b11740..f030078761 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -17,7 +17,6 @@ use crate::params::{ InstanceMigrationTargetParams, InstanceStateRequested, VpcFirewallRule, }; use crate::profile::*; -use crate::storage_manager::StorageResources; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use anyhow::anyhow; @@ -42,7 +41,8 @@ use omicron_common::backoff; use propolis_client::Client as PropolisClient; use rand::prelude::SliceRandom; use rand::SeedableRng; -use sled_hardware::disk::ZONE_DATASET; +use sled_storage::dataset::ZONE_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::net::IpAddr; use std::net::{SocketAddr, SocketAddrV6}; @@ -225,7 +225,7 @@ struct InstanceInner { nexus_client: NexusClientWithResolver, // Storage resources - storage: StorageResources, + storage: StorageHandle, // Object used to collect zone bundles from this instance when terminated. zone_bundler: ZoneBundler, @@ -899,8 +899,9 @@ impl Instance { let mut rng = rand::rngs::StdRng::from_entropy(); let root = inner .storage - .all_u2_mountpoints(ZONE_DATASET) + .get_latest_resources() .await + .all_u2_mountpoints(ZONE_DATASET) .choose(&mut rng) .ok_or_else(|| Error::U2NotFound)? 
.clone(); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 4b430812e1..fa40a876f0 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -12,7 +12,6 @@ use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, }; -use crate::storage_manager::StorageResources; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use illumos_utils::dladm::Etherstub; @@ -23,6 +22,7 @@ use omicron_common::api::external::ByteCount; use omicron_common::api::internal::nexus::InstanceRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::api::internal::nexus::VmmRuntimeState; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::net::SocketAddr; @@ -74,7 +74,7 @@ struct InstanceManagerInternal { vnic_allocator: VnicAllocator, port_manager: PortManager, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, } @@ -82,7 +82,7 @@ pub(crate) struct InstanceManagerServices { pub nexus_client: NexusClientWithResolver, pub vnic_allocator: VnicAllocator, pub port_manager: PortManager, - pub storage: StorageResources, + pub storage: StorageHandle, pub zone_bundler: ZoneBundler, } @@ -98,7 +98,7 @@ impl InstanceManager { nexus_client: NexusClientWithResolver, etherstub: Etherstub, port_manager: PortManager, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, ) -> Result { Ok(InstanceManager { diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index d812136192..d77ec7a3c0 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -20,9 +20,12 @@ pub mod common; mod backing_fs; pub mod bootstrap; pub mod config; +pub(crate) mod dump_setup; +pub(crate) mod hardware_monitor; mod http_entrypoints; mod instance; mod instance_manager; +mod long_running_tasks; mod 
metrics; mod nexus; pub mod params; @@ -33,8 +36,7 @@ pub mod services; pub mod services_migration; mod sled_agent; mod smf_helper; -pub(crate) mod storage; -mod storage_manager; +mod storage_monitor; mod swap_device; mod updates; mod zone_bundle; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs new file mode 100644 index 0000000000..f4a665c098 --- /dev/null +++ b/sled-agent/src/long_running_tasks.rs @@ -0,0 +1,241 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! This module is responsible for spawning, starting, and managing long-running +//! tasks and task-driven subsystems. These tasks run for the remainder of the +//! sled-agent process from the moment they begin. Primarily they include the +//! "managers", like `StorageManager`, `InstanceManager`, etc., and are used +//! by both the bootstrap agent and the sled-agent. +//! +//! We don't bother keeping track of the spawned tasks' handles because we know +//! these tasks are supposed to run forever, and they can shut down if their +//! handles are dropped. 
+ +use crate::bootstrap::bootstore_setup::{ + new_bootstore_config, poll_ddmd_for_bootstore_peer_update, +}; +use crate::bootstrap::secret_retriever::LrtqOrHardcodedSecretRetriever; +use crate::config::Config; +use crate::hardware_monitor::HardwareMonitor; +use crate::services::ServiceManager; +use crate::sled_agent::SledAgent; +use crate::storage_monitor::{StorageMonitor, UnderlayAccess}; +use crate::zone_bundle::{CleanupContext, ZoneBundler}; +use bootstore::schemes::v0 as bootstore; +use key_manager::{KeyManager, StorageKeyRequester}; +use sled_hardware::{HardwareManager, SledMode}; +use sled_storage::disk::SyntheticDisk; +use sled_storage::manager::{StorageHandle, StorageManager}; +use slog::{info, Logger}; +use std::net::Ipv6Addr; +use tokio::sync::oneshot; + +/// A mechanism for interacting with all long running tasks that can be shared +/// between the bootstrap-agent and sled-agent code. +#[derive(Clone)] +pub struct LongRunningTaskHandles { + /// A mechanism for retrieving storage keys. This interacts with the + /// [`KeyManager`] task. In the future, there may be other handles for + /// retrieving different types of keys. Separating the handles limits the + /// access for a given key type to the code that holds the handle. + pub storage_key_requester: StorageKeyRequester, + + /// A mechanism for talking to the [`StorageManager`] which is responsible + /// for establishing zpools on disks and managing their datasets. 
+ pub storage_manager: StorageHandle, + + /// A mechanism for interacting with the hardware device tree + pub hardware_manager: HardwareManager, + + // A handle for interacting with the bootstore + pub bootstore: bootstore::NodeHandle, + + // A reference to the object used to manage zone bundles + pub zone_bundler: ZoneBundler, +} + +/// Spawn all long running tasks +pub async fn spawn_all_longrunning_tasks( + log: &Logger, + sled_mode: SledMode, + global_zone_bootstrap_ip: Ipv6Addr, + config: &Config, +) -> ( + LongRunningTaskHandles, + oneshot::Sender, + oneshot::Sender, + oneshot::Sender, +) { + let storage_key_requester = spawn_key_manager(log); + let mut storage_manager = + spawn_storage_manager(log, storage_key_requester.clone()); + + let underlay_available_tx = + spawn_storage_monitor(log, storage_manager.clone()); + + let hardware_manager = spawn_hardware_manager(log, sled_mode).await; + + // Start monitoring for hardware changes + let (sled_agent_started_tx, service_manager_ready_tx) = + spawn_hardware_monitor(log, &hardware_manager, &storage_manager); + + // Add some synthetic disks if necessary. 
+ upsert_synthetic_zpools_if_needed(&log, &storage_manager, &config).await; + + // Wait for the boot disk so that we can work with any ledgers, + // such as those needed by the bootstore and sled-agent + info!(log, "Waiting for boot disk"); + let (disk_id, _) = storage_manager.wait_for_boot_disk().await; + info!(log, "Found boot disk {:?}", disk_id); + + let bootstore = spawn_bootstore_tasks( + log, + &mut storage_manager, + &hardware_manager, + global_zone_bootstrap_ip, + ) + .await; + + let zone_bundler = spawn_zone_bundler_tasks(log, &mut storage_manager); + + ( + LongRunningTaskHandles { + storage_key_requester, + storage_manager, + hardware_manager, + bootstore, + zone_bundler, + }, + sled_agent_started_tx, + service_manager_ready_tx, + underlay_available_tx, + ) +} + +fn spawn_key_manager(log: &Logger) -> StorageKeyRequester { + info!(log, "Starting KeyManager"); + let secret_retriever = LrtqOrHardcodedSecretRetriever::new(); + let (mut key_manager, storage_key_requester) = + KeyManager::new(log, secret_retriever); + tokio::spawn(async move { key_manager.run().await }); + storage_key_requester +} + +fn spawn_storage_manager( + log: &Logger, + key_requester: StorageKeyRequester, +) -> StorageHandle { + info!(log, "Starting StorageManager"); + let (manager, handle) = StorageManager::new(log, key_requester); + tokio::spawn(async move { + manager.run().await; + }); + handle +} + +fn spawn_storage_monitor( + log: &Logger, + storage_handle: StorageHandle, +) -> oneshot::Sender { + info!(log, "Starting StorageMonitor"); + let (storage_monitor, underlay_available_tx) = + StorageMonitor::new(log, storage_handle); + tokio::spawn(async move { + storage_monitor.run().await; + }); + underlay_available_tx +} + +async fn spawn_hardware_manager( + log: &Logger, + sled_mode: SledMode, +) -> HardwareManager { + // The `HardwareManager` does not use the "task/handle" pattern + // and spawns its worker task inside `HardwareManager::new`.
Instead of returning + // a handle to send messages to that task, the "Inner/Mutex" pattern is used + // which shares data between the task, the manager itself, and the users of the manager + // since the manager can be freely cloned and passed around. + // + // There are pros and cons to both methods, but the reason to mention it here is that + // the handle in this case is the `HardwareManager` itself. + info!(log, "Starting HardwareManager"; "sled_mode" => ?sled_mode); + let log = log.clone(); + tokio::task::spawn_blocking(move || { + HardwareManager::new(&log, sled_mode).unwrap() + }) + .await + .unwrap() +} + +fn spawn_hardware_monitor( + log: &Logger, + hardware_manager: &HardwareManager, + storage_handle: &StorageHandle, +) -> (oneshot::Sender, oneshot::Sender) { + info!(log, "Starting HardwareMonitor"); + let (mut monitor, sled_agent_started_tx, service_manager_ready_tx) = + HardwareMonitor::new(log, hardware_manager, storage_handle); + tokio::spawn(async move { + monitor.run().await; + }); + (sled_agent_started_tx, service_manager_ready_tx) +} + +async fn spawn_bootstore_tasks( + log: &Logger, + storage_handle: &mut StorageHandle, + hardware_manager: &HardwareManager, + global_zone_bootstrap_ip: Ipv6Addr, +) -> bootstore::NodeHandle { + let storage_resources = storage_handle.get_latest_resources().await; + let config = new_bootstore_config( + &storage_resources, + hardware_manager.baseboard(), + global_zone_bootstrap_ip, + ) + .unwrap(); + + // Create and spawn the bootstore + info!(log, "Starting Bootstore"); + let (mut node, node_handle) = bootstore::Node::new(config, log).await; + tokio::spawn(async move { node.run().await }); + + // Spawn a task for polling DDMD and updating bootstore with peer addresses + info!(log, "Starting Bootstore DDMD poller"); + let log = log.new(o!("component" => "bootstore_ddmd_poller")); + let node_handle2 = node_handle.clone(); + tokio::spawn(async move { + poll_ddmd_for_bootstore_peer_update(log, node_handle2).await + }); 
+ + node_handle +} + +// `ZoneBundler::new` spawns a periodic cleanup task that runs indefinitely +fn spawn_zone_bundler_tasks( + log: &Logger, + storage_handle: &mut StorageHandle, +) -> ZoneBundler { + info!(log, "Starting ZoneBundler related tasks"); + let log = log.new(o!("component" => "ZoneBundler")); + ZoneBundler::new(log, storage_handle.clone(), CleanupContext::default()) +} + +async fn upsert_synthetic_zpools_if_needed( + log: &Logger, + storage_manager: &StorageHandle, + config: &Config, +) { + if let Some(pools) = &config.zpools { + for pool in pools { + info!( + log, + "Upserting synthetic zpool to Storage Manager: {}", + pool.to_string() + ); + let disk = SyntheticDisk::new(pool.clone()).into(); + storage_manager.upsert_disk(disk).await; + } + } +} diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 88f567ceed..138a539fdc 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -2,7 +2,6 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use crate::storage::dataset::DatasetName; use crate::zone_bundle::PriorityOrder; pub use crate::zone_bundle::ZoneBundleCause; pub use crate::zone_bundle::ZoneBundleId; @@ -22,6 +21,8 @@ use omicron_common::api::internal::shared::{ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; pub use sled_hardware::DendriteAsic; +use sled_storage::dataset::DatasetKind; +use sled_storage::dataset::DatasetName; use std::fmt::{Debug, Display, Formatter, Result as FormatResult}; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::str::FromStr; @@ -230,50 +231,6 @@ pub struct Zpool { pub disk_type: DiskType, } -/// The type of a dataset, and an auxiliary information necessary -/// to successfully launch a zone managing the associated data. 
-#[derive( - Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, -)] -#[serde(tag = "type", rename_all = "snake_case")] -pub enum DatasetKind { - CockroachDb, - Crucible, - Clickhouse, - ClickhouseKeeper, - ExternalDns, - InternalDns, -} - -impl From for nexus_client::types::DatasetKind { - fn from(k: DatasetKind) -> Self { - use DatasetKind::*; - match k { - CockroachDb => Self::Cockroach, - Crucible => Self::Crucible, - Clickhouse => Self::Clickhouse, - ClickhouseKeeper => Self::ClickhouseKeeper, - ExternalDns => Self::ExternalDns, - InternalDns => Self::InternalDns, - } - } -} - -impl std::fmt::Display for DatasetKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use DatasetKind::*; - let s = match self { - Crucible => "crucible", - CockroachDb { .. } => "cockroachdb", - Clickhouse => "clickhouse", - ClickhouseKeeper => "clickhouse_keeper", - ExternalDns { .. } => "external_dns", - InternalDns { .. } => "internal_dns", - }; - write!(f, "{}", s) - } -} - /// The type of zone that Sled Agent may run #[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 1d85b422ed..d1ed372bc9 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -5,12 +5,8 @@ //! Plan generation for "where should services be initialized". 
use crate::bootstrap::params::StartSledAgentRequest; -use crate::params::{ - DatasetKind, OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, -}; +use crate::params::{OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType}; use crate::rack_setup::config::SetupServiceConfig as Config; -use crate::storage::dataset::DatasetName; -use crate::storage_manager::StorageResources; use camino::Utf8PathBuf; use dns_service_client::types::DnsConfigParams; use illumos_utils::zpool::ZpoolName; @@ -35,6 +31,8 @@ use serde::{Deserialize, Serialize}; use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; +use sled_storage::dataset::{DatasetKind, DatasetName, CONFIG_DATASET}; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::{BTreeSet, HashMap, HashSet}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV6}; @@ -98,18 +96,12 @@ pub enum PlanError { NotEnoughSleds, } -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +#[derive(Clone, Debug, Default, Serialize, Deserialize, JsonSchema)] pub struct SledConfig { /// zones configured for this sled pub zones: Vec, } -impl Default for SledConfig { - fn default() -> Self { - SledConfig { zones: Vec::new() } - } -} - #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct Plan { pub services: HashMap, @@ -127,11 +119,12 @@ const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan.json"; impl Plan { pub async fn load( log: &Logger, - storage: &StorageResources, + storage_manager: &StorageHandle, ) -> Result, PlanError> { - let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_SERVICE_PLAN_FILENAME)) .collect(); @@ -659,7 +652,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, - storage: &StorageResources, + storage_manager: 
&StorageHandle, sleds: &HashMap, ) -> Result { // Load the information we need about each Sled to be able to allocate @@ -691,9 +684,10 @@ impl Plan { let plan = Self::create_transient(config, sled_info)?; // Once we've constructed a plan, write it down to durable storage. - let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_SERVICE_PLAN_FILENAME)) .collect(); diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index 163b24cd45..07f33893fc 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -9,11 +9,12 @@ use crate::bootstrap::{ config::BOOTSTRAP_AGENT_RACK_INIT_PORT, params::StartSledAgentRequest, }; use crate::rack_setup::config::SetupServiceConfig as Config; -use crate::storage_manager::StorageResources; use camino::Utf8PathBuf; use omicron_common::ledger::{self, Ledger, Ledgerable}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::{HashMap, HashSet}; use std::net::{Ipv6Addr, SocketAddrV6}; @@ -55,11 +56,12 @@ pub struct Plan { impl Plan { pub async fn load( log: &Logger, - storage: &StorageResources, + storage: &StorageHandle, ) -> Result, PlanError> { let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_SLED_PLAN_FILENAME)) .collect(); @@ -78,7 +80,7 @@ impl Plan { pub async fn create( log: &Logger, config: &Config, - storage: &StorageResources, + storage_manager: &StorageHandle, bootstrap_addrs: HashSet, use_trust_quorum: bool, ) -> Result { @@ -123,9 +125,10 @@ impl Plan { let plan = Self { rack_id, sleds, config: config.clone() }; 
// Once we've constructed a plan, write it down to durable storage. - let paths: Vec = storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_SLED_PLAN_FILENAME)) .collect(); diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index ed9cca5fc6..1db7f65a06 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -74,7 +74,6 @@ use crate::rack_setup::plan::service::{ use crate::rack_setup::plan::sled::{ Plan as SledPlan, PlanError as SledPlanError, }; -use crate::storage_manager::StorageResources; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; @@ -95,6 +94,8 @@ use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, }; use sled_hardware::underlay::BootstrapInterface; +use sled_storage::dataset::CONFIG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeSet; use std::collections::{HashMap, HashSet}; @@ -188,7 +189,7 @@ impl RackSetupService { pub(crate) fn new( log: Logger, config: Config, - storage_resources: StorageResources, + storage_manager: StorageHandle, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, ) -> Self { @@ -197,7 +198,7 @@ impl RackSetupService { if let Err(e) = svc .run( &config, - &storage_resources, + &storage_manager, local_bootstrap_agent, bootstore, ) @@ -776,7 +777,7 @@ impl ServiceInner { async fn run( &self, config: &Config, - storage_resources: &StorageResources, + storage_manager: &StorageHandle, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, ) -> Result<(), SetupServiceError> { @@ -787,9 +788,10 @@ impl ServiceInner { config.az_subnet(), )?; - let marker_paths: Vec = storage_resources - 
.all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) + let marker_paths: Vec = storage_manager + .get_latest_resources() .await + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(RSS_COMPLETED_FILENAME)) .collect(); @@ -810,7 +812,7 @@ impl ServiceInner { "RSS configuration looks like it has already been applied", ); - let sled_plan = SledPlan::load(&self.log, storage_resources) + let sled_plan = SledPlan::load(&self.log, storage_manager) .await? .expect("Sled plan should exist if completed marker exists"); if &sled_plan.config != config { @@ -818,7 +820,7 @@ impl ServiceInner { "Configuration changed".to_string(), )); } - let service_plan = ServicePlan::load(&self.log, storage_resources) + let service_plan = ServicePlan::load(&self.log, storage_manager) .await? .expect("Service plan should exist if completed marker exists"); @@ -852,7 +854,7 @@ impl ServiceInner { BootstrapAddressDiscovery::OnlyThese { addrs } => addrs.clone(), }; let maybe_sled_plan = - SledPlan::load(&self.log, storage_resources).await?; + SledPlan::load(&self.log, storage_manager).await?; if let Some(plan) = &maybe_sled_plan { let stored_peers: HashSet = plan.sleds.keys().map(|a| *a.ip()).collect(); @@ -884,7 +886,7 @@ impl ServiceInner { SledPlan::create( &self.log, config, - &storage_resources, + &storage_manager, bootstrap_addrs, config.trust_quorum_peers.is_some(), ) @@ -939,14 +941,14 @@ impl ServiceInner { }) .collect(); let service_plan = if let Some(plan) = - ServicePlan::load(&self.log, storage_resources).await? + ServicePlan::load(&self.log, storage_manager).await? { plan } else { ServicePlan::create( &self.log, &config, - &storage_resources, + &storage_manager, &plan.sleds, ) .await? 
diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index 156547627c..903c8dabaa 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -8,14 +8,15 @@ use super::config::Config; use super::http_entrypoints::api as http_api; use super::sled_agent::SledAgent; use crate::bootstrap::params::StartSledAgentRequest; +use crate::long_running_tasks::LongRunningTaskHandles; use crate::nexus::NexusClientWithResolver; use crate::services::ServiceManager; -use crate::storage_manager::StorageManager; -use bootstore::schemes::v0 as bootstore; +use crate::storage_monitor::UnderlayAccess; use internal_dns::resolver::Resolver; use slog::Logger; use std::net::SocketAddr; use std::sync::Arc; +use tokio::sync::oneshot; use uuid::Uuid; /// Packages up a [`SledAgent`], running the sled agent API under a Dropshot @@ -39,9 +40,9 @@ impl Server { config: &Config, log: Logger, request: StartSledAgentRequest, + long_running_tasks_handles: LongRunningTaskHandles, services: ServiceManager, - storage: StorageManager, - bootstore: bootstore::NodeHandle, + underlay_available_tx: oneshot::Sender, ) -> Result { info!(log, "setting up sled agent server"); @@ -63,8 +64,8 @@ impl Server { nexus_client, request, services, - storage, - bootstore, + long_running_tasks_handles, + underlay_available_tx, ) .await .map_err(|e| e.to_string())?; diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index fbd475d05f..a6dc6f72b2 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -5,7 +5,7 @@ //! Sled-local service management. //! //! For controlling zone-based storage services, refer to -//! [crate::storage_manager::StorageManager]. +//! [sled_storage::manager::StorageManager]. //! //! For controlling virtual machine instances, refer to //! [crate::instance_manager::InstanceManager]. 
@@ -38,7 +38,6 @@ use crate::profile::*; use crate::services_migration::{AllZoneRequests, SERVICES_LEDGER_FILENAME}; use crate::smf_helper::Service; use crate::smf_helper::SmfHelper; -use crate::storage_manager::StorageResources; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use anyhow::anyhow; @@ -90,12 +89,13 @@ use omicron_common::nexus_config::{ }; use once_cell::sync::OnceCell; use rand::prelude::SliceRandom; -use sled_hardware::disk::ZONE_DATASET; use sled_hardware::is_gimlet; use sled_hardware::underlay; use sled_hardware::underlay::BOOTSTRAP_PREFIX; use sled_hardware::Baseboard; use sled_hardware::SledMode; +use sled_storage::dataset::{CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET}; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::collections::HashSet; @@ -519,7 +519,7 @@ pub struct ServiceManagerInner { advertised_prefixes: Mutex>>, sled_info: OnceCell, switch_zone_bootstrap_address: Ipv6Addr, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, ledger_directory_override: OnceCell, image_directory_override: OnceCell, @@ -564,10 +564,11 @@ impl ServiceManager { skip_timesync: Option, sidecar_revision: SidecarRevision, switch_zone_maghemite_links: Vec, - storage: StorageResources, + storage: StorageHandle, zone_bundler: ZoneBundler, ) -> Self { let log = log.new(o!("component" => "ServiceManager")); + info!(log, "Creating ServiceManager"); Self { inner: Arc::new(ServiceManagerInner { log: log.clone(), @@ -622,10 +623,9 @@ impl ServiceManager { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(SERVICES_LEDGER_FILENAME)]; } - self.inner - .storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) - .await + let resources = self.inner.storage.get_latest_resources().await; + resources + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(SERVICES_LEDGER_FILENAME)) .collect() @@ -635,10 +635,9 @@ 
impl ServiceManager { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(ZONES_LEDGER_FILENAME)]; } - self.inner - .storage - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) - .await + let resources = self.inner.storage.get_latest_resources().await; + resources + .all_m2_mountpoints(CONFIG_DATASET) .into_iter() .map(|p| p.join(ZONES_LEDGER_FILENAME)) .collect() @@ -1433,11 +1432,11 @@ impl ServiceManager { // If the boot disk exists, look for the image in the "install" dataset // there too. - if let Some((_, boot_zpool)) = self.inner.storage.boot_disk().await { - zone_image_paths.push( - boot_zpool - .dataset_mountpoint(sled_hardware::disk::INSTALL_DATASET), - ); + if let Some((_, boot_zpool)) = + self.inner.storage.get_latest_resources().await.boot_disk() + { + zone_image_paths + .push(boot_zpool.dataset_mountpoint(INSTALL_DATASET)); } let zone_type_str = match &request { @@ -2772,8 +2771,12 @@ impl ServiceManager { } // Create zones that should be running - let all_u2_roots = - self.inner.storage.all_u2_mountpoints(ZONE_DATASET).await; + let all_u2_roots = self + .inner + .storage + .get_latest_resources() + .await + .all_u2_mountpoints(ZONE_DATASET); let mut new_zones = Vec::new(); for zone in zones_to_be_added { // Check if we think the zone should already be running @@ -3526,7 +3529,7 @@ impl ServiceManager { #[cfg(test)] mod test { use super::*; - use async_trait::async_trait; + use illumos_utils::zpool::ZpoolName; use illumos_utils::{ dladm::{ Etherstub, MockDladm, BOOTSTRAP_ETHERSTUB_NAME, @@ -3535,10 +3538,10 @@ mod test { svc, zone::MockZones, }; - use key_manager::{ - SecretRetriever, SecretRetrieverError, SecretState, VersionedIkm, - }; use omicron_common::address::OXIMETER_PORT; + use sled_storage::disk::{RawDisk, SyntheticDisk}; + + use sled_storage::manager::{FakeStorageManager, StorageHandle}; use std::net::{Ipv6Addr, SocketAddrV6}; use std::os::unix::process::ExitStatusExt; use uuid::Uuid; @@ -3566,6 
+3569,7 @@ mod test { // Returns the expectations for a new service to be created. fn expect_new_service() -> Vec> { + illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst); // Create a VNIC let create_vnic_ctx = MockDladm::create_vnic_context(); create_vnic_ctx.expect().return_once( @@ -3608,8 +3612,7 @@ mod test { let wait_ctx = svc::wait_for_service_context(); wait_ctx.expect().return_once(|_, _, _| Ok(())); - // Import the manifest, enable the service - let execute_ctx = illumos_utils::execute_context(); + let execute_ctx = illumos_utils::execute_helper_context(); execute_ctx.expect().times(..).returning(|_| { Ok(std::process::Output { status: std::process::ExitStatus::from_raw(0), @@ -3635,6 +3638,7 @@ mod test { // This is looser than the expectations created by ensure_new_service() // because these functions may return any number of times. fn expect_new_services() -> Vec> { + illumos_utils::USE_MOCKS.store(true, Ordering::SeqCst); // Create a VNIC let create_vnic_ctx = MockDladm::create_vnic_context(); create_vnic_ctx.expect().returning( @@ -3651,7 +3655,7 @@ mod test { Ok(()) }); - // // Boot the zone. + // Boot the zone. 
let boot_ctx = MockZones::boot_context(); boot_ctx.expect().returning(|name| { assert!(name.starts_with(EXPECTED_ZONE_NAME_PREFIX)); @@ -3683,7 +3687,7 @@ mod test { wait_ctx.expect().returning(|_, _, _| Ok(())); // Import the manifest, enable the service - let execute_ctx = illumos_utils::execute_context(); + let execute_ctx = illumos_utils::execute_helper_context(); execute_ctx.expect().times(..).returning(|_| { Ok(std::process::Output { status: std::process::ExitStatus::from_raw(0), @@ -3794,29 +3798,24 @@ mod test { } } - pub struct TestSecretRetriever {} + async fn setup_storage() -> StorageHandle { + let (manager, handle) = FakeStorageManager::new(); - #[async_trait] - impl SecretRetriever for TestSecretRetriever { - async fn get_latest( - &self, - ) -> Result { - let epoch = 0; - let salt = [0u8; 32]; - let secret = [0x1d; 32]; + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); - Ok(VersionedIkm::new(epoch, salt, &secret)) - } + let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let internal_disk: RawDisk = + SyntheticDisk::new(internal_zpool_name).into(); + handle.upsert_disk(internal_disk).await; + let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let external_disk: RawDisk = + SyntheticDisk::new(external_zpool_name).into(); + handle.upsert_disk(external_disk).await; - async fn get( - &self, - epoch: u64, - ) -> Result { - if epoch != 0 { - return Err(SecretRetrieverError::NoSuchEpoch(epoch)); - } - Ok(SecretState::Current(self.get_latest().await?)) - } + handle } #[tokio::test] @@ -3827,10 +3826,10 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3841,7 +3840,7 @@ mod test { 
Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources, + storage_handle, zone_bundler, ); test_config.override_paths(&mgr); @@ -3889,10 +3888,10 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3903,7 +3902,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources, + storage_handle, zone_bundler, ); test_config.override_paths(&mgr); @@ -3950,10 +3949,10 @@ mod test { // First, spin up a ServiceManager, create a new service, and tear it // down. - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -3964,7 +3963,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -3998,7 +3997,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -4040,10 +4039,10 @@ mod test { // First, spin up a ServiceManager, create a new zone, and then tear // down the ServiceManager. 
- let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -4054,7 +4053,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -4094,7 +4093,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle, zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -4131,10 +4130,10 @@ mod test { let log = logctx.log.clone(); let test_config = TestConfig::new().await; - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -4145,7 +4144,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources, + storage_handle, zone_bundler, ); test_config.override_paths(&mgr); @@ -4266,10 +4265,10 @@ mod test { let ddmd_client = DdmAdminClient::localhost(&log).unwrap(); let bootstrap_networking = make_bootstrap_networking_config(); - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -4280,7 +4279,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); @@ -4347,7 +4346,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); 
test_config.override_paths(&mgr); @@ -4384,10 +4383,10 @@ mod test { let ddmd_client = DdmAdminClient::localhost(&log).unwrap(); let bootstrap_networking = make_bootstrap_networking_config(); - let resources = StorageResources::new_for_test(); + let storage_handle = setup_storage().await; let zone_bundler = ZoneBundler::new( log.clone(), - resources.clone(), + storage_handle.clone(), Default::default(), ); let mgr = ServiceManager::new( @@ -4398,7 +4397,7 @@ mod test { Some(true), SidecarRevision::Physical("rev-test".to_string()), vec![], - resources.clone(), + storage_handle.clone(), zone_bundler.clone(), ); test_config.override_paths(&mgr); diff --git a/sled-agent/src/services_migration.rs b/sled-agent/src/services_migration.rs index d61250ebbe..ac3400a2c6 100644 --- a/sled-agent/src/services_migration.rs +++ b/sled-agent/src/services_migration.rs @@ -24,11 +24,10 @@ //! past this change. use crate::params::{ - DatasetKind, OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, - ZoneType, OMICRON_ZONES_CONFIG_INITIAL_VERSION, + OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, ZoneType, + OMICRON_ZONES_CONFIG_INITIAL_VERSION, }; use crate::services::{OmicronZoneConfigLocal, OmicronZonesConfigLocal}; -use crate::storage::dataset::DatasetName; use anyhow::{anyhow, ensure, Context}; use camino::Utf8PathBuf; use omicron_common::api::external::Generation; @@ -38,6 +37,7 @@ use omicron_common::api::internal::shared::{ use omicron_common::ledger::Ledgerable; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_storage::dataset::{DatasetKind, DatasetName}; use std::fmt::Debug; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use uuid::Uuid; diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 835f15485f..38fd8bad0e 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -11,6 +11,7 @@ use crate::bootstrap::early_networking::{ use crate::bootstrap::params::StartSledAgentRequest; use 
crate::config::Config; use crate::instance_manager::{InstanceManager, ReservoirMode}; +use crate::long_running_tasks::LongRunningTaskHandles; use crate::metrics::MetricsManager; use crate::nexus::{NexusClientWithResolver, NexusRequestQueue}; use crate::params::{ @@ -20,7 +21,7 @@ use crate::params::{ VpcFirewallRule, ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; -use crate::storage_manager::{self, StorageManager}; +use crate::storage_monitor::UnderlayAccess; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::zone_bundle; use crate::zone_bundle::BundleError; @@ -57,13 +58,13 @@ use omicron_common::backoff::{ retry_policy_internal_service_aggressive, BackoffError, }; use oximeter::types::ProducerRegistry; -use sled_hardware::underlay; -use sled_hardware::HardwareManager; -use sled_hardware::{underlay::BootstrapInterface, Baseboard}; +use sled_hardware::{underlay, Baseboard, HardwareManager}; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::sync::Arc; +use tokio::sync::oneshot; use uuid::Uuid; #[cfg(not(test))] @@ -110,7 +111,7 @@ pub enum Error { Instance(#[from] crate::instance_manager::Error), #[error("Error managing storage: {0}")] - Storage(#[from] crate::storage_manager::Error), + Storage(#[from] sled_storage::error::Error), #[error("Error updating: {0}")] Download(#[from] crate::updates::Error), @@ -227,7 +228,7 @@ struct SledAgentInner { start_request: StartSledAgentRequest, // Component of Sled Agent responsible for storage and dataset management. - storage: StorageManager, + storage: StorageHandle, // Component of Sled Agent responsible for managing Propolis instances. 
instances: InstanceManager, @@ -287,8 +288,8 @@ impl SledAgent { nexus_client: NexusClientWithResolver, request: StartSledAgentRequest, services: ServiceManager, - storage: StorageManager, - bootstore: bootstore::NodeHandle, + long_running_task_handles: LongRunningTaskHandles, + underlay_available_tx: oneshot::Sender, ) -> Result { // Pass the "parent_log" to all subcomponents that want to set their own // "component" value. @@ -301,14 +302,14 @@ impl SledAgent { )); info!(&log, "SledAgent::new(..) starting"); - let boot_disk = storage - .resources() - .boot_disk() + let storage_manager = &long_running_task_handles.storage_manager; + let boot_disk = storage_manager + .get_latest_resources() .await + .boot_disk() .ok_or_else(|| Error::BootDiskNotFound)?; - // Configure a swap device of the configured size before other system - // setup. + // Configure a swap device of the configured size before other system setup. match config.swap_device_size_gb { Some(sz) if sz > 0 => { info!(log, "Requested swap device of size {} GiB", sz); @@ -363,28 +364,23 @@ impl SledAgent { *sled_address.ip(), ); - storage - .setup_underlay_access(storage_manager::UnderlayAccess { + // Inform the `StorageMonitor` that the underlay is available so that + // it can try to contact nexus. + underlay_available_tx + .send(UnderlayAccess { nexus_client: nexus_client.clone(), sled_id: request.body.id, }) - .await?; - - // TODO-correctness The bootstrap agent _also_ has a `HardwareManager`. - // We only use it for reading properties, but it's not `Clone`able - // because it's holding an inner task handle. Could we add a way to get - // a read-only handle to it, and have bootstrap agent give us that - // instead of creating a new full one ourselves? 
- let hardware = HardwareManager::new(&parent_log, services.sled_mode()) - .map_err(|e| Error::Hardware(e))?; + .map_err(|_| ()) + .expect("Failed to send to StorageMonitor"); let instances = InstanceManager::new( parent_log.clone(), nexus_client.clone(), etherstub.clone(), port_manager.clone(), - storage.resources().clone(), - storage.zone_bundler().clone(), + storage_manager.clone(), + long_running_task_handles.zone_bundler.clone(), )?; // Configure the VMM reservoir as either a percentage of DRAM or as an @@ -409,7 +405,10 @@ impl SledAgent { } _ => { instances - .set_reservoir_size(&hardware, reservoir_mode) + .set_reservoir_size( + &long_running_task_handles.hardware_manager, + reservoir_mode, + ) .map_err(|e| { error!(log, "Failed to setup VMM reservoir: {e}"); e @@ -431,7 +430,8 @@ impl SledAgent { // until we have this, as we need to know which switches have uplinks to // correctly set up services. let get_network_config = || async { - let serialized_config = bootstore + let serialized_config = long_running_task_handles + .bootstore .get_network_config() .await .map_err(|err| BackoffError::transient(err.to_string()))? @@ -477,7 +477,7 @@ impl SledAgent { let mut metrics_manager = MetricsManager::new( request.body.id, request.body.rack_id, - hardware.baseboard(), + long_running_task_handles.hardware_manager.baseboard(), log.new(o!("component" => "MetricsManager")), )?; @@ -514,15 +514,14 @@ impl SledAgent { endpoint, )); - let zone_bundler = storage.zone_bundler().clone(); let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.body.id, subnet: request.body.subnet, start_request: request, - storage, + storage: long_running_task_handles.storage_manager.clone(), instances, - hardware, + hardware: long_running_task_handles.hardware_manager.clone(), updates, port_manager, services, @@ -536,8 +535,8 @@ impl SledAgent { // request queue? 
nexus_request_queue: NexusRequestQueue::new(), rack_network_config, - zone_bundler, - bootstore: bootstore.clone(), + zone_bundler: long_running_task_handles.zone_bundler.clone(), + bootstore: long_running_task_handles.bootstore.clone(), metrics_manager, }), log: log.clone(), @@ -557,6 +556,7 @@ impl SledAgent { /// Blocks until all services have started, retrying indefinitely on /// failure. pub(crate) async fn load_services(&self) { + info!(self.log, "Loading cold boot services"); retry_notify( retry_policy_internal_service_aggressive(), || async { @@ -663,12 +663,15 @@ impl SledAgent { if call_count == 0 { info!( log, - "failed to notify nexus about sled agent"; "error" => err, + "failed to notify nexus about sled agent"; + "error" => %err, ); } else if total_duration > std::time::Duration::from_secs(30) { warn!( log, - "failed to notify nexus about sled agent"; "error" => err, "total duration" => ?total_duration, + "failed to notify nexus about sled agent"; + "error" => %err, + "total duration" => ?total_duration, ); } }; @@ -841,9 +844,18 @@ impl SledAgent { } /// Gets the sled's current list of all zpools. 
- pub async fn zpools_get(&self) -> Result, Error> { - let zpools = self.inner.storage.get_zpools().await?; - Ok(zpools) + pub async fn zpools_get(&self) -> Vec { + self.inner + .storage + .get_latest_resources() + .await + .get_all_zpools() + .into_iter() + .map(|(name, variant)| Zpool { + id: name.id(), + disk_type: variant.into(), + }) + .collect() } /// Returns whether or not the sled believes itself to be a scrimlet @@ -1083,7 +1095,9 @@ pub async fn add_sled_to_initialized_rack( // Get all known bootstrap addresses via DDM let ddm_admin_client = DdmAdminClient::localhost(&log)?; let addrs = ddm_admin_client - .derive_bootstrap_addrs_from_prefixes(&[BootstrapInterface::GlobalZone]) + .derive_bootstrap_addrs_from_prefixes(&[ + underlay::BootstrapInterface::GlobalZone, + ]) .await?; // Create a set of futures to concurrently map the baseboard to bootstrap ip diff --git a/sled-agent/src/storage/dataset.rs b/sled-agent/src/storage/dataset.rs deleted file mode 100644 index eda78cec06..0000000000 --- a/sled-agent/src/storage/dataset.rs +++ /dev/null @@ -1,50 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use crate::params::DatasetKind; -use illumos_utils::zpool::ZpoolName; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; - -#[derive( - Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Clone, JsonSchema, -)] -pub struct DatasetName { - // A unique identifier for the Zpool on which the dataset is stored. - pool_name: ZpoolName, - // A name for the dataset within the Zpool. 
- kind: DatasetKind, -} - -impl DatasetName { - pub fn new(pool_name: ZpoolName, kind: DatasetKind) -> Self { - Self { pool_name, kind } - } - - pub fn pool(&self) -> &ZpoolName { - &self.pool_name - } - - pub fn dataset(&self) -> &DatasetKind { - &self.kind - } - - pub fn full(&self) -> String { - format!("{}/{}", self.pool_name, self.kind) - } -} - -#[cfg(test)] -mod test { - use super::*; - use uuid::Uuid; - - #[test] - fn serialize_dataset_name() { - let pool = ZpoolName::new_internal(Uuid::new_v4()); - let kind = DatasetKind::Crucible; - let name = DatasetName::new(pool, kind); - toml::to_string(&name).unwrap(); - } -} diff --git a/sled-agent/src/storage/mod.rs b/sled-agent/src/storage/mod.rs deleted file mode 100644 index 74bd59a151..0000000000 --- a/sled-agent/src/storage/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Management of local storage - -pub(crate) mod dataset; -pub(crate) mod dump_setup; diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs deleted file mode 100644 index c31a4dc0bc..0000000000 --- a/sled-agent/src/storage_manager.rs +++ /dev/null @@ -1,1432 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Management of sled-local storage. 
- -use crate::nexus::NexusClientWithResolver; -use crate::storage::dataset::DatasetName; -use crate::storage::dump_setup::DumpSetup; -use crate::zone_bundle::ZoneBundler; -use camino::Utf8PathBuf; -use derive_more::From; -use futures::stream::FuturesOrdered; -use futures::FutureExt; -use futures::StreamExt; -use illumos_utils::zpool::{ZpoolKind, ZpoolName}; -use illumos_utils::{zfs::Mountpoint, zpool::ZpoolInfo}; -use key_manager::StorageKeyRequester; -use nexus_client::types::PhysicalDiskDeleteRequest; -use nexus_client::types::PhysicalDiskKind; -use nexus_client::types::PhysicalDiskPutRequest; -use nexus_client::types::ZpoolPutRequest; -use omicron_common::api::external::{ByteCount, ByteCountRangeError}; -use omicron_common::backoff; -use omicron_common::disk::DiskIdentity; -use sled_hardware::{Disk, DiskVariant, UnparsedDisk}; -use slog::Logger; -use std::collections::hash_map; -use std::collections::HashMap; -use std::collections::HashSet; -use std::convert::TryFrom; -use std::pin::Pin; -use std::sync::Arc; -use std::sync::OnceLock; -use std::time::Duration; -use tokio::sync::{mpsc, oneshot, Mutex}; -use tokio::task::JoinHandle; -use tokio::time::{interval, MissedTickBehavior}; -use uuid::Uuid; - -use illumos_utils::dumpadm::DumpHdrError; -#[cfg(test)] -use illumos_utils::{zfs::MockZfs as Zfs, zpool::MockZpool as Zpool}; -#[cfg(not(test))] -use illumos_utils::{zfs::Zfs, zpool::Zpool}; - -// A key manager can only become ready once. This occurs during RSS or cold -// boot when the bootstore has detected it has a key share. -static KEY_MANAGER_READY: OnceLock<()> = OnceLock::new(); - -#[derive(thiserror::Error, Debug)] -pub enum Error { - #[error(transparent)] - DiskError(#[from] sled_hardware::DiskError), - - // TODO: We could add the context of "why are we doint this op", maybe? 
- #[error(transparent)] - ZfsListDataset(#[from] illumos_utils::zfs::ListDatasetsError), - - #[error(transparent)] - ZfsEnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), - - #[error(transparent)] - ZfsSetValue(#[from] illumos_utils::zfs::SetValueError), - - #[error(transparent)] - ZfsGetValue(#[from] illumos_utils::zfs::GetValueError), - - #[error(transparent)] - GetZpoolInfo(#[from] illumos_utils::zpool::GetInfoError), - - #[error(transparent)] - Fstyp(#[from] illumos_utils::fstyp::Error), - - #[error(transparent)] - ZoneCommand(#[from] illumos_utils::running_zone::RunCommandError), - - #[error(transparent)] - ZoneBoot(#[from] illumos_utils::running_zone::BootError), - - #[error(transparent)] - ZoneEnsureAddress(#[from] illumos_utils::running_zone::EnsureAddressError), - - #[error(transparent)] - ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), - - #[error("No U.2 Zpools found")] - NoU2Zpool, - - #[error("Failed to parse UUID from {path}: {err}")] - ParseUuid { - path: Utf8PathBuf, - #[source] - err: uuid::Error, - }, - - #[error("Dataset {name:?} exists with a different uuid (has {old}, requested {new})")] - UuidMismatch { name: Box, old: Uuid, new: Uuid }, - - #[error("Error parsing pool {name}'s size: {err}")] - BadPoolSize { - name: String, - #[source] - err: ByteCountRangeError, - }, - - #[error("Failed to parse the dataset {name}'s UUID: {err}")] - ParseDatasetUuid { - name: String, - #[source] - err: uuid::Error, - }, - - #[error("Zpool Not Found: {0}")] - ZpoolNotFound(String), - - #[error("Failed to serialize toml (intended for {path:?}): {err}")] - Serialize { - path: Utf8PathBuf, - #[source] - err: toml::ser::Error, - }, - - #[error("Failed to deserialize toml from {path:?}: {err}")] - Deserialize { - path: Utf8PathBuf, - #[source] - err: toml::de::Error, - }, - - #[error("Failed to perform I/O: {message}: {err}")] - Io { - message: String, - #[source] - err: std::io::Error, - }, - - #[error("Underlay not yet 
initialized")] - UnderlayNotInitialized, - - #[error("Encountered error checking dump device flags: {0}")] - DumpHdr(#[from] DumpHdrError), -} - -/// A ZFS storage pool. -struct Pool { - name: ZpoolName, - info: ZpoolInfo, - parent: DiskIdentity, -} - -impl Pool { - /// Queries for an existing Zpool by name. - /// - /// Returns Ok if the pool exists. - fn new(name: ZpoolName, parent: DiskIdentity) -> Result { - let info = Zpool::get_info(&name.to_string())?; - Ok(Pool { name, info, parent }) - } - - fn parent(&self) -> &DiskIdentity { - &self.parent - } -} - -// The type of a future which is used to send a notification to Nexus. -type NotifyFut = - Pin> + Send>>; - -#[derive(Debug)] -struct NewFilesystemRequest { - dataset_id: Uuid, - dataset_name: DatasetName, - responder: oneshot::Sender>, -} - -struct UnderlayRequest { - underlay: UnderlayAccess, - responder: oneshot::Sender>, -} - -#[derive(PartialEq, Eq, Clone)] -pub(crate) enum DiskWrapper { - Real { disk: Disk, devfs_path: Utf8PathBuf }, - Synthetic { zpool_name: ZpoolName }, -} - -impl From for DiskWrapper { - fn from(disk: Disk) -> Self { - let devfs_path = disk.devfs_path().clone(); - Self::Real { disk, devfs_path } - } -} - -impl DiskWrapper { - fn identity(&self) -> DiskIdentity { - match self { - DiskWrapper::Real { disk, .. } => disk.identity().clone(), - DiskWrapper::Synthetic { zpool_name } => { - let id = zpool_name.id(); - DiskIdentity { - vendor: "synthetic-vendor".to_string(), - serial: format!("synthetic-serial-{id}"), - model: "synthetic-model".to_string(), - } - } - } - } - - fn variant(&self) -> DiskVariant { - match self { - DiskWrapper::Real { disk, .. } => disk.variant(), - DiskWrapper::Synthetic { zpool_name } => match zpool_name.kind() { - ZpoolKind::External => DiskVariant::U2, - ZpoolKind::Internal => DiskVariant::M2, - }, - } - } - - fn zpool_name(&self) -> &ZpoolName { - match self { - DiskWrapper::Real { disk, .. 
} => disk.zpool_name(), - DiskWrapper::Synthetic { zpool_name } => zpool_name, - } - } -} - -#[derive(Clone)] -pub struct StorageResources { - // All disks, real and synthetic, being managed by this sled - disks: Arc>>, - - // A map of "Uuid" to "pool". - pools: Arc>>, -} - -// The directory within the debug dataset in which bundles are created. -const BUNDLE_DIRECTORY: &str = "bundle"; - -// The directory for zone bundles. -const ZONE_BUNDLE_DIRECTORY: &str = "zone"; - -impl StorageResources { - /// Creates a fabricated view of storage resources. - /// - /// Use this only when you want to reference the disks, but not actually - /// access them. Creates one internal and one external disk. - #[cfg(test)] - pub fn new_for_test() -> Self { - let new_disk_identity = || DiskIdentity { - vendor: "vendor".to_string(), - serial: Uuid::new_v4().to_string(), - model: "model".to_string(), - }; - - Self { - disks: Arc::new(Mutex::new(HashMap::from([ - ( - new_disk_identity(), - DiskWrapper::Synthetic { - zpool_name: ZpoolName::new_internal(Uuid::new_v4()), - }, - ), - ( - new_disk_identity(), - DiskWrapper::Synthetic { - zpool_name: ZpoolName::new_external(Uuid::new_v4()), - }, - ), - ]))), - pools: Arc::new(Mutex::new(HashMap::new())), - } - } - - /// Returns the identity of the boot disk. - /// - /// If this returns `None`, we have not processed the boot disk yet. - pub async fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { - let disks = self.disks.lock().await; - disks.iter().find_map(|(id, disk)| { - match disk { - // This is the "real" use-case: if we have real disks, query - // their properties to identify if they truly are the boot disk. - DiskWrapper::Real { disk, .. } => { - if disk.is_boot_disk() { - return Some((id.clone(), disk.zpool_name().clone())); - } - } - // This is the "less real" use-case: if we have synthetic disks, - // just label the first M.2-looking one as a "boot disk". - DiskWrapper::Synthetic { .. 
} => { - if matches!(disk.variant(), DiskVariant::M2) { - return Some((id.clone(), disk.zpool_name().clone())); - } - } - }; - None - }) - } - - // TODO: Could be generic over DiskVariant - - /// Returns all M.2 zpools - pub async fn all_m2_zpools(&self) -> Vec { - self.all_zpools(DiskVariant::M2).await - } - - /// Returns all U.2 zpools - pub async fn all_u2_zpools(&self) -> Vec { - self.all_zpools(DiskVariant::U2).await - } - - /// Returns all mountpoints within all M.2s for a particular dataset. - pub async fn all_m2_mountpoints(&self, dataset: &str) -> Vec { - let m2_zpools = self.all_m2_zpools().await; - m2_zpools - .iter() - .map(|zpool| zpool.dataset_mountpoint(dataset)) - .collect() - } - - /// Returns all mountpoints within all U.2s for a particular dataset. - pub async fn all_u2_mountpoints(&self, dataset: &str) -> Vec { - let u2_zpools = self.all_u2_zpools().await; - u2_zpools - .iter() - .map(|zpool| zpool.dataset_mountpoint(dataset)) - .collect() - } - - /// Returns all zpools of a particular variant - pub async fn all_zpools(&self, variant: DiskVariant) -> Vec { - let disks = self.disks.lock().await; - disks - .values() - .filter_map(|disk| { - if disk.variant() == variant { - return Some(disk.zpool_name().clone()); - } - None - }) - .collect() - } - - /// Return the directories for storing zone service bundles. - pub async fn all_zone_bundle_directories(&self) -> Vec { - self.all_m2_mountpoints(sled_hardware::disk::M2_DEBUG_DATASET) - .await - .into_iter() - .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY)) - .collect() - } -} - -/// Describes the access to the underlay used by the StorageManager. -pub struct UnderlayAccess { - pub nexus_client: NexusClientWithResolver, - pub sled_id: Uuid, -} - -// A worker that starts zones for pools as they are received. 
-struct StorageWorker { - log: Logger, - nexus_notifications: FuturesOrdered, - rx: mpsc::Receiver, - underlay: Arc>>, - - // A mechanism for requesting disk encryption keys from the - // [`key_manager::KeyManager`] - key_requester: StorageKeyRequester, - - // Invokes dumpadm(8) and savecore(8) when new disks are encountered - dump_setup: Arc, -} - -#[derive(Clone, Debug)] -enum NotifyDiskRequest { - Add { identity: DiskIdentity, variant: DiskVariant }, - Remove(DiskIdentity), -} - -#[derive(From, Clone, Debug, PartialEq, Eq, Hash)] -enum QueuedDiskCreate { - Real(UnparsedDisk), - Synthetic(ZpoolName), -} - -impl QueuedDiskCreate { - fn is_synthetic(&self) -> bool { - if let QueuedDiskCreate::Synthetic(_) = self { - true - } else { - false - } - } -} - -impl StorageWorker { - // Ensures the named dataset exists as a filesystem with a UUID, optionally - // creating it if `do_format` is true. - // - // Returns the UUID attached to the ZFS filesystem. - fn ensure_dataset( - &mut self, - dataset_id: Uuid, - dataset_name: &DatasetName, - ) -> Result<(), Error> { - let zoned = true; - let fs_name = &dataset_name.full(); - let do_format = true; - let encryption_details = None; - let size_details = None; - Zfs::ensure_filesystem( - &dataset_name.full(), - Mountpoint::Path(Utf8PathBuf::from("/data")), - zoned, - do_format, - encryption_details, - size_details, - None, - )?; - // Ensure the dataset has a usable UUID. - if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { - if let Ok(id) = id_str.parse::() { - if id != dataset_id { - return Err(Error::UuidMismatch { - name: Box::new(dataset_name.clone()), - old: id, - new: dataset_id, - }); - } - return Ok(()); - } - } - Zfs::set_oxide_value(&fs_name, "uuid", &dataset_id.to_string())?; - Ok(()) - } - - // Adds a "notification to nexus" to `nexus_notifications`, - // informing it about the addition of `pool_id` to this sled. 
- async fn add_zpool_notify(&mut self, pool: &Pool, size: ByteCount) { - // The underlay network is setup once at sled-agent startup. Before - // there is an underlay we want to avoid sending notifications to nexus for - // two reasons: - // 1. They can't possibly succeed - // 2. They increase the backoff time exponentially, so that once - // sled-agent does start it may take much longer to notify nexus - // than it would if we avoid this. This goes especially so for rack - // setup, when bootstrap agent is waiting an aribtrary time for RSS - // initialization. - if self.underlay.lock().await.is_none() { - return; - } - - let pool_id = pool.name.id(); - let DiskIdentity { vendor, serial, model } = pool.parent.clone(); - let underlay = self.underlay.clone(); - - let notify_nexus = move || { - let zpool_request = ZpoolPutRequest { - size: size.into(), - disk_vendor: vendor.clone(), - disk_serial: serial.clone(), - disk_model: model.clone(), - }; - let underlay = underlay.clone(); - - async move { - let underlay_guard = underlay.lock().await; - let Some(underlay) = underlay_guard.as_ref() else { - return Err(backoff::BackoffError::transient( - Error::UnderlayNotInitialized.to_string(), - )); - }; - let sled_id = underlay.sled_id; - let nexus_client = underlay.nexus_client.client().clone(); - drop(underlay_guard); - - nexus_client - .zpool_put(&sled_id, &pool_id, &zpool_request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - Ok(()) - } - }; - let log = self.log.clone(); - let name = pool.name.clone(); - let disk = pool.parent().clone(); - let log_post_failure = move |_, call_count, total_duration| { - if call_count == 0 { - info!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"); - } else if total_duration > std::time::Duration::from_secs(30) { - warn!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; - "total duration" => ?total_duration); - } - }; - self.nexus_notifications.push_back( - 
backoff::retry_notify_ext( - backoff::retry_policy_internal_service_aggressive(), - notify_nexus, - log_post_failure, - ) - .boxed(), - ); - } - - async fn ensure_using_exactly_these_disks( - &mut self, - resources: &StorageResources, - unparsed_disks: Vec, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - // Queue U.2 drives if necessary - // We clear all existing queued drives that are not synthetic and add - // new ones in the loop below - if let Some(queued) = queued_u2_drives { - info!( - self.log, - "Ensure exact disks: clearing non-synthetic queued disks." - ); - queued.retain(|d| d.is_synthetic()); - } - - let mut new_disks = HashMap::new(); - - // We may encounter errors while parsing any of the disks; keep track of - // any errors that occur and return any of them if something goes wrong. - // - // That being said, we should not prevent access to the other disks if - // only one failure occurs. - let mut err: Option = None; - - // Ensure all disks conform to the expected partition layout. - for disk in unparsed_disks.into_iter() { - if disk.variant() == DiskVariant::U2 { - if let Some(queued) = queued_u2_drives { - info!(self.log, "Queuing disk for upsert: {disk:?}"); - queued.insert(disk.into()); - continue; - } - } - match self.add_new_disk(disk, queued_u2_drives).await.map_err( - |err| { - warn!(self.log, "Could not ensure partitions: {err}"); - err - }, - ) { - Ok(disk) => { - new_disks.insert(disk.identity().clone(), disk); - } - Err(e) => { - warn!(self.log, "Cannot parse disk: {e}"); - err = Some(e.into()); - } - }; - } - - let mut disks = resources.disks.lock().await; - - // Remove disks that don't appear in the "new_disks" set. - // - // This also accounts for zpools and notifies Nexus. - let disks_to_be_removed = disks - .iter_mut() - .filter(|(key, old_disk)| { - // If this disk appears in the "new" and "old" set, it should - // only be removed if it has changed. 
- // - // This treats a disk changing in an unexpected way as a - // "removal and re-insertion". - match old_disk { - DiskWrapper::Real { disk, .. } => { - if let Some(new_disk) = new_disks.get(*key) { - // Changed Disk -> Disk should be removed. - new_disk != disk - } else { - // Real disk, not in the new set -> Disk should be removed. - true - } - } - // Synthetic disk -> Disk should NOT be removed. - DiskWrapper::Synthetic { .. } => false, - } - }) - .map(|(_key, disk)| disk.clone()) - .collect::>(); - - for disk in disks_to_be_removed { - if let Err(e) = self - .delete_disk_locked(&resources, &mut disks, &disk.identity()) - .await - { - warn!(self.log, "Failed to delete disk: {e}"); - err = Some(e); - } - } - - // Add new disks to `resources.disks`. - // - // This also accounts for zpools and notifies Nexus. - for (key, new_disk) in new_disks { - if let Some(old_disk) = disks.get(&key) { - // In this case, the disk should be unchanged. - // - // This assertion should be upheld by the filter above, which - // should remove disks that changed. - assert!(old_disk == &new_disk.into()); - } else { - let disk = DiskWrapper::Real { - disk: new_disk.clone(), - devfs_path: new_disk.devfs_path().clone(), - }; - if let Err(e) = - self.upsert_disk_locked(&resources, &mut disks, disk).await - { - warn!(self.log, "Failed to upsert disk: {e}"); - err = Some(e); - } - } - } - - if let Some(err) = err { - Err(err) - } else { - Ok(()) - } - } - - // Attempt to create a new disk via `sled_hardware::Disk::new()`. If the - // disk addition fails because the the key manager cannot load a secret, - // this indicates a transient error, and so we queue the disk so we can - // try again. 
- async fn add_new_disk( - &mut self, - unparsed_disk: UnparsedDisk, - queued_u2_drives: &mut Option>, - ) -> Result { - match sled_hardware::Disk::new( - &self.log, - unparsed_disk.clone(), - Some(&self.key_requester), - ) - .await - { - Ok(disk) => Ok(disk), - Err(sled_hardware::DiskError::KeyManager(err)) => { - warn!( - self.log, - "Transient error: {err} - queuing disk {:?}", unparsed_disk - ); - if let Some(queued) = queued_u2_drives { - queued.insert(unparsed_disk.into()); - } else { - *queued_u2_drives = - Some(HashSet::from([unparsed_disk.into()])); - } - Err(sled_hardware::DiskError::KeyManager(err)) - } - Err(err) => { - error!( - self.log, - "Persistent error: {err} - not queueing disk {:?}", - unparsed_disk - ); - Err(err) - } - } - } - - // Attempt to create a new synthetic disk via - // `sled_hardware::Disk::ensure_zpool_ready()`. If the disk addition fails - // because the the key manager cannot load a secret, this indicates a - // transient error, and so we queue the disk so we can try again. 
- async fn add_new_synthetic_disk( - &mut self, - zpool_name: ZpoolName, - queued_u2_drives: &mut Option>, - ) -> Result<(), sled_hardware::DiskError> { - let synthetic_id = DiskIdentity { - vendor: "fake_vendor".to_string(), - serial: "fake_serial".to_string(), - model: zpool_name.id().to_string(), - }; - match sled_hardware::Disk::ensure_zpool_ready( - &self.log, - &zpool_name, - &synthetic_id, - Some(&self.key_requester), - ) - .await - { - Ok(()) => Ok(()), - Err(sled_hardware::DiskError::KeyManager(err)) => { - warn!( - self.log, - "Transient error: {err} - queuing synthetic disk: {:?}", - zpool_name - ); - if let Some(queued) = queued_u2_drives { - queued.insert(zpool_name.into()); - } else { - *queued_u2_drives = - Some(HashSet::from([zpool_name.into()])); - } - Err(sled_hardware::DiskError::KeyManager(err)) - } - Err(err) => { - error!( - self.log, - "Persistent error: {} - not queueing synthetic disk {:?}", - err, - zpool_name - ); - Err(err) - } - } - } - - async fn upsert_disk( - &mut self, - resources: &StorageResources, - disk: UnparsedDisk, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - // Queue U.2 drives if necessary - if let Some(queued) = queued_u2_drives { - if disk.variant() == DiskVariant::U2 { - info!(self.log, "Queuing disk for upsert: {disk:?}"); - queued.insert(disk.into()); - return Ok(()); - } - } - - info!(self.log, "Upserting disk: {disk:?}"); - - // Ensure the disk conforms to an expected partition layout. 
- let disk = - self.add_new_disk(disk, queued_u2_drives).await.map_err(|err| { - warn!(self.log, "Could not ensure partitions: {err}"); - err - })?; - - let mut disks = resources.disks.lock().await; - let disk = DiskWrapper::Real { - disk: disk.clone(), - devfs_path: disk.devfs_path().clone(), - }; - self.upsert_disk_locked(resources, &mut disks, disk).await - } - - async fn upsert_synthetic_disk( - &mut self, - resources: &StorageResources, - zpool_name: ZpoolName, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - // Queue U.2 drives if necessary - if let Some(queued) = queued_u2_drives { - if zpool_name.kind() == ZpoolKind::External { - info!( - self.log, - "Queuing synthetic disk for upsert: {zpool_name:?}" - ); - queued.insert(zpool_name.into()); - return Ok(()); - } - } - - info!(self.log, "Upserting synthetic disk for: {zpool_name:?}"); - - self.add_new_synthetic_disk(zpool_name.clone(), queued_u2_drives) - .await?; - let disk = DiskWrapper::Synthetic { zpool_name }; - let mut disks = resources.disks.lock().await; - self.upsert_disk_locked(resources, &mut disks, disk).await - } - - async fn upsert_disk_locked( - &mut self, - resources: &StorageResources, - disks: &mut tokio::sync::MutexGuard< - '_, - HashMap, - >, - disk: DiskWrapper, - ) -> Result<(), Error> { - disks.insert(disk.identity(), disk.clone()); - self.physical_disk_notify(NotifyDiskRequest::Add { - identity: disk.identity(), - variant: disk.variant(), - }) - .await; - self.upsert_zpool(&resources, disk.identity(), disk.zpool_name()) - .await?; - - self.dump_setup.update_dumpdev_setup(disks).await; - - Ok(()) - } - - async fn delete_disk( - &mut self, - resources: &StorageResources, - disk: UnparsedDisk, - ) -> Result<(), Error> { - info!(self.log, "Deleting disk: {disk:?}"); - // TODO: Don't we need to do some accounting, e.g. for all the information - // that's no longer accessible? Or is that up to Nexus to figure out at - // a later point-in-time? 
- // - // If we're storing zone images on the M.2s for internal services, how - // do we reconcile them? - let mut disks = resources.disks.lock().await; - self.delete_disk_locked(resources, &mut disks, disk.identity()).await - } - - async fn delete_disk_locked( - &mut self, - resources: &StorageResources, - disks: &mut tokio::sync::MutexGuard< - '_, - HashMap, - >, - key: &DiskIdentity, - ) -> Result<(), Error> { - if let Some(parsed_disk) = disks.remove(key) { - resources.pools.lock().await.remove(&parsed_disk.zpool_name().id()); - self.physical_disk_notify(NotifyDiskRequest::Remove(key.clone())) - .await; - } - - self.dump_setup.update_dumpdev_setup(disks).await; - - Ok(()) - } - - /// When the underlay becomes available, we need to notify nexus about any - /// discovered disks and pools, since we don't attempt to notify until there - /// is an underlay available. - async fn notify_nexus_about_existing_resources( - &mut self, - resources: &StorageResources, - ) -> Result<(), Error> { - let disks = resources.disks.lock().await; - for disk in disks.values() { - self.physical_disk_notify(NotifyDiskRequest::Add { - identity: disk.identity(), - variant: disk.variant(), - }) - .await; - } - - // We may encounter errors while processing any of the pools; keep track of - // any errors that occur and return any of them if something goes wrong. - // - // That being said, we should not prevent notification to nexus of the - // other pools if only one failure occurs. 
- let mut err: Option = None; - - let pools = resources.pools.lock().await; - for pool in pools.values() { - match ByteCount::try_from(pool.info.size()).map_err(|err| { - Error::BadPoolSize { name: pool.name.to_string(), err } - }) { - Ok(size) => self.add_zpool_notify(pool, size).await, - Err(e) => { - warn!(self.log, "Failed to notify nexus about pool: {e}"); - err = Some(e) - } - } - } - - if let Some(err) = err { - Err(err) - } else { - Ok(()) - } - } - - // Adds a "notification to nexus" to `self.nexus_notifications`, informing it - // about the addition/removal of a physical disk to this sled. - async fn physical_disk_notify(&mut self, disk: NotifyDiskRequest) { - // The underlay network is setup once at sled-agent startup. Before - // there is an underlay we want to avoid sending notifications to nexus for - // two reasons: - // 1. They can't possibly succeed - // 2. They increase the backoff time exponentially, so that once - // sled-agent does start it may take much longer to notify nexus - // than it would if we avoid this. This goes especially so for rack - // setup, when bootstrap agent is waiting an aribtrary time for RSS - // initialization. 
- if self.underlay.lock().await.is_none() { - return; - } - let underlay = self.underlay.clone(); - let disk2 = disk.clone(); - let notify_nexus = move || { - let disk = disk.clone(); - let underlay = underlay.clone(); - async move { - let underlay_guard = underlay.lock().await; - let Some(underlay) = underlay_guard.as_ref() else { - return Err(backoff::BackoffError::transient( - Error::UnderlayNotInitialized.to_string(), - )); - }; - let sled_id = underlay.sled_id; - let nexus_client = underlay.nexus_client.client().clone(); - drop(underlay_guard); - - match &disk { - NotifyDiskRequest::Add { identity, variant } => { - let request = PhysicalDiskPutRequest { - model: identity.model.clone(), - serial: identity.serial.clone(), - vendor: identity.vendor.clone(), - variant: match variant { - DiskVariant::U2 => PhysicalDiskKind::U2, - DiskVariant::M2 => PhysicalDiskKind::M2, - }, - sled_id, - }; - nexus_client - .physical_disk_put(&request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - } - NotifyDiskRequest::Remove(disk_identity) => { - let request = PhysicalDiskDeleteRequest { - model: disk_identity.model.clone(), - serial: disk_identity.serial.clone(), - vendor: disk_identity.vendor.clone(), - sled_id, - }; - nexus_client - .physical_disk_delete(&request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - } - } - Ok(()) - } - }; - let log = self.log.clone(); - // This notification is often invoked before Nexus has started - // running, so avoid flagging any errors as concerning until some - // time has passed. 
- let log_post_failure = move |_, call_count, total_duration| { - if call_count == 0 { - info!(log, "failed to notify nexus about {disk2:?}"); - } else if total_duration > std::time::Duration::from_secs(30) { - warn!(log, "failed to notify nexus about {disk2:?}"; - "total duration" => ?total_duration); - } - }; - self.nexus_notifications.push_back( - backoff::retry_notify_ext( - backoff::retry_policy_internal_service_aggressive(), - notify_nexus, - log_post_failure, - ) - .boxed(), - ); - } - - async fn upsert_zpool( - &mut self, - resources: &StorageResources, - parent: DiskIdentity, - pool_name: &ZpoolName, - ) -> Result<(), Error> { - let mut pools = resources.pools.lock().await; - let zpool = Pool::new(pool_name.clone(), parent)?; - - let pool = match pools.entry(pool_name.id()) { - hash_map::Entry::Occupied(mut entry) => { - // The pool already exists. - entry.get_mut().info = zpool.info; - return Ok(()); - } - hash_map::Entry::Vacant(entry) => entry.insert(zpool), - }; - info!(&self.log, "Storage manager processing zpool: {:#?}", pool.info); - - let size = ByteCount::try_from(pool.info.size()).map_err(|err| { - Error::BadPoolSize { name: pool_name.to_string(), err } - })?; - // Notify Nexus of the zpool. - self.add_zpool_notify(&pool, size).await; - Ok(()) - } - - // Attempts to add a dataset within a zpool, according to `request`. 
- async fn add_dataset( - &mut self, - resources: &StorageResources, - request: &NewFilesystemRequest, - ) -> Result { - info!(self.log, "add_dataset: {:?}", request); - let mut pools = resources.pools.lock().await; - let pool = pools - .get_mut(&request.dataset_name.pool().id()) - .ok_or_else(|| { - Error::ZpoolNotFound(format!( - "{}, looked up while trying to add dataset", - request.dataset_name.pool(), - )) - })?; - let dataset_name = DatasetName::new( - pool.name.clone(), - request.dataset_name.dataset().clone(), - ); - self.ensure_dataset(request.dataset_id, &dataset_name)?; - Ok(dataset_name) - } - - // Small wrapper around `Self::do_work_internal` that ensures we always - // emit info to the log when we exit. - async fn do_work( - &mut self, - resources: StorageResources, - ) -> Result<(), Error> { - // We queue U.2 sleds until the StorageKeyRequester is ready to use. - let mut queued_u2_drives = Some(HashSet::new()); - loop { - match self.do_work_internal(&resources, &mut queued_u2_drives).await - { - Ok(()) => { - info!(self.log, "StorageWorker exited successfully"); - return Ok(()); - } - Err(e) => { - warn!( - self.log, - "StorageWorker encountered unexpected error: {}", e - ); - // ... for now, keep trying. - } - } - } - } - - async fn do_work_internal( - &mut self, - resources: &StorageResources, - queued_u2_drives: &mut Option>, - ) -> Result<(), Error> { - const QUEUED_DISK_RETRY_TIMEOUT: Duration = Duration::from_secs(5); - let mut interval = interval(QUEUED_DISK_RETRY_TIMEOUT); - interval.set_missed_tick_behavior(MissedTickBehavior::Delay); - loop { - tokio::select! 
{ - _ = self.nexus_notifications.next(), - if !self.nexus_notifications.is_empty() => {}, - Some(request) = self.rx.recv() => { - // We want to queue failed requests related to the key manager - match self.handle_storage_worker_request( - resources, queued_u2_drives, request) - .await { - Err(Error::DiskError(_)) => { - // We already handle and log disk errors, no need to - // return here. - } - Err(e) => return Err(e), - Ok(()) => {} - } - } - _ = interval.tick(), if queued_u2_drives.is_some() && - KEY_MANAGER_READY.get().is_some()=> - { - self.upsert_queued_disks(resources, queued_u2_drives).await; - } - } - } - } - - async fn handle_storage_worker_request( - &mut self, - resources: &StorageResources, - queued_u2_drives: &mut Option>, - request: StorageWorkerRequest, - ) -> Result<(), Error> { - use StorageWorkerRequest::*; - match request { - AddDisk(disk) => { - self.upsert_disk(&resources, disk, queued_u2_drives).await?; - } - AddSyntheticDisk(zpool_name) => { - self.upsert_synthetic_disk( - &resources, - zpool_name, - queued_u2_drives, - ) - .await?; - } - RemoveDisk(disk) => { - self.delete_disk(&resources, disk).await?; - } - NewFilesystem(request) => { - let result = self.add_dataset(&resources, &request).await; - let _ = request.responder.send(result); - } - DisksChanged(disks) => { - self.ensure_using_exactly_these_disks( - &resources, - disks, - queued_u2_drives, - ) - .await?; - } - SetupUnderlayAccess(UnderlayRequest { underlay, responder }) => { - // If this is the first time establishing an - // underlay we should notify nexus of all existing - // disks and zpools. 
- // - // Instead of individual notifications, we should - // send a bulk notification as described in https:// - // github.com/oxidecomputer/omicron/issues/1917 - if self.underlay.lock().await.replace(underlay).is_none() { - self.notify_nexus_about_existing_resources(&resources) - .await?; - } - let _ = responder.send(Ok(())); - } - KeyManagerReady => { - let _ = KEY_MANAGER_READY.set(()); - self.upsert_queued_disks(resources, queued_u2_drives).await; - } - } - Ok(()) - } - - async fn upsert_queued_disks( - &mut self, - resources: &StorageResources, - queued_u2_drives: &mut Option>, - ) { - let queued = queued_u2_drives.take(); - if let Some(queued) = queued { - for disk in queued { - if let Some(saved) = queued_u2_drives { - // We already hit a transient error and recreated our queue. - // Add any remaining queued disks back on the queue so we - // can try again later. - saved.insert(disk); - } else { - match self.upsert_queued_disk(disk, resources).await { - Ok(()) => {} - Err((_, None)) => { - // We already logged this as a persistent error in - // `add_new_disk` or `add_new_synthetic_disk` - } - Err((_, Some(disk))) => { - // We already logged this as a transient error in - // `add_new_disk` or `add_new_synthetic_disk` - *queued_u2_drives = Some(HashSet::from([disk])); - } - } - } - } - } - if queued_u2_drives.is_none() { - info!(self.log, "upserted all queued disks"); - } else { - warn!( - self.log, - "failed to upsert all queued disks - will try again" - ); - } - } - - // Attempt to upsert a queued disk. Return the disk and error if the upsert - // fails due to a transient error. Examples of transient errors are key - // manager errors which indicate that there are not enough sleds available - // to unlock the rack. 
- async fn upsert_queued_disk( - &mut self, - disk: QueuedDiskCreate, - resources: &StorageResources, - ) -> Result<(), (Error, Option)> { - let mut temp: Option> = None; - let res = match disk { - QueuedDiskCreate::Real(disk) => { - self.upsert_disk(&resources, disk, &mut temp).await - } - QueuedDiskCreate::Synthetic(zpool_name) => { - self.upsert_synthetic_disk(&resources, zpool_name, &mut temp) - .await - } - }; - if let Some(mut disks) = temp.take() { - assert!(res.is_err()); - assert_eq!(disks.len(), 1); - return Err(( - res.unwrap_err(), - disks.drain().next().unwrap().into(), - )); - } - // Any error at this point is not transient. - // We don't requeue the disk. - res.map_err(|e| (e, None)) - } -} - -enum StorageWorkerRequest { - AddDisk(UnparsedDisk), - AddSyntheticDisk(ZpoolName), - RemoveDisk(UnparsedDisk), - DisksChanged(Vec), - NewFilesystem(NewFilesystemRequest), - SetupUnderlayAccess(UnderlayRequest), - KeyManagerReady, -} - -struct StorageManagerInner { - log: Logger, - - resources: StorageResources, - - tx: mpsc::Sender, - - // A handle to a worker which updates "pools". - task: JoinHandle>, -} - -/// A sled-local view of all attached storage. -#[derive(Clone)] -pub struct StorageManager { - inner: Arc, - zone_bundler: ZoneBundler, -} - -impl StorageManager { - /// Creates a new [`StorageManager`] which should manage local storage. 
- pub async fn new(log: &Logger, key_requester: StorageKeyRequester) -> Self { - let log = log.new(o!("component" => "StorageManager")); - let resources = StorageResources { - disks: Arc::new(Mutex::new(HashMap::new())), - pools: Arc::new(Mutex::new(HashMap::new())), - }; - let (tx, rx) = mpsc::channel(30); - - let zb_log = log.new(o!("component" => "ZoneBundler")); - let zone_bundler = - ZoneBundler::new(zb_log, resources.clone(), Default::default()); - - StorageManager { - inner: Arc::new(StorageManagerInner { - log: log.clone(), - resources: resources.clone(), - tx, - task: tokio::task::spawn(async move { - let dump_setup = Arc::new(DumpSetup::new(&log)); - let mut worker = StorageWorker { - log, - nexus_notifications: FuturesOrdered::new(), - rx, - underlay: Arc::new(Mutex::new(None)), - key_requester, - dump_setup, - }; - - worker.do_work(resources).await - }), - }), - zone_bundler, - } - } - - /// Return a reference to the object used to manage zone bundles. - /// - /// This can be cloned by other code wishing to create and manage their own - /// zone bundles. - pub fn zone_bundler(&self) -> &ZoneBundler { - &self.zone_bundler - } - - /// Ensures that the storage manager tracks exactly the provided disks. - /// - /// This acts similar to a batch [Self::upsert_disk] for all new disks, and - /// [Self::delete_disk] for all removed disks. - /// - /// If errors occur, an arbitrary "one" of them will be returned, but a - /// best-effort attempt to add all disks will still be attempted. - // Receiver implemented by [StorageWorker::ensure_using_exactly_these_disks] - pub async fn ensure_using_exactly_these_disks(&self, unparsed_disks: I) - where - I: IntoIterator, - { - self.inner - .tx - .send(StorageWorkerRequest::DisksChanged( - unparsed_disks.into_iter().collect::>(), - )) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send DisksChanged request"); - } - - /// Adds a disk and associated zpool to the storage manager. 
- // Receiver implemented by [StorageWorker::upsert_disk]. - pub async fn upsert_disk(&self, disk: UnparsedDisk) { - info!(self.inner.log, "Upserting disk: {disk:?}"); - self.inner - .tx - .send(StorageWorkerRequest::AddDisk(disk)) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send AddDisk request"); - } - - /// Removes a disk, if it's tracked by the storage manager, as well - /// as any associated zpools. - // Receiver implemented by [StorageWorker::delete_disk]. - pub async fn delete_disk(&self, disk: UnparsedDisk) { - info!(self.inner.log, "Deleting disk: {disk:?}"); - self.inner - .tx - .send(StorageWorkerRequest::RemoveDisk(disk)) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send RemoveDisk request"); - } - - /// Adds a synthetic zpool to the storage manager. - // Receiver implemented by [StorageWorker::upsert_synthetic_disk]. - pub async fn upsert_synthetic_disk(&self, name: ZpoolName) { - self.inner - .tx - .send(StorageWorkerRequest::AddSyntheticDisk(name)) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send AddSyntheticDisk request"); - } - - /// Adds underlay access to the storage manager. 
- pub async fn setup_underlay_access( - &self, - underlay: UnderlayAccess, - ) -> Result<(), Error> { - let (tx, rx) = oneshot::channel(); - self.inner - .tx - .send(StorageWorkerRequest::SetupUnderlayAccess(UnderlayRequest { - underlay, - responder: tx, - })) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send SetupUnderlayAccess request"); - rx.await.expect("Failed to await underlay setup") - } - - pub async fn get_zpools(&self) -> Result, Error> { - let disks = self.inner.resources.disks.lock().await; - let pools = self.inner.resources.pools.lock().await; - - let mut zpools = Vec::with_capacity(pools.len()); - - for (id, pool) in pools.iter() { - let disk_identity = &pool.parent; - let disk_type = if let Some(disk) = disks.get(&disk_identity) { - disk.variant().into() - } else { - // If the zpool claims to be attached to a disk that we - // don't know about, that's an error. - return Err(Error::ZpoolNotFound( - format!("zpool: {id} claims to be from unknown disk: {disk_identity:#?}") - )); - }; - zpools.push(crate::params::Zpool { id: *id, disk_type }); - } - - Ok(zpools) - } - - pub async fn upsert_filesystem( - &self, - dataset_id: Uuid, - dataset_name: DatasetName, - ) -> Result { - let (tx, rx) = oneshot::channel(); - let request = - NewFilesystemRequest { dataset_id, dataset_name, responder: tx }; - - self.inner - .tx - .send(StorageWorkerRequest::NewFilesystem(request)) - .await - .map_err(|e| e.to_string()) - .expect("Storage worker bug (not alive)"); - let dataset_name = rx.await.expect( - "Storage worker bug (dropped responder without responding)", - )?; - - Ok(dataset_name) - } - - /// Inform the storage worker that the KeyManager is capable of retrieving - /// secrets now and that any queued disks can be upserted. 
- pub async fn key_manager_ready(&self) { - info!(self.inner.log, "KeyManger ready"); - self.inner - .tx - .send(StorageWorkerRequest::KeyManagerReady) - .await - .map_err(|e| e.to_string()) - .expect("Failed to send KeyManagerReady request"); - } - - pub fn resources(&self) -> &StorageResources { - &self.inner.resources - } -} - -impl Drop for StorageManagerInner { - fn drop(&mut self) { - // NOTE: Ideally, with async drop, we'd await completion of the worker - // somehow. - // - // Without that option, we instead opt to simply cancel the worker - // task to ensure it does not remain alive beyond the StorageManager - // itself. - self.task.abort(); - } -} diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs new file mode 100644 index 0000000000..f552fdfd86 --- /dev/null +++ b/sled-agent/src/storage_monitor.rs @@ -0,0 +1,373 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A task that listens for storage events from [`sled_storage::manager::StorageManager`] +//! and dispatches them to other parts of the bootstrap agent and sled agent +//! code.
+ +use crate::dump_setup::DumpSetup; +use crate::nexus::NexusClientWithResolver; +use derive_more::From; +use futures::stream::FuturesOrdered; +use futures::FutureExt; +use futures::StreamExt; +use nexus_client::types::PhysicalDiskDeleteRequest; +use nexus_client::types::PhysicalDiskPutRequest; +use nexus_client::types::ZpoolPutRequest; +use omicron_common::api::external::ByteCount; +use omicron_common::backoff; +use omicron_common::disk::DiskIdentity; +use sled_storage::manager::StorageHandle; +use sled_storage::pool::Pool; +use sled_storage::resources::StorageResources; +use slog::Logger; +use std::fmt::Debug; +use std::pin::Pin; +use tokio::sync::oneshot; +use uuid::Uuid; + +#[derive(From, Clone, Debug)] +enum NexusDiskRequest { + Put(PhysicalDiskPutRequest), + Delete(PhysicalDiskDeleteRequest), +} + +/// Describes the access to the underlay used by the StorageManager. +#[derive(Clone)] +pub struct UnderlayAccess { + pub nexus_client: NexusClientWithResolver, + pub sled_id: Uuid, +} + +impl Debug for UnderlayAccess { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("UnderlayAccess") + .field("sled_id", &self.sled_id) + .finish() + } +} + +pub struct StorageMonitor { + log: Logger, + storage_manager: StorageHandle, + + // Receive a onetime notification that the underlay is available + underlay_available_rx: oneshot::Receiver<UnderlayAccess>, + + // A cached copy of the `StorageResources` from the last update + storage_resources: StorageResources, + + // Ability to access the underlay network + underlay: Option<UnderlayAccess>, + + // A queue for sending nexus notifications in order + nexus_notifications: FuturesOrdered<NotifyFut>, + + // Invokes dumpadm(8) and savecore(8) when new disks are encountered + dump_setup: DumpSetup, +} + +impl StorageMonitor { + pub fn new( + log: &Logger, + storage_manager: StorageHandle, + ) -> (StorageMonitor, oneshot::Sender<UnderlayAccess>) { + let (underlay_available_tx, underlay_available_rx) = oneshot::channel(); + let storage_resources =
StorageResources::default(); + let dump_setup = DumpSetup::new(&log); + let log = log.new(o!("component" => "StorageMonitor")); + ( + StorageMonitor { + log, + storage_manager, + underlay_available_rx, + storage_resources, + underlay: None, + nexus_notifications: FuturesOrdered::new(), + dump_setup, + }, + underlay_available_tx, + ) + } + + /// Run the main receive loop of the `StorageMonitor` + /// + /// This should be spawned into a tokio task + pub async fn run(mut self) { + loop { + tokio::select! { + res = self.nexus_notifications.next(), + if !self.nexus_notifications.is_empty() => + { + match res { + Some(Ok(s)) => { + info!(self.log, "Nexus notification complete: {s}"); + } + e => error!(self.log, "Nexus notification error: {e:?}") + } + } + resources = self.storage_manager.wait_for_changes() => { + info!( + self.log, + "Received storage manager update"; + "resources" => ?resources + ); + self.handle_resource_update(resources).await; + } + Ok(underlay) = &mut self.underlay_available_rx, + if self.underlay.is_none() => + { + let sled_id = underlay.sled_id; + info!( + self.log, + "Underlay Available"; "sled_id" => %sled_id + ); + self.underlay = Some(underlay); + self.notify_nexus_about_existing_resources(sled_id).await; + } + } + } + } + + /// When the underlay becomes available, we need to notify nexus about any + /// discovered disks and pools, since we don't attempt to notify until there + /// is an underlay available. 
+ async fn notify_nexus_about_existing_resources(&mut self, sled_id: Uuid) { + let current = StorageResources::default(); + let updated = &self.storage_resources; + let nexus_updates = + compute_resource_diffs(&self.log, &sled_id, &current, updated); + for put in nexus_updates.disk_puts { + self.physical_disk_notify(put.into()).await; + } + for (pool, put) in nexus_updates.zpool_puts { + self.add_zpool_notify(pool, put).await; + } + } + + async fn handle_resource_update( + &mut self, + updated_resources: StorageResources, + ) { + // If the underlay isn't available, we only record the changes. Nexus + // isn't yet reachable to notify. + if self.underlay.is_some() { + let nexus_updates = compute_resource_diffs( + &self.log, + &self.underlay.as_ref().unwrap().sled_id, + &self.storage_resources, + &updated_resources, + ); + + for put in nexus_updates.disk_puts { + self.physical_disk_notify(put.into()).await; + } + for del in nexus_updates.disk_deletes { + self.physical_disk_notify(del.into()).await; + } + for (pool, put) in nexus_updates.zpool_puts { + self.add_zpool_notify(pool, put).await; + } + } + self.dump_setup.update_dumpdev_setup(updated_resources.disks()).await; + + // Save the updated `StorageResources` + self.storage_resources = updated_resources; + } + + // Adds a "notification to nexus" to `self.nexus_notifications`, informing it + // about the addition/removal of a physical disk to this sled.
+ async fn physical_disk_notify(&mut self, disk: NexusDiskRequest) { + let underlay = self.underlay.as_ref().unwrap().clone(); + let disk2 = disk.clone(); + let notify_nexus = move || { + let underlay = underlay.clone(); + let disk = disk.clone(); + async move { + let nexus_client = underlay.nexus_client.client().clone(); + + match &disk { + NexusDiskRequest::Put(request) => { + nexus_client + .physical_disk_put(&request) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?; + } + NexusDiskRequest::Delete(request) => { + nexus_client + .physical_disk_delete(&request) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?; + } + } + let msg = format!("{:?}", disk); + Ok(msg) + } + }; + + let log = self.log.clone(); + // This notification is often invoked before Nexus has started + // running, so avoid flagging any errors as concerning until some + // time has passed. + let log_post_failure = move |err, call_count, total_duration| { + if call_count == 0 { + info!(log, "failed to notify nexus about {disk2:?}"; + "err" => ?err + ); + } else if total_duration > std::time::Duration::from_secs(30) { + warn!(log, "failed to notify nexus about {disk2:?}"; + "err" => ?err, + "total duration" => ?total_duration); + } + }; + self.nexus_notifications.push_back( + backoff::retry_notify_ext( + backoff::retry_policy_internal_service_aggressive(), + notify_nexus, + log_post_failure, + ) + .boxed(), + ); + } + + // Adds a "notification to nexus" to `nexus_notifications`, + // informing it about the addition of `pool_id` to this sled. 
+ async fn add_zpool_notify( + &mut self, + pool: Pool, + zpool_request: ZpoolPutRequest, + ) { + let pool_id = pool.name.id(); + let underlay = self.underlay.as_ref().unwrap().clone(); + + let notify_nexus = move || { + let underlay = underlay.clone(); + let zpool_request = zpool_request.clone(); + async move { + let sled_id = underlay.sled_id; + let nexus_client = underlay.nexus_client.client().clone(); + nexus_client + .zpool_put(&sled_id, &pool_id, &zpool_request) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?; + let msg = format!("{:?}", zpool_request); + Ok(msg) + } + }; + + let log = self.log.clone(); + let name = pool.name.clone(); + let disk = pool.parent.clone(); + let log_post_failure = move |err, call_count, total_duration| { + if call_count == 0 { + info!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; + "err" => ?err); + } else if total_duration > std::time::Duration::from_secs(30) { + warn!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; + "err" => ?err, + "total duration" => ?total_duration); + } + }; + self.nexus_notifications.push_back( + backoff::retry_notify_ext( + backoff::retry_policy_internal_service_aggressive(), + notify_nexus, + log_post_failure, + ) + .boxed(), + ); + } +} + +// The type of a future which is used to send a notification to Nexus. 
+type NotifyFut = + Pin<Box<dyn futures::Future<Output = Result<String, String>> + Send>>; + +struct NexusUpdates { + disk_puts: Vec<PhysicalDiskPutRequest>, + disk_deletes: Vec<PhysicalDiskDeleteRequest>, + zpool_puts: Vec<(Pool, ZpoolPutRequest)>, +} + +fn compute_resource_diffs( + log: &Logger, + sled_id: &Uuid, + current: &StorageResources, + updated: &StorageResources, +) -> NexusUpdates { + let mut disk_puts = vec![]; + let mut disk_deletes = vec![]; + let mut zpool_puts = vec![]; + + let mut put_pool = |disk_id: &DiskIdentity, updated_pool: &Pool| { + match ByteCount::try_from(updated_pool.info.size()) { + Ok(size) => zpool_puts.push(( + updated_pool.clone(), + ZpoolPutRequest { + size: size.into(), + disk_model: disk_id.model.clone(), + disk_serial: disk_id.serial.clone(), + disk_vendor: disk_id.vendor.clone(), + }, + )), + Err(err) => { + error!( + log, + "Error parsing pool size"; + "name" => updated_pool.name.to_string(), + "err" => ?err); + } + } + }; + + // Diff the existing resources with the update to see what has changed + // This loop finds disks and pools that were modified or deleted + for (disk_id, (disk, pool)) in current.disks().iter() { + match updated.disks().get(disk_id) { + Some((updated_disk, updated_pool)) => { + if disk != updated_disk { + disk_puts.push(PhysicalDiskPutRequest { + sled_id: *sled_id, + model: disk_id.model.clone(), + serial: disk_id.serial.clone(), + vendor: disk_id.vendor.clone(), + variant: updated_disk.variant().into(), + }); + } + if pool != updated_pool { + put_pool(disk_id, updated_pool); + } + } + None => disk_deletes.push(PhysicalDiskDeleteRequest { + model: disk_id.model.clone(), + serial: disk_id.serial.clone(), + vendor: disk_id.vendor.clone(), + sled_id: *sled_id, + }), + } + } + + // Diff the existing resources with the update to see what has changed + // This loop finds new disks and pools + for (disk_id, (updated_disk, updated_pool)) in updated.disks().iter() { + if !current.disks().contains_key(disk_id) { + disk_puts.push(PhysicalDiskPutRequest { + sled_id: *sled_id, + model: disk_id.model.clone(), + serial:
disk_id.serial.clone(), + vendor: disk_id.vendor.clone(), + variant: updated_disk.variant().into(), + }); + put_pool(disk_id, updated_pool); + } + } + + NexusUpdates { disk_puts, disk_deletes, zpool_puts } +} diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 91cb850df4..70b9da7708 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -6,7 +6,6 @@ //! Tools for collecting and inspecting service bundles for zones. -use crate::storage_manager::StorageResources; use anyhow::anyhow; use anyhow::Context; use camino::FromPathBufError; @@ -33,6 +32,8 @@ use illumos_utils::zone::AdmError; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; +use sled_storage::dataset::U2_DEBUG_DATASET; +use sled_storage::manager::StorageHandle; use slog::Logger; use std::cmp::Ord; use std::cmp::Ordering; @@ -221,20 +222,12 @@ pub struct ZoneBundler { inner: Arc>, // Channel for notifying the cleanup task that it should reevaluate. notify_cleanup: Arc, - // Tokio task handle running the period cleanup operation. - cleanup_task: Arc>, -} - -impl Drop for ZoneBundler { - fn drop(&mut self) { - self.cleanup_task.abort(); - } } // State shared between tasks, e.g., used when creating a bundle in different // tasks or between a creation and cleanup. struct Inner { - resources: StorageResources, + storage_handle: StorageHandle, cleanup_context: CleanupContext, last_cleanup_at: Instant, } @@ -262,7 +255,8 @@ impl Inner { // that can exist but do not, i.e., those whose parent datasets already // exist; and returns those. 
async fn bundle_directories(&self) -> Vec { - let expected = self.resources.all_zone_bundle_directories().await; + let resources = self.storage_handle.get_latest_resources().await; + let expected = resources.all_zone_bundle_directories(); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { if tokio::fs::create_dir_all(&each).await.is_ok() { @@ -322,11 +316,11 @@ impl ZoneBundler { /// Create a new zone bundler. /// /// This creates an object that manages zone bundles on the system. It can - /// be used to create bundles from running zones, and runs a period task to - /// clean them up to free up space. + /// be used to create bundles from running zones, and runs a periodic task + /// to clean them up to free up space. pub fn new( log: Logger, - resources: StorageResources, + storage_handle: StorageHandle, cleanup_context: CleanupContext, ) -> Self { // This is compiled out in tests because there's no way to set our @@ -336,17 +330,19 @@ impl ZoneBundler { .expect("Failed to initialize existing ZFS resources"); let notify_cleanup = Arc::new(Notify::new()); let inner = Arc::new(Mutex::new(Inner { - resources, + storage_handle, cleanup_context, last_cleanup_at: Instant::now(), })); let cleanup_log = log.new(slog::o!("component" => "auto-cleanup-task")); let notify_clone = notify_cleanup.clone(); let inner_clone = inner.clone(); - let cleanup_task = Arc::new(tokio::task::spawn( - Self::periodic_cleanup(cleanup_log, inner_clone, notify_clone), + tokio::task::spawn(Self::periodic_cleanup( + cleanup_log, + inner_clone, + notify_clone, )); - Self { log, inner, notify_cleanup, cleanup_task } + Self { log, inner, notify_cleanup } } /// Trigger an immediate cleanup of low-priority zone bundles. 
@@ -431,10 +427,9 @@ impl ZoneBundler { ) -> Result { let inner = self.inner.lock().await; let storage_dirs = inner.bundle_directories().await; - let extra_log_dirs = inner - .resources - .all_u2_mountpoints(sled_hardware::disk::U2_DEBUG_DATASET) - .await + let resources = inner.storage_handle.get_latest_resources().await; + let extra_log_dirs = resources + .all_u2_mountpoints(U2_DEBUG_DATASET) .into_iter() .collect(); let context = ZoneBundleContext { cause, storage_dirs, extra_log_dirs }; @@ -2165,7 +2160,6 @@ mod illumos_tests { use super::CleanupPeriod; use super::PriorityOrder; use super::StorageLimit; - use super::StorageResources; use super::Utf8Path; use super::Utf8PathBuf; use super::Uuid; @@ -2178,6 +2172,10 @@ mod illumos_tests { use anyhow::Context; use chrono::TimeZone; use chrono::Utc; + use illumos_utils::zpool::ZpoolName; + use sled_storage::disk::RawDisk; + use sled_storage::disk::SyntheticDisk; + use sled_storage::manager::{FakeStorageManager, StorageHandle}; use slog::Drain; use slog::Logger; use tokio::process::Command; @@ -2219,22 +2217,43 @@ mod illumos_tests { // system, that creates the directories implied by the `StorageResources` // expected disk structure. struct ResourceWrapper { - resources: StorageResources, + storage_handle: StorageHandle, dirs: Vec, } + async fn setup_storage() -> StorageHandle { + let (manager, handle) = FakeStorageManager::new(); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // These must be internal zpools + for _ in 0..2 { + let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let internal_disk: RawDisk = + SyntheticDisk::new(internal_zpool_name.clone()).into(); + handle.upsert_disk(internal_disk).await; + } + handle + } + impl ResourceWrapper { // Create new storage resources, and mount fake datasets at the required // locations. 
async fn new() -> Self { - let resources = StorageResources::new_for_test(); - let dirs = resources.all_zone_bundle_directories().await; + // Spawn the storage related tasks required for testing and insert + // synthetic disks. + let storage_handle = setup_storage().await; + let resources = storage_handle.get_latest_resources().await; + let dirs = resources.all_zone_bundle_directories(); for d in dirs.iter() { let id = d.components().nth(3).unwrap().as_str().parse().unwrap(); create_test_dataset(&id, d).await.unwrap(); } - Self { resources, dirs } + Self { storage_handle, dirs } } } @@ -2261,8 +2280,11 @@ mod illumos_tests { let log = test_logger(); let context = CleanupContext::default(); let resource_wrapper = ResourceWrapper::new().await; - let bundler = - ZoneBundler::new(log, resource_wrapper.resources.clone(), context); + let bundler = ZoneBundler::new( + log, + resource_wrapper.storage_handle.clone(), + context, + ); Ok(CleanupTestContext { resource_wrapper, context, bundler }) } diff --git a/sled-hardware/Cargo.toml b/sled-hardware/Cargo.toml index 14ae15996b..36ba633067 100644 --- a/sled-hardware/Cargo.toml +++ b/sled-hardware/Cargo.toml @@ -11,10 +11,8 @@ camino.workspace = true cfg-if.workspace = true futures.workspace = true illumos-utils.workspace = true -key-manager.workspace = true libc.workspace = true macaddr.workspace = true -nexus-client.workspace = true omicron-common.workspace = true rand.workspace = true schemars.workspace = true diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index e3078cbeea..44658658be 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -4,34 +4,14 @@ use camino::{Utf8Path, Utf8PathBuf}; use illumos_utils::fstyp::Fstyp; -use illumos_utils::zfs; -use illumos_utils::zfs::DestroyDatasetErrorVariant; -use illumos_utils::zfs::EncryptionDetails; -use illumos_utils::zfs::Keypath; -use illumos_utils::zfs::Mountpoint; -use illumos_utils::zfs::SizeDetails; -use illumos_utils::zfs::Zfs; use 
illumos_utils::zpool::Zpool; use illumos_utils::zpool::ZpoolKind; use illumos_utils::zpool::ZpoolName; -use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; -use rand::distributions::{Alphanumeric, DistString}; use slog::Logger; use slog::{info, warn}; -use std::sync::OnceLock; -use tokio::fs::{remove_file, File}; -use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom}; use uuid::Uuid; -/// This path is intentionally on a `tmpfs` to prevent copy-on-write behavior -/// and to ensure it goes away on power off. -/// -/// We want minimize the time the key files are in memory, and so we rederive -/// the keys and recreate the files on demand when creating and mounting -/// encrypted filesystems. We then zero them and unlink them. -pub const KEYPATH_ROOT: &str = "/var/run/oxide/"; - cfg_if::cfg_if! { if #[cfg(target_os = "illumos")] { use crate::illumos::*; @@ -41,7 +21,7 @@ cfg_if::cfg_if! { } #[derive(Debug, thiserror::Error)] -pub enum DiskError { +pub enum PooledDiskError { #[error("Cannot open {path} due to {error}")] IoError { path: Utf8PathBuf, error: std::io::Error }, #[error("Failed to open partition at {path} due to {error}")] @@ -51,10 +31,6 @@ pub enum DiskError { #[error("Requested partition {partition:?} not found on device {path}")] NotFound { path: Utf8PathBuf, partition: Partition }, #[error(transparent)] - DestroyFilesystem(#[from] illumos_utils::zfs::DestroyDatasetError), - #[error(transparent)] - EnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), - #[error(transparent)] ZpoolCreate(#[from] illumos_utils::zpool::CreateError), #[error("Cannot import zpool: {0}")] ZpoolImport(illumos_utils::zpool::Error), @@ -62,18 +38,6 @@ pub enum DiskError { CannotFormatMissingDevPath { path: Utf8PathBuf }, #[error("Formatting M.2 devices is not yet implemented")] CannotFormatM2NotImplemented, - #[error("KeyManager error: {0}")] - KeyManager(#[from] key_manager::Error), - #[error("Missing StorageKeyRequester when creating U.2 
disk")] - MissingStorageKeyRequester, - #[error("Encrypted filesystem '{0}' missing 'oxide:epoch' property")] - CannotParseEpochProperty(String), - #[error("Encrypted dataset '{dataset}' cannot set 'oxide:agent' property: {err}")] - CannotSetAgentProperty { - dataset: String, - #[source] - err: Box, - }, } /// A partition (or 'slice') of a disk. @@ -126,17 +90,17 @@ impl DiskPaths { } // Finds the first 'variant' partition, and returns the path to it. - fn partition_device_path( + pub fn partition_device_path( &self, partitions: &[Partition], expected_partition: Partition, raw: bool, - ) -> Result { + ) -> Result { for (index, partition) in partitions.iter().enumerate() { if &expected_partition == partition { let path = self.partition_path(index, raw).ok_or_else(|| { - DiskError::NotFound { + PooledDiskError::NotFound { path: self.devfs_path.clone(), partition: expected_partition, } @@ -144,7 +108,7 @@ impl DiskPaths { return Ok(path); } } - Err(DiskError::NotFound { + Err(PooledDiskError::NotFound { path: self.devfs_path.clone(), partition: expected_partition, }) @@ -154,9 +118,9 @@ impl DiskPaths { /// A disk which has been observed by monitoring hardware. /// /// No guarantees are made about the partitions which exist within this disk. -/// This exists as a distinct entity from [Disk] because it may be desirable to -/// monitor for hardware in one context, and conform disks to partition layouts -/// in a different context. +/// This exists as a distinct entity from `Disk` in `sled-storage` because it +/// may be desirable to monitor for hardware in one context, and conform disks +/// to partition layouts in a different context. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct UnparsedDisk { paths: DiskPaths, @@ -202,127 +166,34 @@ impl UnparsedDisk { } } -/// A physical disk conforming to the expected partition layout. 
+/// A physical disk that is partitioned to contain exactly one zpool +/// +/// A PooledDisk relies on hardware specific information to be constructed +/// and is the highest level disk structure in the `sled-hardware` package. +/// The `sled-storage` package contains `Disk`s whose zpool and datasets can be +/// manipulated. This separation exists to remove the hardware dependent logic +/// from the ZFS related logic which can also operate on file backed zpools. +/// Doing things this way allows us to not put higher level concepts like +/// storage keys into this hardware related package. #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct Disk { - paths: DiskPaths, - slot: i64, - variant: DiskVariant, - identity: DiskIdentity, - is_boot_disk: bool, - partitions: Vec, - +pub struct PooledDisk { + pub paths: DiskPaths, + pub slot: i64, + pub variant: DiskVariant, + pub identity: DiskIdentity, + pub is_boot_disk: bool, + pub partitions: Vec, // This embeds the assumtion that there is exactly one parsed zpool per // disk. - zpool_name: ZpoolName, -} - -// Helper type for describing expected datasets and their optional quota. 
-#[derive(Clone, Copy, Debug)] -struct ExpectedDataset { - // Name for the dataset - name: &'static str, - // Optional quota, in _bytes_ - quota: Option, - // Identifies if the dataset should be deleted on boot - wipe: bool, - // Optional compression mode - compression: Option<&'static str>, + pub zpool_name: ZpoolName, } -impl ExpectedDataset { - const fn new(name: &'static str) -> Self { - ExpectedDataset { name, quota: None, wipe: false, compression: None } - } - - const fn quota(mut self, quota: usize) -> Self { - self.quota = Some(quota); - self - } - - const fn wipe(mut self) -> Self { - self.wipe = true; - self - } - - const fn compression(mut self, compression: &'static str) -> Self { - self.compression = Some(compression); - self - } -} - -pub const INSTALL_DATASET: &'static str = "install"; -pub const CRASH_DATASET: &'static str = "crash"; -pub const CLUSTER_DATASET: &'static str = "cluster"; -pub const CONFIG_DATASET: &'static str = "config"; -pub const M2_DEBUG_DATASET: &'static str = "debug"; -pub const M2_BACKING_DATASET: &'static str = "backing"; -// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be -// tuned as needed. -pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); -// ditto. -pub const DUMP_DATASET_QUOTA: usize = 100 * (1 << 30); -// passed to zfs create -o compression= -pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9"; - -// U.2 datasets live under the encrypted dataset and inherit encryption -pub const ZONE_DATASET: &'static str = "crypt/zone"; -pub const DUMP_DATASET: &'static str = "crypt/debug"; -pub const U2_DEBUG_DATASET: &'static str = "crypt/debug"; - -// This is the root dataset for all U.2 drives. Encryption is inherited. 
-pub const CRYPT_DATASET: &'static str = "crypt"; - -const U2_EXPECTED_DATASET_COUNT: usize = 2; -static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ - // Stores filesystems for zones - ExpectedDataset::new(ZONE_DATASET).wipe(), - // For storing full kernel RAM dumps - ExpectedDataset::new(DUMP_DATASET) - .quota(DUMP_DATASET_QUOTA) - .compression(DUMP_DATASET_COMPRESSION), -]; - -const M2_EXPECTED_DATASET_COUNT: usize = 6; -static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ - // Stores software images. - // - // Should be duplicated to both M.2s. - ExpectedDataset::new(INSTALL_DATASET), - // Stores crash dumps. - ExpectedDataset::new(CRASH_DATASET), - // Backing store for OS data that should be persisted across reboots. - // Its children are selectively overlay mounted onto parts of the ramdisk - // root. - ExpectedDataset::new(M2_BACKING_DATASET), - // Stores cluster configuration information. - // - // Should be duplicated to both M.2s. - ExpectedDataset::new(CLUSTER_DATASET), - // Stores configuration data, including: - // - What services should be launched on this sled - // - Information about how to initialize the Sled Agent - // - (For scrimlets) RSS setup information - // - // Should be duplicated to both M.2s. - ExpectedDataset::new(CONFIG_DATASET), - // Store debugging data, such as service bundles. - ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA), -]; - -impl Disk { - /// Create a new Disk - /// - /// WARNING: In all cases where a U.2 is a possible `DiskVariant`, a - /// `StorageKeyRequester` must be passed so that disk encryption can - /// be used. The `StorageManager` for the sled-agent always has a - /// `StorageKeyRequester` available, and so the only place we should pass - /// `None` is for the M.2s touched by the Installinator. 
- pub async fn new( +impl PooledDisk { + /// Create a new PooledDisk + pub fn new( log: &Logger, unparsed_disk: UnparsedDisk, - key_requester: Option<&StorageKeyRequester>, - ) -> Result { + ) -> Result { let paths = &unparsed_disk.paths; let variant = unparsed_disk.variant; // Ensure the GPT has the right format. This does not necessarily @@ -340,13 +211,8 @@ impl Disk { )?; let zpool_name = Self::ensure_zpool_exists(log, variant, &zpool_path)?; - Self::ensure_zpool_ready( - log, - &zpool_name, - &unparsed_disk.identity, - key_requester, - ) - .await?; + Self::ensure_zpool_imported(log, &zpool_name)?; + Self::ensure_zpool_failmode_is_continue(log, &zpool_name)?; Ok(Self { paths: unparsed_disk.paths, @@ -359,29 +225,11 @@ impl Disk { }) } - pub async fn ensure_zpool_ready( - log: &Logger, - zpool_name: &ZpoolName, - disk_identity: &DiskIdentity, - key_requester: Option<&StorageKeyRequester>, - ) -> Result<(), DiskError> { - Self::ensure_zpool_imported(log, &zpool_name)?; - Self::ensure_zpool_failmode_is_continue(log, &zpool_name)?; - Self::ensure_zpool_has_datasets( - log, - &zpool_name, - disk_identity, - key_requester, - ) - .await?; - Ok(()) - } - fn ensure_zpool_exists( log: &Logger, variant: DiskVariant, zpool_path: &Utf8Path, - ) -> Result { + ) -> Result { let zpool_name = match Fstyp::get_zpool(&zpool_path) { Ok(zpool_name) => zpool_name, Err(_) => { @@ -406,13 +254,13 @@ impl Disk { DiskVariant::M2 => ZpoolName::new_internal(Uuid::new_v4()), DiskVariant::U2 => ZpoolName::new_external(Uuid::new_v4()), }; - Zpool::create(zpool_name.clone(), &zpool_path)?; + Zpool::create(&zpool_name, &zpool_path)?; zpool_name } }; - Zpool::import(zpool_name.clone()).map_err(|e| { + Zpool::import(&zpool_name).map_err(|e| { warn!(log, "Failed to import zpool {zpool_name}: {e}"); - DiskError::ZpoolImport(e) + PooledDiskError::ZpoolImport(e) })?; Ok(zpool_name) @@ -421,10 +269,10 @@ impl Disk { fn ensure_zpool_imported( log: &Logger, zpool_name: &ZpoolName, - ) -> Result<(), 
DiskError> { - Zpool::import(zpool_name.clone()).map_err(|e| { + ) -> Result<(), PooledDiskError> { + Zpool::import(&zpool_name).map_err(|e| { warn!(log, "Failed to import zpool {zpool_name}: {e}"); - DiskError::ZpoolImport(e) + PooledDiskError::ZpoolImport(e) })?; Ok(()) } @@ -432,7 +280,7 @@ impl Disk { fn ensure_zpool_failmode_is_continue( log: &Logger, zpool_name: &ZpoolName, - ) -> Result<(), DiskError> { + ) -> Result<(), PooledDiskError> { // Ensure failmode is set to `continue`. See // https://github.com/oxidecomputer/omicron/issues/2766 for details. The // short version is, each pool is only backed by one vdev. There is no @@ -445,214 +293,10 @@ impl Disk { log, "Failed to set failmode=continue on zpool {zpool_name}: {e}" ); - DiskError::ZpoolImport(e) + PooledDiskError::ZpoolImport(e) })?; Ok(()) } - - // Ensure that the zpool contains all the datasets we would like it to - // contain. - async fn ensure_zpool_has_datasets( - log: &Logger, - zpool_name: &ZpoolName, - disk_identity: &DiskIdentity, - key_requester: Option<&StorageKeyRequester>, - ) -> Result<(), DiskError> { - let (root, datasets) = match zpool_name.kind().into() { - DiskVariant::M2 => (None, M2_EXPECTED_DATASETS.iter()), - DiskVariant::U2 => { - (Some(CRYPT_DATASET), U2_EXPECTED_DATASETS.iter()) - } - }; - - let zoned = false; - let do_format = true; - - // Ensure the root encrypted filesystem exists - // Datasets below this in the hierarchy will inherit encryption - if let Some(dataset) = root { - let Some(key_requester) = key_requester else { - return Err(DiskError::MissingStorageKeyRequester); - }; - let mountpoint = zpool_name.dataset_mountpoint(dataset); - let keypath: Keypath = disk_identity.into(); - - let epoch = - if let Ok(epoch_str) = Zfs::get_oxide_value(dataset, "epoch") { - if let Ok(epoch) = epoch_str.parse::() { - epoch - } else { - return Err(DiskError::CannotParseEpochProperty( - dataset.to_string(), - )); - } - } else { - // We got an error trying to call 
`Zfs::get_oxide_value` - // which indicates that the dataset doesn't exist or there - // was a problem running the command. - // - // Note that `Zfs::get_oxide_value` will succeed even if - // the epoch is missing. `epoch_str` will show up as a dash - // (`-`) and will not parse into a `u64`. So we don't have - // to worry about that case here as it is handled above. - // - // If the error indicated that the command failed for some - // other reason, but the dataset actually existed, we will - // try to create the dataset below and that will fail. So - // there is no harm in just loading the latest secret here. - key_requester.load_latest_secret().await? - }; - - let key = - key_requester.get_key(epoch, disk_identity.clone()).await?; - - let mut keyfile = - KeyFile::create(keypath.clone(), key.expose_secret(), log) - .await - .map_err(|error| DiskError::IoError { - path: keypath.0.clone(), - error, - })?; - - let encryption_details = EncryptionDetails { keypath, epoch }; - - info!( - log, - "Ensuring encrypted filesystem: {} for epoch {}", - dataset, - epoch - ); - let result = Zfs::ensure_filesystem( - &format!("{}/{}", zpool_name, dataset), - Mountpoint::Path(mountpoint), - zoned, - do_format, - Some(encryption_details), - None, - None, - ); - - keyfile.zero_and_unlink().await.map_err(|error| { - DiskError::IoError { path: keyfile.path().0.clone(), error } - })?; - - result?; - }; - - for dataset in datasets.into_iter() { - let mountpoint = zpool_name.dataset_mountpoint(dataset.name); - let name = &format!("{}/{}", zpool_name, dataset.name); - - // Use a value that's alive for the duration of this sled agent - // to answer the question: should we wipe this disk, or have - // we seen it before? - // - // If this value comes from a prior iteration of the sled agent, - // we opt to remove the corresponding dataset. 
- static AGENT_LOCAL_VALUE: OnceLock = OnceLock::new(); - let agent_local_value = AGENT_LOCAL_VALUE.get_or_init(|| { - Alphanumeric.sample_string(&mut rand::thread_rng(), 20) - }); - - if dataset.wipe { - match Zfs::get_oxide_value(name, "agent") { - Ok(v) if &v == agent_local_value => { - info!( - log, - "Skipping automatic wipe for dataset: {}", name - ); - } - Ok(_) | Err(_) => { - info!( - log, - "Automatically destroying dataset: {}", name - ); - Zfs::destroy_dataset(name).or_else(|err| { - // If we can't find the dataset, that's fine -- it - // might not have been formatted yet. - if let DestroyDatasetErrorVariant::NotFound = - err.err - { - Ok(()) - } else { - Err(err) - } - })?; - } - } - } - - let encryption_details = None; - let size_details = Some(SizeDetails { - quota: dataset.quota, - compression: dataset.compression, - }); - Zfs::ensure_filesystem( - name, - Mountpoint::Path(mountpoint), - zoned, - do_format, - encryption_details, - size_details, - None, - )?; - - if dataset.wipe { - Zfs::set_oxide_value(name, "agent", agent_local_value) - .map_err(|err| DiskError::CannotSetAgentProperty { - dataset: name.clone(), - err: Box::new(err), - })?; - } - } - Ok(()) - } - - pub fn is_boot_disk(&self) -> bool { - self.is_boot_disk - } - - pub fn identity(&self) -> &DiskIdentity { - &self.identity - } - - pub fn variant(&self) -> DiskVariant { - self.variant - } - - pub fn devfs_path(&self) -> &Utf8PathBuf { - &self.paths.devfs_path - } - - pub fn zpool_name(&self) -> &ZpoolName { - &self.zpool_name - } - - pub fn boot_image_devfs_path( - &self, - raw: bool, - ) -> Result { - self.paths.partition_device_path( - &self.partitions, - Partition::BootImage, - raw, - ) - } - - pub fn dump_device_devfs_path( - &self, - raw: bool, - ) -> Result { - self.paths.partition_device_path( - &self.partitions, - Partition::DumpDevice, - raw, - ) - } - - pub fn slot(&self) -> i64 { - self.slot - } } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -671,56 +315,6 @@ impl 
From for DiskVariant { } } -/// A file that wraps a zfs encryption key. -/// -/// We put this in a RAM backed filesystem and zero and delete it when we are -/// done with it. Unfortunately we cannot do this inside `Drop` because there is no -/// equivalent async drop. -pub struct KeyFile { - path: Keypath, - file: File, - log: Logger, -} - -impl KeyFile { - pub async fn create( - path: Keypath, - key: &[u8; 32], - log: &Logger, - ) -> std::io::Result { - // TODO: fix this to not truncate - // We want to overwrite any existing contents. - // If we truncate we may leave dirty pages around - // containing secrets. - let mut file = tokio::fs::OpenOptions::new() - .create(true) - .write(true) - .open(&path.0) - .await?; - file.write_all(key).await?; - info!(log, "Created keyfile {}", path); - Ok(KeyFile { path, file, log: log.clone() }) - } - - /// These keyfiles live on a tmpfs and we zero the file so the data doesn't - /// linger on the page in memory. - /// - /// It'd be nice to `impl Drop for `KeyFile` and then call `zero` - /// from within the drop handler, but async `Drop` isn't supported. - pub async fn zero_and_unlink(&mut self) -> std::io::Result<()> { - let zeroes = [0u8; 32]; - let _ = self.file.seek(SeekFrom::Start(0)).await?; - self.file.write_all(&zeroes).await?; - info!(self.log, "Zeroed and unlinked keyfile {}", self.path); - remove_file(&self.path().0).await?; - Ok(()) - } - - pub fn path(&self) -> &Keypath { - &self.path - } -} - #[cfg(test)] mod test { use super::*; @@ -832,7 +426,7 @@ mod test { paths .partition_device_path(&[], Partition::ZfsPool, false) .expect_err("Should not have found partition"), - DiskError::NotFound { .. }, + PooledDiskError::NotFound { .. 
}, )); } } diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs index c0145b75e8..19111c6cda 100644 --- a/sled-hardware/src/illumos/mod.rs +++ b/sled-hardware/src/illumos/mod.rs @@ -19,7 +19,6 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::sync::Mutex; use tokio::sync::broadcast; -use tokio::task::JoinHandle; use uuid::Uuid; mod gpt; @@ -589,11 +588,11 @@ async fn hardware_tracking_task( /// /// This structure provides interfaces for both querying and for receiving new /// events. +#[derive(Clone)] pub struct HardwareManager { log: Logger, inner: Arc>, tx: broadcast::Sender, - _worker: JoinHandle<()>, } impl HardwareManager { @@ -663,11 +662,11 @@ impl HardwareManager { let log2 = log.clone(); let inner2 = inner.clone(); let tx2 = tx.clone(); - let _worker = tokio::task::spawn(async move { + tokio::task::spawn(async move { hardware_tracking_task(log2, inner2, tx2).await }); - Ok(Self { log, inner, tx, _worker }) + Ok(Self { log, inner, tx }) } pub fn baseboard(&self) -> Baseboard { diff --git a/sled-hardware/src/illumos/partitions.rs b/sled-hardware/src/illumos/partitions.rs index 950074bd3a..4b7e69057d 100644 --- a/sled-hardware/src/illumos/partitions.rs +++ b/sled-hardware/src/illumos/partitions.rs @@ -5,7 +5,7 @@ //! illumos-specific mechanisms for parsing disk info. 
use crate::illumos::gpt; -use crate::{DiskError, DiskPaths, DiskVariant, Partition}; +use crate::{DiskPaths, DiskVariant, Partition, PooledDiskError}; use camino::Utf8Path; use illumos_utils::zpool::ZpoolName; use slog::info; @@ -41,9 +41,9 @@ fn parse_partition_types( path: &Utf8Path, partitions: &Vec, expected_partitions: &[Partition; N], -) -> Result, DiskError> { +) -> Result, PooledDiskError> { if partitions.len() != N { - return Err(DiskError::BadPartitionLayout { + return Err(PooledDiskError::BadPartitionLayout { path: path.to_path_buf(), why: format!( "Expected {} partitions, only saw {}", @@ -54,7 +54,7 @@ fn parse_partition_types( } for i in 0..N { if partitions[i].index() != i { - return Err(DiskError::BadPartitionLayout { + return Err(PooledDiskError::BadPartitionLayout { path: path.to_path_buf(), why: format!( "The {i}-th partition has index {}", @@ -80,7 +80,7 @@ pub fn ensure_partition_layout( log: &Logger, paths: &DiskPaths, variant: DiskVariant, -) -> Result, DiskError> { +) -> Result, PooledDiskError> { internal_ensure_partition_layout::(log, paths, variant) } @@ -90,7 +90,7 @@ fn internal_ensure_partition_layout( log: &Logger, paths: &DiskPaths, variant: DiskVariant, -) -> Result, DiskError> { +) -> Result, PooledDiskError> { // Open the "Whole Disk" as a raw device to be parsed by the // libefi-illumos library. This lets us peek at the GPT before // making too many assumptions about it. @@ -114,14 +114,16 @@ fn internal_ensure_partition_layout( let dev_path = if let Some(dev_path) = &paths.dev_path { dev_path } else { - return Err(DiskError::CannotFormatMissingDevPath { path }); + return Err(PooledDiskError::CannotFormatMissingDevPath { + path, + }); }; match variant { DiskVariant::U2 => { info!(log, "Formatting zpool on disk {}", paths.devfs_path); // If a zpool does not already exist, create one. 
let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - Zpool::create(zpool_name, dev_path)?; + Zpool::create(&zpool_name, dev_path)?; return Ok(vec![Partition::ZfsPool]); } DiskVariant::M2 => { @@ -129,12 +131,12 @@ fn internal_ensure_partition_layout( // the expected partitions? Or would it be wiser to infer // that this indicates an unexpected error conditions that // needs mitigation? - return Err(DiskError::CannotFormatM2NotImplemented); + return Err(PooledDiskError::CannotFormatM2NotImplemented); } } } Err(err) => { - return Err(DiskError::Gpt { + return Err(PooledDiskError::Gpt { path, error: anyhow::Error::new(err), }); @@ -197,7 +199,7 @@ mod test { DiskVariant::U2, ); match result { - Err(DiskError::CannotFormatMissingDevPath { .. }) => {} + Err(PooledDiskError::CannotFormatMissingDevPath { .. }) => {} _ => panic!("Should have failed with a missing dev path error"), } @@ -373,7 +375,7 @@ mod test { DiskVariant::M2, ) .expect_err("Should have failed parsing empty GPT"), - DiskError::BadPartitionLayout { .. } + PooledDiskError::BadPartitionLayout { .. } )); logctx.cleanup_successful(); @@ -398,7 +400,7 @@ mod test { DiskVariant::U2, ) .expect_err("Should have failed parsing empty GPT"), - DiskError::BadPartitionLayout { .. } + PooledDiskError::BadPartitionLayout { .. 
} )); logctx.cleanup_successful(); diff --git a/sled-hardware/src/lib.rs b/sled-hardware/src/lib.rs index 654dfd59d9..2e3fd4a576 100644 --- a/sled-hardware/src/lib.rs +++ b/sled-hardware/src/lib.rs @@ -163,13 +163,3 @@ impl std::fmt::Display for Baseboard { } } } - -impl From for nexus_client::types::Baseboard { - fn from(b: Baseboard) -> nexus_client::types::Baseboard { - nexus_client::types::Baseboard { - serial_number: b.identifier().to_string(), - part_number: b.model().to_string(), - revision: b.revision(), - } - } -} diff --git a/sled-hardware/src/non_illumos/mod.rs b/sled-hardware/src/non_illumos/mod.rs index 6e36330df0..d8372dd8aa 100644 --- a/sled-hardware/src/non_illumos/mod.rs +++ b/sled-hardware/src/non_illumos/mod.rs @@ -2,7 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use crate::disk::{DiskError, DiskPaths, DiskVariant, Partition, UnparsedDisk}; +use crate::disk::{ + DiskPaths, DiskVariant, Partition, PooledDiskError, UnparsedDisk, +}; use crate::{Baseboard, SledMode}; use slog::Logger; use std::collections::HashSet; @@ -16,6 +18,7 @@ use tokio::sync::broadcast; /// /// If you're actually trying to run the Sled Agent on non-illumos platforms, /// use the simulated sled agent, which does not attempt to abstract hardware. 
+#[derive(Clone)] pub struct HardwareManager {} impl HardwareManager { @@ -56,7 +59,7 @@ pub fn ensure_partition_layout( _log: &Logger, _paths: &DiskPaths, _variant: DiskVariant, -) -> Result, DiskError> { +) -> Result, PooledDiskError> { unimplemented!("Accessing hardware unsupported on non-illumos"); } diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml new file mode 100644 index 0000000000..cb3a790631 --- /dev/null +++ b/sled-storage/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "sled-storage" +version = "0.1.0" +edition = "2021" + +[dependencies] +async-trait.workspace = true +camino.workspace = true +cfg-if.workspace = true +derive_more.workspace = true +glob.workspace = true +illumos-utils.workspace = true +key-manager.workspace = true +omicron-common.workspace = true +rand.workspace = true +schemars = { workspace = true, features = [ "chrono", "uuid1" ] } +serde.workspace = true +serde_json.workspace = true +sled-hardware.workspace = true +slog.workspace = true +thiserror.workspace = true +tokio.workspace = true +uuid.workspace = true +omicron-workspace-hack.workspace = true + +[dev-dependencies] +illumos-utils = { workspace = true, features = ["tmp_keypath", "testing"] } +omicron-test-utils.workspace = true +camino-tempfile.workspace = true + +[features] +# Quotas and the like can be shrunk via this feature +testing = [] diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs new file mode 100644 index 0000000000..a2878af7f6 --- /dev/null +++ b/sled-storage/src/dataset.rs @@ -0,0 +1,379 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
ZFS dataset related functionality + +use crate::keyfile::KeyFile; +use camino::Utf8PathBuf; +use cfg_if::cfg_if; +use illumos_utils::zfs::{ + self, DestroyDatasetErrorVariant, EncryptionDetails, Keypath, Mountpoint, + SizeDetails, Zfs, +}; +use illumos_utils::zpool::ZpoolName; +use key_manager::StorageKeyRequester; +use omicron_common::disk::DiskIdentity; +use rand::distributions::{Alphanumeric, DistString}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sled_hardware::DiskVariant; +use slog::{info, Logger}; +use std::sync::OnceLock; + +pub const INSTALL_DATASET: &'static str = "install"; +pub const CRASH_DATASET: &'static str = "crash"; +pub const CLUSTER_DATASET: &'static str = "cluster"; +pub const CONFIG_DATASET: &'static str = "config"; +pub const M2_DEBUG_DATASET: &'static str = "debug"; +pub const M2_BACKING_DATASET: &'static str = "backing"; + +cfg_if! { + if #[cfg(any(test, feature = "testing"))] { + // Tuned for zone_bundle tests + pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 10); + } else { + // TODO-correctness: This value of 100GiB is a pretty wild guess, and should be + // tuned as needed. + pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); + } +} +// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be +// tuned as needed. +pub const DUMP_DATASET_QUOTA: usize = 100 * (1 << 30); +// passed to zfs create -o compression= +pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9"; + +// U.2 datasets live under the encrypted dataset and inherit encryption +pub const ZONE_DATASET: &'static str = "crypt/zone"; +pub const DUMP_DATASET: &'static str = "crypt/debug"; +pub const U2_DEBUG_DATASET: &'static str = "crypt/debug"; + +// This is the root dataset for all U.2 drives. Encryption is inherited. 
+pub const CRYPT_DATASET: &'static str = "crypt"; + +const U2_EXPECTED_DATASET_COUNT: usize = 2; +static U2_EXPECTED_DATASETS: [ExpectedDataset; U2_EXPECTED_DATASET_COUNT] = [ + // Stores filesystems for zones + ExpectedDataset::new(ZONE_DATASET).wipe(), + // For storing full kernel RAM dumps + ExpectedDataset::new(DUMP_DATASET) + .quota(DUMP_DATASET_QUOTA) + .compression(DUMP_DATASET_COMPRESSION), +]; + +const M2_EXPECTED_DATASET_COUNT: usize = 6; +static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [ + // Stores software images. + // + // Should be duplicated to both M.2s. + ExpectedDataset::new(INSTALL_DATASET), + // Stores crash dumps. + ExpectedDataset::new(CRASH_DATASET), + // Backing store for OS data that should be persisted across reboots. + // Its children are selectively overlay mounted onto parts of the ramdisk + // root. + ExpectedDataset::new(M2_BACKING_DATASET), + // Stores cluster configuration information. + // + // Should be duplicated to both M.2s. + ExpectedDataset::new(CLUSTER_DATASET), + // Stores configuration data, including: + // - What services should be launched on this sled + // - Information about how to initialize the Sled Agent + // - (For scrimlets) RSS setup information + // + // Should be duplicated to both M.2s. + ExpectedDataset::new(CONFIG_DATASET), + // Store debugging data, such as service bundles. + ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA), +]; + +// Helper type for describing expected datasets and their optional quota. 
+#[derive(Clone, Copy, Debug)] +struct ExpectedDataset { + // Name for the dataset + name: &'static str, + // Optional quota, in _bytes_ + quota: Option, + // Identifies if the dataset should be deleted on boot + wipe: bool, + // Optional compression mode + compression: Option<&'static str>, +} + +impl ExpectedDataset { + const fn new(name: &'static str) -> Self { + ExpectedDataset { name, quota: None, wipe: false, compression: None } + } + + const fn quota(mut self, quota: usize) -> Self { + self.quota = Some(quota); + self + } + + const fn wipe(mut self) -> Self { + self.wipe = true; + self + } + + const fn compression(mut self, compression: &'static str) -> Self { + self.compression = Some(compression); + self + } +} + +/// The type of a dataset, and any auxiliary information necessary +/// to successfully launch a zone managing the associated data. +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, +)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum DatasetKind { + CockroachDb, + Crucible, + Clickhouse, + ClickhouseKeeper, + ExternalDns, + InternalDns, +} + +impl std::fmt::Display for DatasetKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use DatasetKind::*; + let s = match self { + Crucible => "crucible", + CockroachDb { .. } => "cockroachdb", + Clickhouse => "clickhouse", + ClickhouseKeeper => "clickhouse_keeper", + ExternalDns { .. } => "external_dns", + InternalDns { .. } => "internal_dns", + }; + write!(f, "{}", s) + } +} + +#[derive( + Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Clone, JsonSchema, +)] +pub struct DatasetName { + // A unique identifier for the Zpool on which the dataset is stored. + pool_name: ZpoolName, + // A name for the dataset within the Zpool. 
+ kind: DatasetKind, +} + +impl DatasetName { + pub fn new(pool_name: ZpoolName, kind: DatasetKind) -> Self { + Self { pool_name, kind } + } + + pub fn pool(&self) -> &ZpoolName { + &self.pool_name + } + + pub fn dataset(&self) -> &DatasetKind { + &self.kind + } + + pub fn full(&self) -> String { + format!("{}/{}", self.pool_name, self.kind) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum DatasetError { + #[error("Cannot open {path} due to {error}")] + IoError { path: Utf8PathBuf, error: std::io::Error }, + #[error(transparent)] + DestroyFilesystem(#[from] illumos_utils::zfs::DestroyDatasetError), + #[error(transparent)] + EnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), + #[error("KeyManager error: {0}")] + KeyManager(#[from] key_manager::Error), + #[error("Missing StorageKeyRequester when creating U.2 disk")] + MissingStorageKeyRequester, + #[error("Encrypted filesystem '{0}' missing 'oxide:epoch' property")] + CannotParseEpochProperty(String), + #[error("Encrypted dataset '{dataset}' cannot set 'oxide:agent' property: {err}")] + CannotSetAgentProperty { + dataset: String, + #[source] + err: Box, + }, +} + +/// Ensure that the zpool contains all the datasets we would like it to +/// contain. +/// +/// WARNING: In all cases where a U.2 is a possible `DiskVariant`, a +/// `StorageKeyRequester` must be passed so that disk encryption can +/// be used. The `StorageManager` for the sled-agent always has a +/// `StorageKeyRequester` available, and so the only place we should pass +/// `None` is for the M.2s touched by the Installinator. 
+pub(crate) async fn ensure_zpool_has_datasets( + log: &Logger, + zpool_name: &ZpoolName, + disk_identity: &DiskIdentity, + key_requester: Option<&StorageKeyRequester>, +) -> Result<(), DatasetError> { + let (root, datasets) = match zpool_name.kind().into() { + DiskVariant::M2 => (None, M2_EXPECTED_DATASETS.iter()), + DiskVariant::U2 => (Some(CRYPT_DATASET), U2_EXPECTED_DATASETS.iter()), + }; + + let zoned = false; + let do_format = true; + + // Ensure the root encrypted filesystem exists + // Datasets below this in the hierarchy will inherit encryption + if let Some(dataset) = root { + let Some(key_requester) = key_requester else { + return Err(DatasetError::MissingStorageKeyRequester); + }; + let mountpoint = zpool_name.dataset_mountpoint(dataset); + let keypath: Keypath = disk_identity.into(); + + let epoch = if let Ok(epoch_str) = + Zfs::get_oxide_value(dataset, "epoch") + { + if let Ok(epoch) = epoch_str.parse::() { + epoch + } else { + return Err(DatasetError::CannotParseEpochProperty( + dataset.to_string(), + )); + } + } else { + // We got an error trying to call `Zfs::get_oxide_value` + // which indicates that the dataset doesn't exist or there + // was a problem running the command. + // + // Note that `Zfs::get_oxide_value` will succeed even if + // the epoch is missing. `epoch_str` will show up as a dash + // (`-`) and will not parse into a `u64`. So we don't have + // to worry about that case here as it is handled above. + // + // If the error indicated that the command failed for some + // other reason, but the dataset actually existed, we will + // try to create the dataset below and that will fail. So + // there is no harm in just loading the latest secret here. 
+ info!(log, "Loading latest secret"; "disk_id"=>#?disk_identity); + let epoch = key_requester.load_latest_secret().await?; + info!(log, "Loaded latest secret"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); + epoch + }; + + info!(log, "Retrieving key"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); + let key = key_requester.get_key(epoch, disk_identity.clone()).await?; + info!(log, "Got key"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); + + let mut keyfile = + KeyFile::create(keypath.clone(), key.expose_secret(), log) + .await + .map_err(|error| DatasetError::IoError { + path: keypath.0.clone(), + error, + })?; + + let encryption_details = EncryptionDetails { keypath, epoch }; + + info!( + log, + "Ensuring encrypted filesystem: {} for epoch {}", dataset, epoch + ); + let result = Zfs::ensure_filesystem( + &format!("{}/{}", zpool_name, dataset), + Mountpoint::Path(mountpoint), + zoned, + do_format, + Some(encryption_details), + None, + None, + ); + + keyfile.zero_and_unlink().await.map_err(|error| { + DatasetError::IoError { path: keyfile.path().0.clone(), error } + })?; + + result?; + }; + + for dataset in datasets.into_iter() { + let mountpoint = zpool_name.dataset_mountpoint(dataset.name); + let name = &format!("{}/{}", zpool_name, dataset.name); + + // Use a value that's alive for the duration of this sled agent + // to answer the question: should we wipe this disk, or have + // we seen it before? + // + // If this value comes from a prior iteration of the sled agent, + // we opt to remove the corresponding dataset. 
+ static AGENT_LOCAL_VALUE: OnceLock = OnceLock::new(); + let agent_local_value = AGENT_LOCAL_VALUE.get_or_init(|| { + Alphanumeric.sample_string(&mut rand::thread_rng(), 20) + }); + + if dataset.wipe { + match Zfs::get_oxide_value(name, "agent") { + Ok(v) if &v == agent_local_value => { + info!(log, "Skipping automatic wipe for dataset: {}", name); + } + Ok(_) | Err(_) => { + info!(log, "Automatically destroying dataset: {}", name); + Zfs::destroy_dataset(name).or_else(|err| { + // If we can't find the dataset, that's fine -- it might + // not have been formatted yet. + if matches!( + err.err, + DestroyDatasetErrorVariant::NotFound + ) { + Ok(()) + } else { + Err(err) + } + })?; + } + } + } + + let encryption_details = None; + let size_details = Some(SizeDetails { + quota: dataset.quota, + compression: dataset.compression, + }); + Zfs::ensure_filesystem( + name, + Mountpoint::Path(mountpoint), + zoned, + do_format, + encryption_details, + size_details, + None, + )?; + + if dataset.wipe { + Zfs::set_oxide_value(name, "agent", agent_local_value).map_err( + |err| DatasetError::CannotSetAgentProperty { + dataset: name.clone(), + err: Box::new(err), + }, + )?; + } + } + Ok(()) +} + +#[cfg(test)] +mod test { + use super::*; + use uuid::Uuid; + + #[test] + fn serialize_dataset_name() { + let pool = ZpoolName::new_internal(Uuid::new_v4()); + let kind = DatasetKind::Crucible; + let name = DatasetName::new(pool, kind); + serde_json::to_string(&name).unwrap(); + } +} diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs new file mode 100644 index 0000000000..f5209def77 --- /dev/null +++ b/sled-storage/src/disk.rs @@ -0,0 +1,243 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Disk related types + +use camino::{Utf8Path, Utf8PathBuf}; +use derive_more::From; +use illumos_utils::zpool::{Zpool, ZpoolKind, ZpoolName}; +use key_manager::StorageKeyRequester; +use omicron_common::disk::DiskIdentity; +use sled_hardware::{ + DiskVariant, Partition, PooledDisk, PooledDiskError, UnparsedDisk, +}; +use slog::Logger; +use std::fs::File; + +use crate::dataset; + +#[derive(Debug, thiserror::Error)] +pub enum DiskError { + #[error(transparent)] + Dataset(#[from] crate::dataset::DatasetError), + #[error(transparent)] + PooledDisk(#[from] sled_hardware::PooledDiskError), +} + +// A synthetic disk that acts as one "found" by the hardware and that is backed +// by a zpool +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SyntheticDisk { + pub identity: DiskIdentity, + pub zpool_name: ZpoolName, +} + +impl SyntheticDisk { + // Create a zpool and import it for the synthetic disk + // Zpools will be set to the min size of 64 MiB + pub fn create_zpool( + dir: &Utf8Path, + zpool_name: &ZpoolName, + ) -> SyntheticDisk { + // 64 MiB (min size of zpool) + const DISK_SIZE: u64 = 64 * 1024 * 1024; + let path = dir.join(zpool_name.to_string()); + let file = File::create(&path).unwrap(); + file.set_len(DISK_SIZE).unwrap(); + drop(file); + Zpool::create(zpool_name, &path).unwrap(); + Zpool::import(zpool_name).unwrap(); + Zpool::set_failmode_continue(zpool_name).unwrap(); + Self::new(zpool_name.clone()) + } + + pub fn new(zpool_name: ZpoolName) -> SyntheticDisk { + let id = zpool_name.id(); + let identity = DiskIdentity { + vendor: "synthetic-vendor".to_string(), + serial: format!("synthetic-serial-{id}"), + model: "synthetic-model".to_string(), + }; + SyntheticDisk { identity, zpool_name } + } +} + +// An [`UnparsedDisk`] disk learned about from the hardware or a wrapped zpool +#[derive(Debug, Clone, PartialEq, Eq, Hash, From)] +pub enum RawDisk { + Real(UnparsedDisk), + Synthetic(SyntheticDisk), +} + +impl RawDisk { + pub fn is_boot_disk(&self) -> bool { + 
match self { + Self::Real(disk) => disk.is_boot_disk(), + Self::Synthetic(disk) => { + // Just label any M.2 the boot disk. + disk.zpool_name.kind() == ZpoolKind::Internal + } + } + } + + pub fn identity(&self) -> &DiskIdentity { + match self { + Self::Real(disk) => &disk.identity(), + Self::Synthetic(disk) => &disk.identity, + } + } + + pub fn variant(&self) -> DiskVariant { + match self { + Self::Real(disk) => disk.variant(), + Self::Synthetic(disk) => match disk.zpool_name.kind() { + ZpoolKind::External => DiskVariant::U2, + ZpoolKind::Internal => DiskVariant::M2, + }, + } + } + + #[cfg(test)] + pub fn zpool_name(&self) -> &ZpoolName { + match self { + Self::Real(_) => unreachable!(), + Self::Synthetic(disk) => &disk.zpool_name, + } + } + + pub fn is_synthetic(&self) -> bool { + match self { + Self::Real(_) => false, + Self::Synthetic(_) => true, + } + } + + pub fn is_real(&self) -> bool { + !self.is_synthetic() + } + + pub fn devfs_path(&self) -> &Utf8PathBuf { + match self { + Self::Real(disk) => disk.devfs_path(), + Self::Synthetic(_) => unreachable!(), + } + } +} + +/// A physical [`PooledDisk`] or a [`SyntheticDisk`] that contains or is backed +/// by a single zpool and that has provisioned datasets. This disk is ready for +/// usage by higher level software. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, From)] +pub enum Disk { + Real(PooledDisk), + Synthetic(SyntheticDisk), +} + +impl Disk { + pub async fn new( + log: &Logger, + raw_disk: RawDisk, + key_requester: Option<&StorageKeyRequester>, + ) -> Result { + let disk = match raw_disk { + RawDisk::Real(disk) => PooledDisk::new(log, disk)?.into(), + RawDisk::Synthetic(disk) => Disk::Synthetic(disk), + }; + dataset::ensure_zpool_has_datasets( + log, + disk.zpool_name(), + disk.identity(), + key_requester, + ) + .await?; + Ok(disk) + } + + pub fn is_synthetic(&self) -> bool { + match self { + Self::Real(_) => false, + Self::Synthetic(_) => true, + } + } + + pub fn is_real(&self) -> bool { + !self.is_synthetic() + } + + pub fn is_boot_disk(&self) -> bool { + match self { + Self::Real(disk) => disk.is_boot_disk, + Self::Synthetic(disk) => { + // Just label any M.2 the boot disk. + disk.zpool_name.kind() == ZpoolKind::Internal + } + } + } + + pub fn identity(&self) -> &DiskIdentity { + match self { + Self::Real(disk) => &disk.identity, + Self::Synthetic(disk) => &disk.identity, + } + } + + pub fn variant(&self) -> DiskVariant { + match self { + Self::Real(disk) => disk.variant, + Self::Synthetic(disk) => match disk.zpool_name.kind() { + ZpoolKind::External => DiskVariant::U2, + ZpoolKind::Internal => DiskVariant::M2, + }, + } + } + + pub fn devfs_path(&self) -> &Utf8PathBuf { + match self { + Self::Real(disk) => &disk.paths.devfs_path, + Self::Synthetic(_) => unreachable!(), + } + } + + pub fn zpool_name(&self) -> &ZpoolName { + match self { + Self::Real(disk) => &disk.zpool_name, + Self::Synthetic(disk) => &disk.zpool_name, + } + } + + pub fn boot_image_devfs_path( + &self, + raw: bool, + ) -> Result { + match self { + Self::Real(disk) => disk.paths.partition_device_path( + &disk.partitions, + Partition::BootImage, + raw, + ), + Self::Synthetic(_) => unreachable!(), + } + } + + pub fn dump_device_devfs_path( + &self, + raw: bool, + ) -> Result { + match self { + 
Self::Real(disk) => disk.paths.partition_device_path( + &disk.partitions, + Partition::DumpDevice, + raw, + ), + Self::Synthetic(_) => unreachable!(), + } + } + + pub fn slot(&self) -> i64 { + match self { + Self::Real(disk) => disk.slot, + Self::Synthetic(_) => unreachable!(), + } + } +} diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs new file mode 100644 index 0000000000..b9f97ee428 --- /dev/null +++ b/sled-storage/src/error.rs @@ -0,0 +1,81 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Storage related errors + +use crate::dataset::{DatasetError, DatasetName}; +use crate::disk::DiskError; +use camino::Utf8PathBuf; +use omicron_common::api::external::ByteCountRangeError; +use uuid::Uuid; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error(transparent)] + DiskError(#[from] DiskError), + + #[error(transparent)] + DatasetError(#[from] DatasetError), + + // TODO: We could add the context of "why are we doing this op", maybe? 
+ #[error(transparent)] + ZfsListDataset(#[from] illumos_utils::zfs::ListDatasetsError), + + #[error(transparent)] + ZfsEnsureFilesystem(#[from] illumos_utils::zfs::EnsureFilesystemError), + + #[error(transparent)] + ZfsSetValue(#[from] illumos_utils::zfs::SetValueError), + + #[error(transparent)] + ZfsGetValue(#[from] illumos_utils::zfs::GetValueError), + + #[error(transparent)] + GetZpoolInfo(#[from] illumos_utils::zpool::GetInfoError), + + #[error(transparent)] + Fstyp(#[from] illumos_utils::fstyp::Error), + + #[error(transparent)] + ZoneCommand(#[from] illumos_utils::running_zone::RunCommandError), + + #[error(transparent)] + ZoneBoot(#[from] illumos_utils::running_zone::BootError), + + #[error(transparent)] + ZoneEnsureAddress(#[from] illumos_utils::running_zone::EnsureAddressError), + + #[error(transparent)] + ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), + + #[error("No U.2 Zpools found")] + NoU2Zpool, + + #[error("Failed to parse UUID from {path}: {err}")] + ParseUuid { + path: Utf8PathBuf, + #[source] + err: uuid::Error, + }, + + #[error("Dataset {name:?} exists with a different uuid (has {old}, requested {new})")] + UuidMismatch { name: Box, old: Uuid, new: Uuid }, + + #[error("Error parsing pool {name}'s size: {err}")] + BadPoolSize { + name: String, + #[source] + err: ByteCountRangeError, + }, + + #[error("Failed to parse the dataset {name}'s UUID: {err}")] + ParseDatasetUuid { + name: String, + #[source] + err: uuid::Error, + }, + + #[error("Zpool Not Found: {0}")] + ZpoolNotFound(String), +} diff --git a/sled-storage/src/keyfile.rs b/sled-storage/src/keyfile.rs new file mode 100644 index 0000000000..48e5d9a528 --- /dev/null +++ b/sled-storage/src/keyfile.rs @@ -0,0 +1,76 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Key file support for ZFS dataset encryption + +use illumos_utils::zfs::Keypath; +use slog::{error, info, Logger}; +use tokio::fs::{remove_file, File}; +use tokio::io::{AsyncSeekExt, AsyncWriteExt, SeekFrom}; + +/// A file that wraps a zfs encryption key. +/// +/// We put this in a RAM backed filesystem and zero and delete it when we are +/// done with it. Unfortunately we cannot do this inside `Drop` because there is no +/// equivalent async drop. +pub struct KeyFile { + path: Keypath, + file: File, + log: Logger, + zero_and_unlink_called: bool, +} + +impl KeyFile { + pub async fn create( + path: Keypath, + key: &[u8; 32], + log: &Logger, + ) -> std::io::Result { + // We want to overwrite any existing contents. + let mut file = tokio::fs::OpenOptions::new() + .create(true) + .write(true) + .open(&path.0) + .await?; + file.write_all(key).await?; + info!(log, "Created keyfile {}", path); + Ok(KeyFile { + path, + file, + log: log.clone(), + zero_and_unlink_called: false, + }) + } + + /// These keyfiles live on a tmpfs and we zero the file so the data doesn't + /// linger on the page in memory. + /// + /// It'd be nice to `impl Drop for `KeyFile` and then call `zero` + /// from within the drop handler, but async `Drop` isn't supported. 
+ pub async fn zero_and_unlink(&mut self) -> std::io::Result<()> { + self.zero_and_unlink_called = true; + let zeroes = [0u8; 32]; + let _ = self.file.seek(SeekFrom::Start(0)).await?; + self.file.write_all(&zeroes).await?; + info!(self.log, "Zeroed and unlinked keyfile {}", self.path); + remove_file(&self.path().0).await?; + Ok(()) + } + + pub fn path(&self) -> &Keypath { + &self.path + } +} + +impl Drop for KeyFile { + fn drop(&mut self) { + if !self.zero_and_unlink_called { + error!( + self.log, + "Failed to call zero_and_unlink for keyfile"; + "path" => %self.path + ); + } + } +} diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs new file mode 100644 index 0000000000..d4b64c55a5 --- /dev/null +++ b/sled-storage/src/lib.rs @@ -0,0 +1,17 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Local storage abstraction for use by sled-agent +//! +//! This abstraction operates at the ZFS level and relies on zpool setup on +//! hardware partitions from the `sled-hardware` crate. It utilizes the +//! `illumos-utils` crate to actually perform ZFS related OS calls. + +pub mod dataset; +pub mod disk; +pub mod error; +pub(crate) mod keyfile; +pub mod manager; +pub mod pool; +pub mod resources; diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs new file mode 100644 index 0000000000..50b1c44148 --- /dev/null +++ b/sled-storage/src/manager.rs @@ -0,0 +1,1034 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
The storage manager task + +use std::collections::HashSet; + +use crate::dataset::{DatasetError, DatasetName}; +use crate::disk::{Disk, DiskError, RawDisk}; +use crate::error::Error; +use crate::resources::{AddDiskResult, StorageResources}; +use camino::Utf8PathBuf; +use illumos_utils::zfs::{Mountpoint, Zfs}; +use illumos_utils::zpool::ZpoolName; +use key_manager::StorageKeyRequester; +use omicron_common::disk::DiskIdentity; +use sled_hardware::DiskVariant; +use slog::{error, info, o, warn, Logger}; +use tokio::sync::{mpsc, oneshot, watch}; +use tokio::time::{interval, Duration, MissedTickBehavior}; +use uuid::Uuid; + +// The size of the mpsc bounded channel used to communicate +// between the `StorageHandle` and `StorageManager`. +// +// How did we choose this bound, and why? +// +// Picking a bound can be tricky, but in general, you want the channel to act +// unbounded, such that sends never fail. This makes the channels reliable, +// such that we never drop messages inside the process, and the caller doesn't +// have to choose what to do when overloaded. This simplifies things drastically +// for developers. However, you also don't want to make the channel actually +// unbounded, because that can lead to run-away memory growth and pathological +// behaviors, such that requests get slower over time until the system crashes. +// +// Our team's chosen solution, also used elsewhere in the codebase, is to +// choose a large enough bound such that we should never hit it in practice +// unless we are truly overloaded. If we hit the bound it means that beyond that +// requests will start to build up and we will eventually topple over. So when +// we hit this bound, we just go ahead and panic. +// +// Picking a channel bound is hard to do empirically, but practically, if +// requests are mostly mutating task local state, a bound of 1024 or even 8192 +// should be plenty. 
Tasks that must perform longer running ops can spawn helper +// tasks as necessary or include their own handles for replies rather than +// synchronously waiting. Memory for the queue can be kept small with boxing of +// large messages. +// +// Here we start relatively small so that we can evaluate our choice over time. +const QUEUE_SIZE: usize = 256; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StorageManagerState { + WaitingForKeyManager, + QueueingDisks, + Normal, +} + +#[derive(Debug)] +struct NewFilesystemRequest { + dataset_id: Uuid, + dataset_name: DatasetName, + responder: oneshot::Sender>, +} + +#[derive(Debug)] +enum StorageRequest { + AddDisk(RawDisk), + RemoveDisk(RawDisk), + DisksChanged(HashSet), + NewFilesystem(NewFilesystemRequest), + KeyManagerReady, + /// This will always grab the latest state after any new updates, as it + /// serializes through the `StorageManager` task after all prior requests. + /// This serialization is particularly useful for tests. + GetLatestResources(oneshot::Sender), + + /// Get the internal task state of the manager + GetManagerState(oneshot::Sender), +} + +/// Data managed internally to the StorageManagerTask that can be useful +/// to clients for debugging purposes, and that isn't exposed in other ways. +#[derive(Debug, Clone)] +pub struct StorageManagerData { + pub state: StorageManagerState, + pub queued_u2_drives: HashSet, +} + +/// A mechanism for interacting with the [`StorageManager`] +#[derive(Clone)] +pub struct StorageHandle { + tx: mpsc::Sender, + resource_updates: watch::Receiver, +} + +impl StorageHandle { + /// Adds a disk and associated zpool to the storage manager. + pub async fn upsert_disk(&self, disk: RawDisk) { + self.tx.send(StorageRequest::AddDisk(disk)).await.unwrap(); + } + + /// Removes a disk, if it's tracked by the storage manager, as well + /// as any associated zpools. 
+ pub async fn delete_disk(&self, disk: RawDisk) { + self.tx.send(StorageRequest::RemoveDisk(disk)).await.unwrap(); + } + + /// Ensures that the storage manager tracks exactly the provided disks. + /// + /// This acts similar to a batch [Self::upsert_disk] for all new disks, and + /// [Self::delete_disk] for all removed disks. + /// + /// If errors occur, an arbitrary "one" of them will be returned, but a + /// best-effort attempt to add all disks will still be attempted. + pub async fn ensure_using_exactly_these_disks(&self, raw_disks: I) + where + I: IntoIterator, + { + self.tx + .send(StorageRequest::DisksChanged(raw_disks.into_iter().collect())) + .await + .unwrap(); + } + + /// Notify the [`StorageManager`] that the [`key_manager::KeyManager`] + /// has determined what [`key_manager::SecretRetriever`] to use and + /// it is now possible to retrieve secrets and construct keys. Note + /// that in cases of using the trust quorum, it is possible that the + /// [`key_manager::SecretRetriever`] is ready, but enough key shares cannot + /// be retrieved from other sleds. In this case, we still will be unable + /// to add the disks successfully. In the common case this is a transient + /// error. In other cases it may be fatal. However, that is outside the + /// scope of the cares of this module. + pub async fn key_manager_ready(&self) { + self.tx.send(StorageRequest::KeyManagerReady).await.unwrap(); + } + + /// Wait for a boot disk to be initialized + pub async fn wait_for_boot_disk(&mut self) -> (DiskIdentity, ZpoolName) { + loop { + let resources = self.resource_updates.borrow_and_update(); + if let Some((disk_id, zpool_name)) = resources.boot_disk() { + return (disk_id, zpool_name); + } + drop(resources); + // We panic if the sender is dropped, as this means + // the StorageManager has gone away, which it should not do. 
+ self.resource_updates.changed().await.unwrap(); + } + } + + /// Wait for any storage resource changes + pub async fn wait_for_changes(&mut self) -> StorageResources { + self.resource_updates.changed().await.unwrap(); + self.resource_updates.borrow_and_update().clone() + } + + /// Retrieve the latest value of `StorageResources` from the + /// `StorageManager` task. + pub async fn get_latest_resources(&self) -> StorageResources { + let (tx, rx) = oneshot::channel(); + self.tx.send(StorageRequest::GetLatestResources(tx)).await.unwrap(); + rx.await.unwrap() + } + + /// Return internal data useful for debugging and testing + pub async fn get_manager_state(&self) -> StorageManagerData { + let (tx, rx) = oneshot::channel(); + self.tx.send(StorageRequest::GetManagerState(tx)).await.unwrap(); + rx.await.unwrap() + } + + pub async fn upsert_filesystem( + &self, + dataset_id: Uuid, + dataset_name: DatasetName, + ) -> Result<(), Error> { + let (tx, rx) = oneshot::channel(); + let request = + NewFilesystemRequest { dataset_id, dataset_name, responder: tx }; + self.tx.send(StorageRequest::NewFilesystem(request)).await.unwrap(); + rx.await.unwrap() + } +} + +// Some sled-agent tests cannot currently use the real StorageManager +// and want to fake the entire behavior, but still have access to the +// `StorageResources`. We allow this via use of the `FakeStorageManager` +// that will respond to real storage requests from a real `StorageHandle`. 
+#[cfg(feature = "testing")] +pub struct FakeStorageManager { + rx: mpsc::Receiver, + resources: StorageResources, + resource_updates: watch::Sender, +} + +#[cfg(feature = "testing")] +impl FakeStorageManager { + pub fn new() -> (Self, StorageHandle) { + let (tx, rx) = mpsc::channel(QUEUE_SIZE); + let resources = StorageResources::default(); + let (update_tx, update_rx) = watch::channel(resources.clone()); + ( + Self { rx, resources, resource_updates: update_tx }, + StorageHandle { tx, resource_updates: update_rx }, + ) + } + + /// Run the main receive loop of the `FakeStorageManager` + /// + /// This should be spawned into a tokio task + pub async fn run(mut self) { + loop { + match self.rx.recv().await { + Some(StorageRequest::AddDisk(raw_disk)) => { + if self.add_disk(raw_disk).disk_inserted() { + self.resource_updates + .send_replace(self.resources.clone()); + } + } + Some(StorageRequest::GetLatestResources(tx)) => { + let _ = tx.send(self.resources.clone()); + } + Some(_) => { + unreachable!(); + } + None => break, + } + } + } + + // Add a disk to `StorageResources` if it is new and return true if so + fn add_disk(&mut self, raw_disk: RawDisk) -> AddDiskResult { + let disk = match raw_disk { + RawDisk::Real(_) => { + panic!( + "Only synthetic disks can be used with `FakeStorageManager`" + ); + } + RawDisk::Synthetic(synthetic_disk) => { + Disk::Synthetic(synthetic_disk) + } + }; + self.resources.insert_fake_disk(disk) + } +} + +/// The storage manager responsible for the state of the storage +/// on a sled. The storage manager runs in its own task and is interacted +/// with via the [`StorageHandle`]. 
pub struct StorageManager {
    log: Logger,
    state: StorageManagerState,
    // Used to find the capacity of the channel for tracking purposes
    tx: mpsc::Sender<StorageRequest>,
    rx: mpsc::Receiver<StorageRequest>,
    resources: StorageResources,
    queued_u2_drives: HashSet<RawDisk>,
    key_requester: StorageKeyRequester,
    resource_updates: watch::Sender<StorageResources>,
    last_logged_capacity: usize,
}

impl StorageManager {
    /// Create a `StorageManager` and the [`StorageHandle`] used to talk to it.
    ///
    /// The manager starts in `WaitingForKeyManager` with default (empty)
    /// resources and no queued U.2 drives.
    pub fn new(
        log: &Logger,
        key_requester: StorageKeyRequester,
    ) -> (StorageManager, StorageHandle) {
        let (tx, rx) = mpsc::channel(QUEUE_SIZE);
        let resources = StorageResources::default();
        let (update_tx, update_rx) = watch::channel(resources.clone());
        (
            StorageManager {
                log: log.new(o!("component" => "StorageManager")),
                state: StorageManagerState::WaitingForKeyManager,
                tx: tx.clone(),
                rx,
                resources,
                queued_u2_drives: HashSet::new(),
                key_requester,
                resource_updates: update_tx,
                last_logged_capacity: QUEUE_SIZE,
            },
            StorageHandle { tx, resource_updates: update_rx },
        )
    }

    /// Run the main receive loop of the `StorageManager`
    ///
    /// This should be spawned into a tokio task
    pub async fn run(mut self) {
        const QUEUED_DISK_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
        // Build the retry timer once, outside the loop. A freshly created
        // interval completes its first tick immediately, so recreating it on
        // every pass would make the retry branch fire back-to-back whenever
        // we are queueing disks instead of once per timeout. Note that the
        // first tick observed after entering `QueueingDisks` may still fire
        // right away; subsequent ticks are spaced by the timeout.
        let mut retry_timer = interval(QUEUED_DISK_RETRY_TIMEOUT);
        retry_timer.set_missed_tick_behavior(MissedTickBehavior::Delay);
        loop {
            self.log_capacity_change();
            tokio::select! {
                // Only the receive is raced against the timer. The request
                // is then processed in the branch body, after `select!` has
                // resolved, so a retry tick can never drop a request that
                // was already taken off the channel and partly handled.
                req = self.rx.recv() => {
                    // The sending side never disappears because we hold a copy
                    let req = req.unwrap();
                    if let Err(e) = self.handle_request(req).await {
                        warn!(self.log, "{e}");
                    }
                }
                _ = retry_timer.tick(),
                    if self.state == StorageManagerState::QueueingDisks =>
                {
                    if self.add_queued_disks().await {
                        let _ = self.resource_updates.send_replace(self.resources.clone());
                    }
                }
            }
        }
    }

    /// Process the next event
    ///
    /// This is useful for testing/debugging
    ///
    /// Waits for one request from the channel and handles it to completion.
    ///
    /// # Errors
    ///
    /// Returns the error produced while adding a disk, if any.
    pub async fn step(&mut self) -> Result<(), Error> {
        self.log_capacity_change();
        // The sending side never disappears because we hold a copy
        let req = self.rx.recv().await.unwrap();
        self.handle_request(req).await
    }

    // Handle a single request that has already been taken off the channel,
    // publishing the updated resources to watchers when they changed.
    async fn handle_request(
        &mut self,
        req: StorageRequest,
    ) -> Result<(), Error> {
        info!(self.log, "Received {:?}", req);
        let should_send_updates = match req {
            StorageRequest::AddDisk(raw_disk) => {
                self.add_disk(raw_disk).await?.disk_inserted()
            }
            StorageRequest::RemoveDisk(raw_disk) => self.remove_disk(raw_disk),
            StorageRequest::DisksChanged(raw_disks) => {
                self.ensure_using_exactly_these_disks(raw_disks).await
            }
            StorageRequest::NewFilesystem(request) => {
                let result = self.add_dataset(&request).await;
                if result.is_err() {
                    warn!(self.log, "{result:?}");
                }
                let _ = request.responder.send(result);
                false
            }
            StorageRequest::KeyManagerReady => {
                self.state = StorageManagerState::Normal;
                self.add_queued_disks().await
            }
            StorageRequest::GetLatestResources(tx) => {
                let _ = tx.send(self.resources.clone());
                false
            }
            StorageRequest::GetManagerState(tx) => {
                let _ = tx.send(StorageManagerData {
                    state: self.state,
                    queued_u2_drives: self.queued_u2_drives.clone(),
                });
                false
            }
        };

        if should_send_updates {
            let _ = self.resource_updates.send_replace(self.resources.clone());
        }

        Ok(())
    }

    // We check the capacity and log it every time it changes by at least 10
    // entries in either direction.
    fn log_capacity_change(&mut self) {
        const CAPACITY_LOG_THRESHOLD: usize = 10;
        let current = self.tx.capacity();
        if self.last_logged_capacity.saturating_sub(current)
            >= CAPACITY_LOG_THRESHOLD
        {
            info!(
                self.log,
                "Channel capacity decreased";
                "previous" => ?self.last_logged_capacity,
                "current" => ?current
            );
            self.last_logged_capacity = current;
        } else if current.saturating_sub(self.last_logged_capacity)
            >= CAPACITY_LOG_THRESHOLD
        {
            info!(
                self.log,
                "Channel capacity increased";
                "previous" => ?self.last_logged_capacity,
                "current" => ?current
            );
            self.last_logged_capacity = current;
        }
    }

    // Loop through all queued disks inserting them into [`StorageResources`]
    // unless we hit a transient error. If we hit a transient error, we return
    // and wait for the next retry window to re-call this method. 
If we hit a + // permanent error we log it, but we continue inserting queued disks. + // + // Return true if updates should be sent to watchers, false otherwise + async fn add_queued_disks(&mut self) -> bool { + info!( + self.log, + "Attempting to add queued disks"; + "num_disks" => %self.queued_u2_drives.len() + ); + self.state = StorageManagerState::Normal; + + let mut send_updates = false; + + // Disks that should be requeued. + let queued = self.queued_u2_drives.clone(); + let mut to_dequeue = HashSet::new(); + for disk in queued.iter() { + if self.state == StorageManagerState::QueueingDisks { + // We hit a transient error in a prior iteration. + break; + } else { + match self.add_u2_disk(disk.clone()).await { + Err(_) => { + // This is an unrecoverable error, so we don't queue the + // disk again. + to_dequeue.insert(disk); + } + Ok(AddDiskResult::DiskInserted) => { + send_updates = true; + to_dequeue.insert(disk); + } + Ok(AddDiskResult::DiskAlreadyInserted) => { + to_dequeue.insert(disk); + } + Ok(AddDiskResult::DiskQueued) => (), + } + } + } + // Dequeue any inserted disks + self.queued_u2_drives.retain(|k| !to_dequeue.contains(k)); + send_updates + } + + // Add a disk to `StorageResources` if it is new, + // updated, or its pool has been updated as determined by + // [`$crate::resources::StorageResources::insert_disk`] and we decide not to + // queue the disk for later addition. 
+ async fn add_disk( + &mut self, + raw_disk: RawDisk, + ) -> Result { + match raw_disk.variant() { + DiskVariant::U2 => self.add_u2_disk(raw_disk).await, + DiskVariant::M2 => self.add_m2_disk(raw_disk).await, + } + } + + // Add a U.2 disk to [`StorageResources`] or queue it to be added later + async fn add_u2_disk( + &mut self, + raw_disk: RawDisk, + ) -> Result { + if self.state != StorageManagerState::Normal { + self.queued_u2_drives.insert(raw_disk); + return Ok(AddDiskResult::DiskQueued); + } + + match Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) + .await + { + Ok(disk) => self.resources.insert_disk(disk), + Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { + warn!( + self.log, + "Transient error: {err}: queuing disk"; + "disk_id" => ?raw_disk.identity() + ); + self.queued_u2_drives.insert(raw_disk); + self.state = StorageManagerState::QueueingDisks; + Ok(AddDiskResult::DiskQueued) + } + Err(err) => { + error!( + self.log, + "Persistent error: {err}: not queueing disk"; + "disk_id" => ?raw_disk.identity() + ); + Err(err.into()) + } + } + } + + // Add a U.2 disk to [`StorageResources`] if new and return `Ok(true)` if so + // + // + // We never queue M.2 drives, as they don't rely on [`KeyManager`] based + // encryption + async fn add_m2_disk( + &mut self, + raw_disk: RawDisk, + ) -> Result { + let disk = + Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) + .await?; + self.resources.insert_disk(disk) + } + + // Delete a real disk and return `true` if the disk was actually removed + fn remove_disk(&mut self, raw_disk: RawDisk) -> bool { + // If the disk is a U.2, we want to first delete it from any queued disks + let _ = self.queued_u2_drives.remove(&raw_disk); + self.resources.remove_disk(raw_disk.identity()) + } + + // Find all disks to remove that are not in raw_disks and remove them. Then + // take the remaining disks and try to add them all. 
`StorageResources` will + // inform us if anything changed, and if so we return true, otherwise we + // return false. + async fn ensure_using_exactly_these_disks( + &mut self, + raw_disks: HashSet, + ) -> bool { + let mut should_update = false; + + // Clear out any queued U.2 disks that are real. + // We keep synthetic disks, as they are only added once. + self.queued_u2_drives.retain(|d| d.is_synthetic()); + + let all_ids: HashSet<_> = + raw_disks.iter().map(|d| d.identity()).collect(); + + // Find all existing disks not in the current set + let to_remove: Vec = self + .resources + .disks() + .keys() + .filter_map(|id| { + if !all_ids.contains(id) { + Some(id.clone()) + } else { + None + } + }) + .collect(); + + for id in to_remove { + if self.resources.remove_disk(&id) { + should_update = true; + } + } + + for raw_disk in raw_disks { + let disk_id = raw_disk.identity().clone(); + match self.add_disk(raw_disk).await { + Ok(AddDiskResult::DiskInserted) => should_update = true, + Ok(_) => (), + Err(err) => { + warn!( + self.log, + "Failed to add disk to storage resources: {err}"; + "disk_id" => ?disk_id + ); + } + } + } + + should_update + } + + // Attempts to add a dataset within a zpool, according to `request`. + async fn add_dataset( + &mut self, + request: &NewFilesystemRequest, + ) -> Result<(), Error> { + info!(self.log, "add_dataset: {:?}", request); + if !self + .resources + .disks() + .values() + .any(|(_, pool)| &pool.name == request.dataset_name.pool()) + { + return Err(Error::ZpoolNotFound(format!( + "{}, looked up while trying to add dataset", + request.dataset_name.pool(), + ))); + } + + let zoned = true; + let fs_name = &request.dataset_name.full(); + let do_format = true; + let encryption_details = None; + let size_details = None; + Zfs::ensure_filesystem( + fs_name, + Mountpoint::Path(Utf8PathBuf::from("/data")), + zoned, + do_format, + encryption_details, + size_details, + None, + )?; + // Ensure the dataset has a usable UUID. 
+ if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { + if let Ok(id) = id_str.parse::() { + if id != request.dataset_id { + return Err(Error::UuidMismatch { + name: Box::new(request.dataset_name.clone()), + old: id, + new: request.dataset_id, + }); + } + return Ok(()); + } + } + Zfs::set_oxide_value( + &fs_name, + "uuid", + &request.dataset_id.to_string(), + )?; + + Ok(()) + } +} + +/// All tests only use synthetic disks, but are expected to be run on illumos +/// systems. +#[cfg(all(test, target_os = "illumos"))] +mod tests { + use crate::dataset::DatasetKind; + use crate::disk::SyntheticDisk; + + use super::*; + use async_trait::async_trait; + use camino_tempfile::tempdir; + use illumos_utils::zpool::Zpool; + use key_manager::{ + KeyManager, SecretRetriever, SecretRetrieverError, SecretState, + VersionedIkm, + }; + use omicron_test_utils::dev::test_setup_log; + use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }; + use uuid::Uuid; + + /// A [`key-manager::SecretRetriever`] that only returns hardcoded IKM for + /// epoch 0 + #[derive(Debug, Default)] + struct HardcodedSecretRetriever { + inject_error: Arc, + } + + #[async_trait] + impl SecretRetriever for HardcodedSecretRetriever { + async fn get_latest( + &self, + ) -> Result { + if self.inject_error.load(Ordering::SeqCst) { + return Err(SecretRetrieverError::Bootstore( + "Timeout".to_string(), + )); + } + + let epoch = 0; + let salt = [0u8; 32]; + let secret = [0x1d; 32]; + + Ok(VersionedIkm::new(epoch, salt, &secret)) + } + + /// We don't plan to do any key rotation before trust quorum is ready + async fn get( + &self, + epoch: u64, + ) -> Result { + if self.inject_error.load(Ordering::SeqCst) { + return Err(SecretRetrieverError::Bootstore( + "Timeout".to_string(), + )); + } + if epoch != 0 { + return Err(SecretRetrieverError::NoSuchEpoch(epoch)); + } + Ok(SecretState::Current(self.get_latest().await?)) + } + } + + #[tokio::test] + async fn 
add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log( + "add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued", + ); + let (mut _key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let raw_disk: RawDisk = SyntheticDisk::new(zpool_name).into(); + assert_eq!(StorageManagerState::WaitingForKeyManager, manager.state); + manager.add_u2_disk(raw_disk.clone()).await.unwrap(); + assert!(manager.resources.all_u2_zpools().is_empty()); + assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk.clone()])); + + // Check other non-normal stages and ensure disk gets queued + manager.queued_u2_drives.clear(); + manager.state = StorageManagerState::QueueingDisks; + manager.add_u2_disk(raw_disk.clone()).await.unwrap(); + assert!(manager.resources.all_u2_zpools().is_empty()); + assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk])); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn ensure_u2_gets_added_to_resources() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("ensure_u2_gets_added_to_resources"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Set the stage to pretend we've progressed enough to have a key_manager available. 
+ manager.state = StorageManagerState::Normal; + manager.add_u2_disk(disk).await.unwrap(); + assert_eq!(manager.resources.all_u2_zpools().len(), 1); + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn wait_for_bootdisk() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("wait_for_bootdisk"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (manager, mut handle) = + StorageManager::new(&logctx.log, key_requester); + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Create a synthetic internal disk + let zpool_name = ZpoolName::new_internal(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + + handle.upsert_disk(disk).await; + handle.wait_for_boot_disk().await; + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn queued_disks_get_added_as_resources() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("queued_disks_get_added_as_resources"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (manager, handle) = StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Queue up a disks, as we haven't told the `StorageManager` that + // the `KeyManager` is ready yet. 
+ let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk).await; + let resources = handle.get_latest_resources().await; + assert!(resources.all_u2_zpools().is_empty()); + + // Now inform the storage manager that the key manager is ready + // The queued disk should be successfully added + handle.key_manager_ready().await; + let resources = handle.get_latest_resources().await; + assert_eq!(resources.all_u2_zpools().len(), 1); + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } + + /// For this test, we are going to step through the msg recv loop directly + /// without running the `StorageManager` in a tokio task. + /// This allows us to control timing precisely. + #[tokio::test] + async fn queued_disks_get_requeued_on_secret_retriever_error() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log( + "queued_disks_get_requeued_on_secret_retriever_error", + ); + let inject_error = Arc::new(AtomicBool::new(false)); + let (mut key_manager, key_requester) = KeyManager::new( + &logctx.log, + HardcodedSecretRetriever { inject_error: inject_error.clone() }, + ); + let (mut manager, handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Queue up a disks, as we haven't told the `StorageManager` that + // the `KeyManager` is ready yet. + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let disk = SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk).await; + manager.step().await.unwrap(); + + // We can't wait for a reply through the handle as the storage manager task + // isn't actually running. We just check the resources directly. 
+ assert!(manager.resources.all_u2_zpools().is_empty()); + + // Let's inject an error to the `SecretRetriever` to simulate a trust + // quorum timeout + inject_error.store(true, Ordering::SeqCst); + + // Now inform the storage manager that the key manager is ready + // The queued disk should not be added due to the error + handle.key_manager_ready().await; + manager.step().await.unwrap(); + assert!(manager.resources.all_u2_zpools().is_empty()); + + // Manually simulating a timer tick to add queued disks should also + // still hit the error + manager.add_queued_disks().await; + assert!(manager.resources.all_u2_zpools().is_empty()); + + // Clearing the injected error will cause the disk to get added + inject_error.store(false, Ordering::SeqCst); + manager.add_queued_disks().await; + assert_eq!(1, manager.resources.all_u2_zpools().len()); + + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn delete_disk_triggers_notification() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("delete_disk_triggers_notification"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (manager, mut handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Inform the storage manager that the key manager is ready, so disks + // don't get queued + handle.key_manager_ready().await; + + // Create and add a disk + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let disk: RawDisk = + SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk.clone()).await; + + // Wait for the add disk notification + let 
resources = handle.wait_for_changes().await; + assert_eq!(resources.all_u2_zpools().len(), 1); + + // Delete the disk and wait for a notification + handle.delete_disk(disk).await; + let resources = handle.wait_for_changes().await; + assert!(resources.all_u2_zpools().is_empty()); + + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn ensure_using_exactly_these_disks() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("ensure_using_exactly_these_disks"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (manager, mut handle) = + StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + // Create a bunch of file backed external disks with zpools + let dir = tempdir().unwrap(); + let zpools: Vec = + (0..10).map(|_| ZpoolName::new_external(Uuid::new_v4())).collect(); + let disks: Vec = zpools + .iter() + .map(|zpool_name| { + SyntheticDisk::create_zpool(dir.path(), zpool_name).into() + }) + .collect(); + + // Add the first 3 disks, and ensure they get queued, as we haven't + // marked our key manager ready yet + handle + .ensure_using_exactly_these_disks(disks.iter().take(3).cloned()) + .await; + let state = handle.get_manager_state().await; + assert_eq!(state.queued_u2_drives.len(), 3); + assert_eq!(state.state, StorageManagerState::WaitingForKeyManager); + assert!(handle.get_latest_resources().await.all_u2_zpools().is_empty()); + + // Mark the key manager ready and wait for the storage update + handle.key_manager_ready().await; + let resources = handle.wait_for_changes().await; + let expected: HashSet<_> = + disks.iter().take(3).map(|d| d.identity()).collect(); + let 
actual: HashSet<_> = resources.disks().keys().collect(); + assert_eq!(expected, actual); + + // Add first three disks after the initial one. The returned resources + // should not contain the first disk. + handle + .ensure_using_exactly_these_disks( + disks.iter().skip(1).take(3).cloned(), + ) + .await; + let resources = handle.wait_for_changes().await; + let expected: HashSet<_> = + disks.iter().skip(1).take(3).map(|d| d.identity()).collect(); + let actual: HashSet<_> = resources.disks().keys().collect(); + assert_eq!(expected, actual); + + // Ensure the same set of disks and make sure no change occurs + // Note that we directly request the resources this time so we aren't + // waiting forever for a change notification. + handle + .ensure_using_exactly_these_disks( + disks.iter().skip(1).take(3).cloned(), + ) + .await; + let resources2 = handle.get_latest_resources().await; + assert_eq!(resources, resources2); + + // Add a disjoint set of disks and see that only they come through + handle + .ensure_using_exactly_these_disks( + disks.iter().skip(4).take(5).cloned(), + ) + .await; + let resources = handle.wait_for_changes().await; + let expected: HashSet<_> = + disks.iter().skip(4).take(5).map(|d| d.identity()).collect(); + let actual: HashSet<_> = resources.disks().keys().collect(); + assert_eq!(expected, actual); + + // Finally, change the zpool backing of the 5th disk to be that of the 10th + // and ensure that disk changes. Note that we don't change the identity + // of the 5th disk. 
+ let mut modified_disk = disks[4].clone(); + if let RawDisk::Synthetic(disk) = &mut modified_disk { + disk.zpool_name = disks[9].zpool_name().clone(); + } else { + panic!(); + } + let mut expected: HashSet<_> = + disks.iter().skip(5).take(4).cloned().collect(); + expected.insert(modified_disk); + + handle + .ensure_using_exactly_these_disks(expected.clone().into_iter()) + .await; + let resources = handle.wait_for_changes().await; + + // Ensure the one modified disk changed as we expected + assert_eq!(5, resources.disks().len()); + for raw_disk in expected { + let (disk, pool) = + resources.disks().get(raw_disk.identity()).unwrap(); + assert_eq!(disk.zpool_name(), raw_disk.zpool_name()); + assert_eq!(&pool.name, disk.zpool_name()); + assert_eq!(raw_disk.identity(), &pool.parent); + } + + // Cleanup + for zpool in zpools { + Zpool::destroy(&zpool).unwrap(); + } + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn upsert_filesystem() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("upsert_filesystem"); + let (mut key_manager, key_requester) = + KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); + let (manager, handle) = StorageManager::new(&logctx.log, key_requester); + + // Spawn the key_manager so that it will respond to requests for encryption keys + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + tokio::spawn(async move { + manager.run().await; + }); + + handle.key_manager_ready().await; + + // Create and add a disk + let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let dir = tempdir().unwrap(); + let disk: RawDisk = + SyntheticDisk::create_zpool(dir.path(), &zpool_name).into(); + handle.upsert_disk(disk.clone()).await; + + // Create a filesystem + let dataset_id = Uuid::new_v4(); + let dataset_name = + DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); + handle.upsert_filesystem(dataset_id, 
dataset_name).await.unwrap(); + + Zpool::destroy(&zpool_name).unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/sled-storage/src/pool.rs b/sled-storage/src/pool.rs new file mode 100644 index 0000000000..cc71aeb19d --- /dev/null +++ b/sled-storage/src/pool.rs @@ -0,0 +1,35 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! ZFS storage pool + +use crate::error::Error; +use illumos_utils::zpool::{Zpool, ZpoolInfo, ZpoolName}; +use omicron_common::disk::DiskIdentity; + +/// A ZFS storage pool wrapper that tracks information returned from +/// `zpool` commands +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Pool { + pub name: ZpoolName, + pub info: ZpoolInfo, + pub parent: DiskIdentity, +} + +impl Pool { + /// Queries for an existing Zpool by name. + /// + /// Returns Ok if the pool exists. + pub fn new(name: ZpoolName, parent: DiskIdentity) -> Result { + let info = Zpool::get_info(&name.to_string())?; + Ok(Pool { name, info, parent }) + } + + /// Return a Pool consisting of fake info + #[cfg(feature = "testing")] + pub fn new_with_fake_info(name: ZpoolName, parent: DiskIdentity) -> Pool { + let info = ZpoolInfo::new_hardcoded(name.to_string()); + Pool { name, info, parent } + } +} diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs new file mode 100644 index 0000000000..c1f460dc92 --- /dev/null +++ b/sled-storage/src/resources.rs @@ -0,0 +1,206 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Discovered and usable disks and zpools + +use crate::dataset::M2_DEBUG_DATASET; +use crate::disk::Disk; +use crate::error::Error; +use crate::pool::Pool; +use camino::Utf8PathBuf; +use cfg_if::cfg_if; +use illumos_utils::zpool::ZpoolName; +use omicron_common::disk::DiskIdentity; +use sled_hardware::DiskVariant; +use std::collections::BTreeMap; +use std::sync::Arc; + +// The directory within the debug dataset in which bundles are created. +const BUNDLE_DIRECTORY: &str = "bundle"; + +// The directory for zone bundles. +const ZONE_BUNDLE_DIRECTORY: &str = "zone"; + +pub enum AddDiskResult { + DiskInserted, + DiskAlreadyInserted, + DiskQueued, +} + +impl AddDiskResult { + pub fn disk_inserted(&self) -> bool { + match self { + AddDiskResult::DiskInserted => true, + _ => false, + } + } +} + +/// Storage related resources: disks and zpools +/// +/// This state is internal to the [`crate::manager::StorageManager`] task. Clones +/// of this state can be retrieved by requests to the `StorageManager` task +/// from the [`crate::manager::StorageHandle`]. This state is not `Sync`, and +/// as such does not require any mutexes. However, we do expect to share it +/// relatively frequently, and we want copies of it to be as cheaply made +/// as possible. So any large state is stored inside `Arc`s. On the other +/// hand, we expect infrequent updates to this state, and as such, we use +/// [`std::sync::Arc::make_mut`] to implement clone on write functionality +/// inside the `StorageManager` task if there are any outstanding copies. +/// Therefore, we only pay the cost to update infrequently, and no locks are +/// required by callers when operating on cloned data. The only contention here +/// is for the reference counters of the internal Arcs when `StorageResources` +/// gets cloned or dropped. 
+#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct StorageResources { + // All disks, real and synthetic, being managed by this sled + disks: Arc>, +} + +impl StorageResources { + /// Return a reference to the current snapshot of disks + pub fn disks(&self) -> &BTreeMap { + &self.disks + } + + /// Insert a disk and its zpool + /// + /// If the disk passed in is new or modified, or its pool size or pool + /// name changed, then insert the changed values and return `DiskInserted`. + /// Otherwise, do not insert anything and return `DiskAlreadyInserted`. + /// For instance, if only the pool health changes, because it is not one + /// of the checked values, we will not insert the update and will return + /// `DiskAlreadyInserted`. + pub(crate) fn insert_disk( + &mut self, + disk: Disk, + ) -> Result { + let disk_id = disk.identity().clone(); + let zpool_name = disk.zpool_name().clone(); + let zpool = Pool::new(zpool_name, disk_id.clone())?; + if let Some((stored_disk, stored_pool)) = self.disks.get(&disk_id) { + if stored_disk == &disk + && stored_pool.info.size() == zpool.info.size() + && stored_pool.name == zpool.name + { + return Ok(AddDiskResult::DiskAlreadyInserted); + } + } + // Either the disk or zpool changed + Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); + Ok(AddDiskResult::DiskInserted) + } + + /// Insert a disk while creating a fake pool + /// This is a workaround for current mock based testing strategies + /// in the sled-agent. 
+ #[cfg(feature = "testing")] + pub fn insert_fake_disk(&mut self, disk: Disk) -> AddDiskResult { + let disk_id = disk.identity().clone(); + let zpool_name = disk.zpool_name().clone(); + let zpool = Pool::new_with_fake_info(zpool_name, disk_id.clone()); + if self.disks.contains_key(&disk_id) { + return AddDiskResult::DiskAlreadyInserted; + } + // Either the disk or zpool changed + Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); + AddDiskResult::DiskInserted + } + + /// Delete a disk and its zpool + /// + /// Return true, if data was changed, false otherwise + /// + /// Note: We never allow removal of synthetic disks in production as they + /// are only added once. + pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) -> bool { + let Some((disk, _)) = self.disks.get(id) else { + return false; + }; + + cfg_if! { + if #[cfg(test)] { + // For testing purposes, we allow synthetic disks to be deleted. + // Silence an unused variable warning. + _ = disk; + } else { + // In production, we disallow removal of synthetic disks as they + // are only added once. + if disk.is_synthetic() { + return false; + } + } + } + + // Safe to unwrap as we just checked the key existed above + Arc::make_mut(&mut self.disks).remove(id).unwrap(); + true + } + + /// Returns the identity of the boot disk. + /// + /// If this returns `None`, we have not processed the boot disk yet. + pub fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { + for (id, (disk, _)) in self.disks.iter() { + if disk.is_boot_disk() { + return Some((id.clone(), disk.zpool_name().clone())); + } + } + None + } + + /// Returns all M.2 zpools + pub fn all_m2_zpools(&self) -> Vec { + self.all_zpools(DiskVariant::M2) + } + + /// Returns all U.2 zpools + pub fn all_u2_zpools(&self) -> Vec { + self.all_zpools(DiskVariant::U2) + } + + /// Returns all mountpoints within all M.2s for a particular dataset. 
+ pub fn all_m2_mountpoints(&self, dataset: &str) -> Vec { + self.all_m2_zpools() + .iter() + .map(|zpool| zpool.dataset_mountpoint(dataset)) + .collect() + } + + /// Returns all mountpoints within all U.2s for a particular dataset. + pub fn all_u2_mountpoints(&self, dataset: &str) -> Vec { + self.all_u2_zpools() + .iter() + .map(|zpool| zpool.dataset_mountpoint(dataset)) + .collect() + } + + pub fn get_all_zpools(&self) -> Vec<(ZpoolName, DiskVariant)> { + self.disks + .values() + .map(|(disk, _)| (disk.zpool_name().clone(), disk.variant())) + .collect() + } + + // Returns all zpools of a particular variant + fn all_zpools(&self, variant: DiskVariant) -> Vec { + self.disks + .values() + .filter_map(|(disk, _)| { + if disk.variant() == variant { + return Some(disk.zpool_name().clone()); + } + None + }) + .collect() + } + + /// Return the directories for storing zone service bundles. + pub fn all_zone_bundle_directories(&self) -> Vec { + self.all_m2_mountpoints(M2_DEBUG_DATASET) + .into_iter() + .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY)) + .collect() + } +} diff --git a/wicketd/src/artifacts/extracted_artifacts.rs b/wicketd/src/artifacts/extracted_artifacts.rs index 352d8ad3d5..b796201936 100644 --- a/wicketd/src/artifacts/extracted_artifacts.rs +++ b/wicketd/src/artifacts/extracted_artifacts.rs @@ -169,7 +169,7 @@ impl ExtractedArtifacts { /// /// As the returned file is written to, the data will be hashed; once /// writing is complete, call [`ExtractedArtifacts::store_tempfile()`] to - /// persist the temporary file into an [`ExtractedArtifactDataHandle()`]. + /// persist the temporary file into an [`ExtractedArtifactDataHandle`]. pub(super) fn new_tempfile( &self, ) -> Result {