From 4eb81539ddad077116afe04124b96da9876d57b3 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Thu, 28 Mar 2024 19:40:10 -0700
Subject: [PATCH] Expose Sled Agent API for "control plane disk management", use it (#5172)

# Overview

## Virtual Environment Changes

- Acting on Disks, not Zpools
  - Previously, sled agent could operate on "user-supplied zpools", which were
    created by `./tools/virtual_hardware.sh`.
  - Now, in a world where Nexus has more control over zpool allocation, the
    configuration can supply "virtual devices" instead of "zpools", giving
    RSS/Nexus control over when zpools actually get placed on these devices.
  - Impact:
    - `sled-agent/src/config.rs`
    - `smf/sled-agent/non-gimlet/config.toml`
    - `tools/virtual_hardware.sh`

## Sled Agent Changes

- HTTP API
  - The Sled Agent exposes an API to "set" and "get" the control plane
    physical disks specified by Nexus. The set of control plane physical disks
    (usable U.2s) is stored in a ledger on the M.2s (as
    `omicron-physical-disks.json`). This set also determines which disks are
    available to the rest of the sled agent.
- StorageManager
  - **Before**: When physical U.2 disks are detected by the Sled Agent, they
    are auto-formatted if empty, and Nexus is notified about them. This
    "upserts" them into the DB, so they are effectively adopted into the
    control plane automatically.
  - **After**: As discussed in RFD 457, we want to get to a world where
    physical U.2 disks are **detected** by the Sled Agent but not **used**
    until RSS/Nexus explicitly tells the Sled Agent to "use this sled as part
    of the control plane". This set of "in-use control plane disks" is stored
    in a "ledger" file on the M.2.
  - **Transition**: On deployed systems, we need to boot up to Nexus even
    though we don't have a ledger of control plane disks. Within the
    implementation of `StorageManager::key_manager_ready`, we implement a
    workaround: if we detect a system with no ledger but with zpools, we use
    that set of zpools unconditionally until told otherwise. This is a
    short-term workaround to migrate existing systems, and can be removed once
    deployed racks reliably have ledgers for control plane disks.
- StorageManagerTestHarness
  - In an effort to reduce "test fakes" and replace them with real storage,
    `StorageManagerTestHarness` provides testing utilities for spinning up
    vdevs, formatting them with zpools, and managing them. This helps us avoid
    a fair bit of bifurcation between "test-only synthetic disks" and "real
    disks", though it does mean many of our sled-agent tests are now
    illumos-only.

## RSS Changes

- RSS is now responsible for provisioning control plane disks and zpools
  during initial bootstrapping.
- RSS informs Nexus about the allocation decisions it makes via the RSS
  handoff.

## Nexus Changes

- Nexus exposes a smaller API (no notification of "disk add/remove, zpool
  add/remove"). It receives a handoff from RSS, and will later be in charge of
  provisioning decisions based on inventory.
- Dynamically adding/removing disks/zpools after RSS will appear in a
  subsequent PR.

---------

Co-authored-by: Andrew J.
Stone --- .github/buildomat/jobs/deploy.sh | 13 +- Cargo.lock | 5 + Cargo.toml | 1 + clients/sled-agent-client/src/lib.rs | 11 +- common/src/api/external/mod.rs | 1 + common/src/ledger.rs | 5 +- illumos-utils/Cargo.toml | 4 +- illumos-utils/src/zfs.rs | 61 +- illumos-utils/src/zpool.rs | 17 +- installinator/src/hardware.rs | 13 +- installinator/src/write.rs | 1 + key-manager/src/lib.rs | 2 +- nexus/db-model/src/physical_disk.rs | 15 +- nexus/db-queries/src/authz/api_resources.rs | 2 +- .../src/authz/policy_test/resources.rs | 6 +- nexus/db-queries/src/db/datastore/dataset.rs | 5 +- nexus/db-queries/src/db/datastore/mod.rs | 17 +- .../src/db/datastore/physical_disk.rs | 39 +- nexus/db-queries/src/db/datastore/rack.rs | 46 +- nexus/db-queries/src/db/datastore/zpool.rs | 21 +- nexus/db-queries/src/db/lookup.rs | 20 +- nexus/db-queries/tests/output/authz-roles.out | 2 +- nexus/inventory/src/examples.rs | 8 +- .../reconfigurator/execution/src/datasets.rs | 4 +- nexus/src/app/rack.rs | 51 +- nexus/src/app/sled.rs | 69 +- nexus/src/internal_api/http_entrypoints.rs | 81 +- nexus/src/lib.rs | 28 +- nexus/test-interface/src/lib.rs | 14 +- nexus/test-utils/src/lib.rs | 2 + nexus/test-utils/src/resource_helpers.rs | 109 +- nexus/tests/integration_tests/mod.rs | 1 - nexus/tests/integration_tests/sleds.rs | 46 +- nexus/tests/integration_tests/switches.rs | 45 +- nexus/tests/integration_tests/zpools.rs | 128 -- nexus/types/src/internal_api/params.rs | 39 +- nexus/types/src/inventory.rs | 2 +- openapi/nexus-internal.json | 198 +- openapi/sled-agent.json | 219 +++ schema/omicron-physical-disks.json | 74 + schema/rss-service-plan-v3.json | 848 +++++++++ sled-agent/Cargo.toml | 2 +- sled-agent/src/bootstrap/bootstore_setup.rs | 16 +- sled-agent/src/bootstrap/pre_server.rs | 4 - sled-agent/src/bootstrap/server.rs | 27 +- sled-agent/src/config.rs | 5 +- sled-agent/src/dump_setup.rs | 101 +- sled-agent/src/hardware_monitor.rs | 27 +- sled-agent/src/http_entrypoints.rs | 36 +- sled-agent/src/instance.rs | 245 +-- sled-agent/src/instance_manager.rs | 2 +- sled-agent/src/long_running_tasks.rs | 46 +- sled-agent/src/nexus.rs | 10 - sled-agent/src/params.rs | 5 + sled-agent/src/probe_manager.rs | 2 +- sled-agent/src/rack_setup/plan/service.rs | 154 +- sled-agent/src/rack_setup/plan/sled.rs | 4 +- sled-agent/src/rack_setup/service.rs | 277 ++- sled-agent/src/server.rs | 4 - sled-agent/src/services.rs | 163 +- sled-agent/src/sim/http_entrypoints.rs | 30 +- sled-agent/src/sim/server.rs | 26 +- sled-agent/src/sim/sled_agent.rs | 49 +- sled-agent/src/sim/storage.rs | 159 +- sled-agent/src/sled_agent.rs | 77 +- sled-agent/src/storage_monitor.rs | 344 +--- sled-agent/src/vmm_reservoir.rs | 3 +- sled-agent/src/zone_bundle.rs | 217 +-- sled-hardware/src/disk.rs | 224 ++- sled-hardware/src/illumos/partitions.rs | 32 +- sled-hardware/src/non_illumos/mod.rs | 1 + sled-storage/Cargo.toml | 8 +- sled-storage/src/config.rs | 39 + sled-storage/src/dataset.rs | 22 +- sled-storage/src/disk.rs | 261 ++- sled-storage/src/error.rs | 48 +- sled-storage/src/keyfile.rs | 3 +- sled-storage/src/lib.rs | 3 + sled-storage/src/manager.rs | 1675 ++++++++++------- sled-storage/src/manager_test_harness.rs | 393 ++++ sled-storage/src/resources.rs | 578 ++++-- smf/sled-agent/non-gimlet/config.toml | 40 +- tools/create_gimlet_virtual_hardware.sh | 2 +- tools/create_scrimlet_virtual_hardware.sh | 2 +- tools/create_virtual_hardware.sh | 2 +- tools/virtual_hardware.sh | 28 +- 86 files changed, 5037 insertions(+), 2632 deletions(-) delete mode 100644 
nexus/tests/integration_tests/zpools.rs create mode 100644 schema/omicron-physical-disks.json create mode 100644 schema/rss-service-plan-v3.json create mode 100644 sled-storage/src/config.rs create mode 100644 sled-storage/src/manager_test_harness.rs diff --git a/.github/buildomat/jobs/deploy.sh b/.github/buildomat/jobs/deploy.sh index 6574ac839c..9f0629d4c1 100755 --- a/.github/buildomat/jobs/deploy.sh +++ b/.github/buildomat/jobs/deploy.sh @@ -205,7 +205,7 @@ PXA_END="$EXTRA_IP_END" export GATEWAY_IP GATEWAY_MAC PXA_START PXA_END pfexec zpool create -f scratch c1t1d0 c2t1d0 -ZPOOL_VDEV_DIR=/scratch ptime -m pfexec ./tools/create_virtual_hardware.sh +VDEV_DIR=/scratch ptime -m pfexec ./tools/create_virtual_hardware.sh # # Generate a self-signed certificate to use as the initial TLS certificate for @@ -214,7 +214,12 @@ ZPOOL_VDEV_DIR=/scratch ptime -m pfexec ./tools/create_virtual_hardware.sh # real system, the certificate would come from the customer during initial rack # setup on the technician port. # -tar xf out/omicron-sled-agent.tar pkg/config-rss.toml +tar xf out/omicron-sled-agent.tar pkg/config-rss.toml pkg/config.toml + +# Update the vdevs to point to where we've created them +sed -E -i~ "s/(m2|u2)(.*\.vdev)/\/scratch\/\1\2/g" pkg/config.toml +diff -u pkg/config.toml{~,} || true + SILO_NAME="$(sed -n 's/silo_name = "\(.*\)"/\1/p' pkg/config-rss.toml)" EXTERNAL_DNS_DOMAIN="$(sed -n 's/external_dns_zone_name = "\(.*\)"/\1/p' pkg/config-rss.toml)" @@ -241,8 +246,8 @@ addresses = \\[\"$UPLINK_IP/24\"\\] " pkg/config-rss.toml diff -u pkg/config-rss.toml{~,} || true -tar rvf out/omicron-sled-agent.tar pkg/config-rss.toml -rm -f pkg/config-rss.toml* +tar rvf out/omicron-sled-agent.tar pkg/config-rss.toml pkg/config.toml +rm -f pkg/config-rss.toml* pkg/config.toml* # # By default, OpenSSL creates self-signed certificates with "CA:true". The TLS diff --git a/Cargo.lock b/Cargo.lock index e1d684da52..d1df69b608 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3565,6 +3565,7 @@ dependencies = [ "tokio", "toml 0.8.10", "uuid 1.7.0", + "whoami", "zone 0.3.0", ] @@ -8738,11 +8739,15 @@ dependencies = [ name = "sled-storage" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", "camino", "camino-tempfile", "cfg-if", + "debug-ignore", "derive_more", + "expectorate", + "futures", "glob", "illumos-utils", "key-manager", diff --git a/Cargo.toml b/Cargo.toml index 0d91aa076b..a384c8bed6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -444,6 +444,7 @@ update-engine = { path = "update-engine" } usdt = "0.5.0" uuid = { version = "1.7.0", features = ["serde", "v4"] } walkdir = "2.4" +whoami = "1.5" wicket = { path = "wicket" } wicket-common = { path = "wicket-common" } wicketd-client = { path = "clients/wicketd-client" } diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 2901226d16..d500bdca3a 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -35,6 +35,7 @@ progenitor::generate_api!( // replace directives below? 
replace = { ByteCount = omicron_common::api::external::ByteCount, + DiskIdentity = omicron_common::disk::DiskIdentity, Generation = omicron_common::api::external::Generation, MacAddr = omicron_common::api::external::MacAddr, Name = omicron_common::api::external::Name, @@ -230,16 +231,6 @@ impl omicron_common::api::external::ClientError for types::Error { } } -impl From for omicron_common::disk::DiskIdentity { - fn from(identity: types::DiskIdentity) -> Self { - Self { - vendor: identity.vendor, - serial: identity.serial, - model: identity.model, - } - } -} - impl From for types::InstanceRuntimeState { diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 324231f469..4eecd74a04 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -881,6 +881,7 @@ pub enum ResourceType { ServiceNetworkInterface, Sled, SledInstance, + SledLedger, Switch, SagaDbg, Snapshot, diff --git a/common/src/ledger.rs b/common/src/ledger.rs index 71d03fa8ee..ed5f0b57cf 100644 --- a/common/src/ledger.rs +++ b/common/src/ledger.rs @@ -7,7 +7,7 @@ use async_trait::async_trait; use camino::{Utf8Path, Utf8PathBuf}; use serde::{de::DeserializeOwned, Serialize}; -use slog::{debug, info, warn, Logger}; +use slog::{debug, error, info, warn, Logger}; #[derive(thiserror::Error, Debug)] pub enum Error { @@ -127,7 +127,7 @@ impl Ledger { let mut one_successful_write = false; for path in self.paths.iter() { if let Err(e) = self.atomic_write(&path).await { - warn!(self.log, "Failed to write to {}: {e}", path); + warn!(self.log, "Failed to write ledger"; "path" => ?path, "err" => ?e); failed_paths.push((path.to_path_buf(), e)); } else { one_successful_write = true; @@ -135,6 +135,7 @@ impl Ledger { } if !one_successful_write { + error!(self.log, "No successful writes to ledger"); return Err(Error::FailedToWrite { failed_paths }); } Ok(()) diff --git a/illumos-utils/Cargo.toml b/illumos-utils/Cargo.toml index e4a99095fd..39b24f7ccd 100644 --- a/illumos-utils/Cargo.toml +++ b/illumos-utils/Cargo.toml @@ -28,6 +28,7 @@ smf.workspace = true thiserror.workspace = true tokio.workspace = true uuid.workspace = true +whoami.workspace = true zone.workspace = true # only enabled via the `testing` feature @@ -46,6 +47,3 @@ toml.workspace = true [features] # Enable to generate MockZones testing = ["mockall"] -# Useful for tests that want real functionality and ability to run without -# pfexec -tmp_keypath = [] diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index c111955761..3dbf018ecc 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -5,7 +5,7 @@ //! Utilities for poking at ZFS. use crate::{execute, PFEXEC}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use omicron_common::disk::DiskIdentity; use std::fmt; @@ -28,8 +28,6 @@ pub const ZFS: &str = "/usr/sbin/zfs"; /// the keys and recreate the files on demand when creating and mounting /// encrypted filesystems. We then zero them and unlink them. pub const KEYPATH_ROOT: &str = "/var/run/oxide/"; -// Use /tmp so we don't have to worry about running tests with pfexec -pub const TEST_KEYPATH_ROOT: &str = "/tmp"; /// Error returned by [`Zfs::list_datasets`]. 
#[derive(thiserror::Error, Debug)] @@ -168,27 +166,34 @@ impl fmt::Display for Keypath { } } -#[cfg(not(feature = "tmp_keypath"))] -impl From<&DiskIdentity> for Keypath { - fn from(id: &DiskIdentity) -> Self { - build_keypath(id, KEYPATH_ROOT) - } -} - -#[cfg(feature = "tmp_keypath")] -impl From<&DiskIdentity> for Keypath { - fn from(id: &DiskIdentity) -> Self { - build_keypath(id, TEST_KEYPATH_ROOT) +impl Keypath { + /// Constructs a Keypath for the specified disk within the supplied root + /// directory. + /// + /// By supplying "root", tests can override the location where these paths + /// are stored to non-global locations. + pub fn new>(id: &DiskIdentity, root: &P) -> Keypath { + let keypath_root = Utf8PathBuf::from(KEYPATH_ROOT); + let mut keypath = keypath_root.as_path(); + let keypath_directory = loop { + match keypath.strip_prefix("/") { + Ok(stripped) => keypath = stripped, + Err(_) => break root.as_ref().join(keypath), + } + }; + std::fs::create_dir_all(&keypath_directory) + .expect("Cannot ensure directory for keys"); + + let filename = format!( + "{}-{}-{}-zfs-aes-256-gcm.key", + id.vendor, id.serial, id.model + ); + let path: Utf8PathBuf = + [keypath_directory.as_str(), &filename].iter().collect(); + Keypath(path) } } -fn build_keypath(id: &DiskIdentity, root: &str) -> Keypath { - let filename = - format!("{}-{}-{}-zfs-aes-256-gcm.key", id.vendor, id.serial, id.model); - let path: Utf8PathBuf = [root, &filename].iter().collect(); - Keypath(path) -} - #[derive(Debug)] pub struct EncryptionDetails { pub keypath: Keypath, @@ -332,6 +337,20 @@ impl Zfs { err: err.into(), })?; + // We ensure that the currently running process has the ability to + // act on the underlying mountpoint. + if !zoned { + let mut command = std::process::Command::new(PFEXEC); + let user = whoami::username(); + let mount = format!("{mountpoint}"); + let cmd = command.args(["chown", "-R", &user, &mount]); + execute(cmd).map_err(|err| EnsureFilesystemError { + name: name.to_string(), + mountpoint: mountpoint.clone(), + err: err.into(), + })?; + } + if let Some(SizeDetails { quota, compression }) = size_details { // Apply any quota and compression mode. Self::apply_properties(name, &mountpoint, quota, compression)?; diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index f2c395e22b..27d7e0d700 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -12,10 +12,12 @@ use std::fmt; use std::str::FromStr; use uuid::Uuid; -const ZPOOL_EXTERNAL_PREFIX: &str = "oxp_"; -const ZPOOL_INTERNAL_PREFIX: &str = "oxi_"; +pub const ZPOOL_EXTERNAL_PREFIX: &str = "oxp_"; +pub const ZPOOL_INTERNAL_PREFIX: &str = "oxi_"; const ZPOOL: &str = "/usr/sbin/zpool"; +pub const ZPOOL_MOUNTPOINT_ROOT: &str = "/"; + #[derive(thiserror::Error, Debug, PartialEq, Eq)] #[error("Failed to parse output: {0}")] pub struct ParseError(String); @@ -192,7 +194,7 @@ impl Zpool { let mut cmd = std::process::Command::new(PFEXEC); cmd.env_clear(); cmd.env("LC_ALL", "C.UTF-8"); - cmd.arg(ZPOOL).arg("create"); + cmd.arg(ZPOOL).args(["create", "-o", "ashift=12"]); cmd.arg(&name.to_string()); cmd.arg(vdev); execute(&mut cmd).map_err(Error::from)?; @@ -374,9 +376,14 @@ impl ZpoolName { /// Returns a path to a dataset's mountpoint within the zpool. 
/// /// For example: oxp_(UUID) -> /pool/ext/(UUID)/(dataset) - pub fn dataset_mountpoint(&self, dataset: &str) -> Utf8PathBuf { + pub fn dataset_mountpoint( + &self, + root: &Utf8Path, + dataset: &str, + ) -> Utf8PathBuf { let mut path = Utf8PathBuf::new(); - path.push("/pool"); + path.push(root); + path.push("pool"); match self.kind { ZpoolKind::External => path.push("ext"), ZpoolKind::Internal => path.push("int"), diff --git a/installinator/src/hardware.rs b/installinator/src/hardware.rs index b037384cbe..90859e3754 100644 --- a/installinator/src/hardware.rs +++ b/installinator/src/hardware.rs @@ -9,6 +9,7 @@ use anyhow::Result; use sled_hardware::DiskVariant; use sled_hardware::HardwareManager; use sled_hardware::SledMode; +use sled_storage::config::MountConfig; use sled_storage::disk::Disk; use sled_storage::disk::RawDisk; use slog::info; @@ -49,9 +50,15 @@ impl Hardware { ); } DiskVariant::M2 => { - let disk = Disk::new(log, disk, None) - .await - .context("failed to instantiate Disk handle for M.2")?; + let disk = Disk::new( + log, + &MountConfig::default(), + disk, + None, + None, + ) + .await + .context("failed to instantiate Disk handle for M.2")?; m2_disks.push(disk); } } diff --git a/installinator/src/write.rs b/installinator/src/write.rs index 380595b4cd..c7710baff7 100644 --- a/installinator/src/write.rs +++ b/installinator/src/write.rs @@ -116,6 +116,7 @@ impl WriteDestination { let zpool_name = disk.zpool_name().clone(); let control_plane_dir = zpool_name.dataset_mountpoint( + illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT.into(), sled_storage::dataset::INSTALL_DATASET, ); diff --git a/key-manager/src/lib.rs b/key-manager/src/lib.rs index 7ca3cfa3bb..13dd9543a8 100644 --- a/key-manager/src/lib.rs +++ b/key-manager/src/lib.rs @@ -102,7 +102,7 @@ enum StorageKeyRequest { /// the sled-agent starts. The `HardwareMonitor` gets the StorageKeyRequester /// from the bootstrap agent. If this changes, we should remove the `Clone` to /// limit who has access to the storage keys. -#[derive(Clone)] +#[derive(Debug, Clone)] pub struct StorageKeyRequester { tx: mpsc::Sender, } diff --git a/nexus/db-model/src/physical_disk.rs b/nexus/db-model/src/physical_disk.rs index 3628f7077f..3a011d0c72 100644 --- a/nexus/db-model/src/physical_disk.rs +++ b/nexus/db-model/src/physical_disk.rs @@ -29,6 +29,7 @@ pub struct PhysicalDisk { impl PhysicalDisk { pub fn new( + id: Uuid, vendor: String, serial: String, model: String, @@ -36,7 +37,7 @@ impl PhysicalDisk { sled_id: Uuid, ) -> Self { Self { - identity: PhysicalDiskIdentity::new(Uuid::new_v4()), + identity: PhysicalDiskIdentity::new(id), time_deleted: None, rcgen: Generation::new(), vendor, @@ -47,20 +48,10 @@ impl PhysicalDisk { } } - pub fn uuid(&self) -> Uuid { + pub fn id(&self) -> Uuid { self.identity.id } - // This is slightly gross, but: - // the `authz_resource` macro really expects that the "primary_key" - // for an object can be acquired by "id()". - // - // The PhysicalDisk object does actually have a separate convenience - // UUID, but may be looked by up vendor/serial/model too. - pub fn id(&self) -> (String, String, String) { - (self.vendor.clone(), self.serial.clone(), self.model.clone()) - } - pub fn time_deleted(&self) -> Option> { self.time_deleted } diff --git a/nexus/db-queries/src/authz/api_resources.rs b/nexus/db-queries/src/authz/api_resources.rs index 70bc9ab2eb..69b883a8cf 100644 --- a/nexus/db-queries/src/authz/api_resources.rs +++ b/nexus/db-queries/src/authz/api_resources.rs @@ -1060,7 +1060,7 @@ authz_resource! 
{ authz_resource! { name = "PhysicalDisk", parent = "Fleet", - primary_key = (String, String, String), + primary_key = Uuid, roles_allowed = false, polar_snippet = FleetChild, } diff --git a/nexus/db-queries/src/authz/policy_test/resources.rs b/nexus/db-queries/src/authz/policy_test/resources.rs index 96cefb3db4..bc30e77fac 100644 --- a/nexus/db-queries/src/authz/policy_test/resources.rs +++ b/nexus/db-queries/src/authz/policy_test/resources.rs @@ -102,10 +102,12 @@ pub async fn make_resources( make_services(&mut builder).await; + let physical_disk_id = + "c9f923f6-caf3-4c83-96f9-8ffe8c627dd2".parse().unwrap(); builder.new_resource(authz::PhysicalDisk::new( authz::FLEET, - ("vendor".to_string(), "serial".to_string(), "model".to_string()), - LookupType::ByCompositeId("vendor-serial-model".to_string()), + physical_disk_id, + LookupType::ById(physical_disk_id), )); let device_user_code = String::from("a-device-user-code"); diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index 292f13354f..bfc4d61926 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -230,7 +230,10 @@ mod test { // Create a fake zpool that backs our fake datasets. let zpool_id = Uuid::new_v4(); let zpool = Zpool::new(zpool_id, sled_id, Uuid::new_v4()); - datastore.zpool_upsert(zpool).await.expect("failed to upsert zpool"); + datastore + .zpool_upsert(opctx, zpool) + .await + .expect("failed to upsert zpool"); // Inserting a new dataset should succeed. let dataset1 = datastore diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 13d6bfcc8d..a6ae108376 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -640,6 +640,7 @@ mod test { kind: PhysicalDiskKind, ) -> Uuid { let physical_disk = PhysicalDisk::new( + Uuid::new_v4(), TEST_VENDOR.into(), TEST_SERIAL.into(), TEST_MODEL.into(), @@ -650,17 +651,19 @@ mod test { .physical_disk_upsert(opctx, physical_disk.clone()) .await .expect("Failed to upsert physical disk"); - physical_disk.uuid() + physical_disk.id() } // Creates a test zpool, returns its UUID. async fn create_test_zpool( datastore: &DataStore, + opctx: &OpContext, sled_id: Uuid, physical_disk_id: Uuid, ) -> Uuid { let zpool_id = create_test_zpool_not_in_inventory( datastore, + opctx, sled_id, physical_disk_id, ) @@ -676,12 +679,13 @@ mod test { // However, this helper doesn't add the zpool to the inventory just yet. 
async fn create_test_zpool_not_in_inventory( datastore: &DataStore, + opctx: &OpContext, sled_id: Uuid, physical_disk_id: Uuid, ) -> Uuid { let zpool_id = Uuid::new_v4(); let zpool = Zpool::new(zpool_id, sled_id, physical_disk_id); - datastore.zpool_upsert(zpool).await.unwrap(); + datastore.zpool_upsert(opctx, zpool).await.unwrap(); zpool_id } @@ -856,6 +860,7 @@ mod test { .then(|disk| { let pool_id_future = create_test_zpool( &datastore, + &opctx, disk.sled_id, disk.disk_id, ); @@ -1232,6 +1237,7 @@ mod test { .then(|_| { create_test_zpool_not_in_inventory( &datastore, + &opctx, sled_id, physical_disk_id, ) @@ -1327,7 +1333,12 @@ mod test { let zpool_ids: Vec = stream::iter(0..REGION_REDUNDANCY_THRESHOLD - 1) .then(|_| { - create_test_zpool(&datastore, sled_id, physical_disk_id) + create_test_zpool( + &datastore, + &opctx, + sled_id, + physical_disk_id, + ) }) .collect() .await; diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index 81fc14d1d7..b977c4dffe 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -42,6 +42,15 @@ impl DataStore { &self, opctx: &OpContext, disk: PhysicalDisk, + ) -> CreateResult { + let conn = &*self.pool_connection_authorized(&opctx).await?; + Self::physical_disk_upsert_on_connection(&conn, opctx, disk).await + } + + pub async fn physical_disk_upsert_on_connection( + conn: &async_bb8_diesel::Connection, + opctx: &OpContext, + disk: PhysicalDisk, ) -> CreateResult { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::physical_disk::dsl; @@ -60,9 +69,7 @@ impl DataStore { dsl::time_modified.eq(now), )), ) - .insert_and_get_result_async( - &*self.pool_connection_authorized(&opctx).await?, - ) + .insert_and_get_result_async(conn) .await .map_err(|e| match e { AsyncInsertError::CollectionNotFound => Error::ObjectNotFound { @@ -203,6 +210,7 @@ mod test { // Insert a disk let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -213,7 +221,7 @@ mod test { .physical_disk_upsert(&opctx, disk.clone()) .await .expect("Failed first attempt at upserting disk"); - assert_eq!(disk.uuid(), first_observed_disk.uuid()); + assert_eq!(disk.id(), first_observed_disk.id()); assert_disks_equal_ignore_uuid(&disk, &first_observed_disk); // Observe the inserted disk @@ -223,11 +231,12 @@ mod test { .await .expect("Failed to list physical disks"); assert_eq!(disks.len(), 1); - assert_eq!(disk.uuid(), disks[0].uuid()); + assert_eq!(disk.id(), disks[0].id()); assert_disks_equal_ignore_uuid(&disk, &disks[0]); // Insert the same disk, with a different UUID primary key let disk_again = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -240,8 +249,8 @@ mod test { .expect("Failed second upsert of physical disk"); // This check is pretty important - note that we return the original // UUID, not the new one. 
- assert_ne!(disk_again.uuid(), second_observed_disk.uuid()); - assert_eq!(disk_again.id(), second_observed_disk.id()); + assert_eq!(disk.id(), second_observed_disk.id()); + assert_ne!(disk_again.id(), second_observed_disk.id()); assert_disks_equal_ignore_uuid(&disk_again, &second_observed_disk); assert!( first_observed_disk.time_modified() @@ -255,8 +264,8 @@ mod test { // We'll use the old primary key assert_eq!(disks.len(), 1); - assert_eq!(disk.uuid(), disks[0].uuid()); - assert_ne!(disk_again.uuid(), disks[0].uuid()); + assert_eq!(disk.id(), disks[0].id()); + assert_ne!(disk_again.id(), disks[0].id()); assert_disks_equal_ignore_uuid(&disk, &disks[0]); assert_disks_equal_ignore_uuid(&disk_again, &disks[0]); @@ -276,6 +285,7 @@ mod test { // Insert a disk let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -286,14 +296,14 @@ mod test { .physical_disk_upsert(&opctx, disk.clone()) .await .expect("Failed first attempt at upserting disk"); - assert_eq!(disk.uuid(), first_observed_disk.uuid()); + assert_eq!(disk.id(), first_observed_disk.id()); // Insert a disk with an identical UUID let second_observed_disk = datastore .physical_disk_upsert(&opctx, disk.clone()) .await .expect("Should have succeeded upserting disk"); - assert_eq!(disk.uuid(), second_observed_disk.uuid()); + assert_eq!(disk.id(), second_observed_disk.id()); assert!( first_observed_disk.time_modified() <= second_observed_disk.time_modified() @@ -326,6 +336,7 @@ mod test { // Insert a disk let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -339,6 +350,7 @@ mod test { // Insert a second disk let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Noxide"), String::from("456"), String::from("UnrealDisk"), @@ -371,6 +383,7 @@ mod test { // Insert a disk let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -439,6 +452,7 @@ mod test { // Insert a disk let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -485,6 +499,7 @@ mod test { // "Report the disk" from the second sled let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -530,6 +545,7 @@ mod test { // Insert a disk let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), @@ -554,6 +570,7 @@ mod test { // "Report the disk" from the second sled let disk = PhysicalDisk::new( + Uuid::new_v4(), String::from("Oxide"), String::from("123"), String::from("FakeDisk"), diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index e753a0cf09..09f635e0f3 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -23,6 +23,7 @@ use crate::db::fixed_data::vpc_subnet::NTP_VPC_SUBNET; use crate::db::identity::Asset; use crate::db::model::Dataset; use crate::db::model::IncompleteExternalIp; +use crate::db::model::PhysicalDisk; use crate::db::model::Rack; use crate::db::model::Zpool; use crate::db::pagination::paginated; @@ -70,6 +71,8 @@ pub struct RackInit { pub rack_subnet: IpNetwork, pub blueprint: Blueprint, pub services: Vec, + pub physical_disks: Vec, + pub zpools: Vec, pub datasets: Vec, pub service_ip_pool_ranges: Vec, pub internal_dns: InitialDnsGroup, @@ -90,6 +93,8 @@ enum RackInitError { 
BlueprintTargetSet(Error), ServiceInsert(Error), DatasetInsert { err: AsyncInsertError, zpool_id: Uuid }, + PhysicalDiskInsert(Error), + ZpoolInsert(Error), RackUpdate { err: DieselError, rack_id: Uuid }, DnsSerialization(Error), Silo(Error), @@ -126,6 +131,8 @@ impl From for Error { public_error_from_diesel(e, ErrorHandler::Server) } }, + RackInitError::PhysicalDiskInsert(err) => err, + RackInitError::ZpoolInsert(err) => err, RackInitError::ServiceInsert(err) => Error::internal_error( &format!("failed to insert Service record: {:#}", err), ), @@ -610,6 +617,8 @@ impl DataStore { let rack_id = rack_init.rack_id; let blueprint = rack_init.blueprint; let services = rack_init.services; + let physical_disks = rack_init.physical_disks; + let zpools = rack_init.zpools; let datasets = rack_init.datasets; let service_ip_pool_ranges = rack_init.service_ip_pool_ranges; @@ -640,7 +649,14 @@ impl DataStore { return Ok::<_, DieselError>(rack); } - // Otherwise, insert blueprint and datasets. + // Otherwise, insert: + // - Services + // - PhysicalDisks + // - Zpools + // - Datasets + // - A blueprint + // + // Which RSS has already allocated during bootstrapping. // Set up the IP pool for internal services. for range in service_ip_pool_ranges { @@ -713,12 +729,38 @@ impl DataStore { ) .await .map_err(|e| { + error!(log, "Failed to upsert physical disk"; "err" => ?e); err.set(e).unwrap(); DieselError::RollbackTransaction })?; } info!(log, "Inserted services"); + for physical_disk in physical_disks { + Self::physical_disk_upsert_on_connection(&conn, &opctx, physical_disk) + .await + .map_err(|e| { + error!(log, "Failed to upsert physical disk"; "err" => #%e); + err.set(RackInitError::PhysicalDiskInsert(e)) + .unwrap(); + DieselError::RollbackTransaction + })?; + } + + info!(log, "Inserted physical disks"); + + for zpool in zpools { + Self::zpool_upsert_on_connection(&conn, &opctx, zpool).await.map_err( + |e| { + error!(log, "Failed to upsert zpool"; "err" => #%e); + err.set(RackInitError::ZpoolInsert(e)).unwrap(); + DieselError::RollbackTransaction + }, + )?; + } + + info!(log, "Inserted zpools"); + for dataset in datasets { use db::schema::dataset::dsl; let zpool_id = dataset.pool_id; @@ -954,6 +996,8 @@ mod test { comment: "test suite".to_string(), }, services: vec![], + physical_disks: vec![], + zpools: vec![], datasets: vec![], service_ip_pool_ranges: vec![], internal_dns: InitialDnsGroup::new( diff --git a/nexus/db-queries/src/db/datastore/zpool.rs b/nexus/db-queries/src/db/datastore/zpool.rs index b894d5c509..0ab6bcf3af 100644 --- a/nexus/db-queries/src/db/datastore/zpool.rs +++ b/nexus/db-queries/src/db/datastore/zpool.rs @@ -32,8 +32,23 @@ use omicron_common::api::external::ResourceType; use uuid::Uuid; impl DataStore { + pub async fn zpool_upsert( + &self, + opctx: &OpContext, + zpool: Zpool, + ) -> CreateResult { + let conn = &*self.pool_connection_authorized(&opctx).await?; + Self::zpool_upsert_on_connection(&conn, opctx, zpool).await + } + /// Stores a new zpool in the database. 
- pub async fn zpool_upsert(&self, zpool: Zpool) -> CreateResult { + pub async fn zpool_upsert_on_connection( + conn: &async_bb8_diesel::Connection, + opctx: &OpContext, + zpool: Zpool, + ) -> CreateResult { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + use db::schema::zpool::dsl; let sled_id = zpool.sled_id; @@ -48,9 +63,7 @@ impl DataStore { dsl::sled_id.eq(excluded(dsl::sled_id)), )), ) - .insert_and_get_result_async( - &*self.pool_connection_unauthorized().await?, - ) + .insert_and_get_result_async(conn) .await .map_err(|e| match e { AsyncInsertError::CollectionNotFound => Error::ObjectNotFound { diff --git a/nexus/db-queries/src/db/lookup.rs b/nexus/db-queries/src/db/lookup.rs index 380c9db140..487a68b517 100644 --- a/nexus/db-queries/src/db/lookup.rs +++ b/nexus/db-queries/src/db/lookup.rs @@ -364,18 +364,8 @@ impl<'a> LookupPath<'a> { } /// Select a resource of type PhysicalDisk, identified by its id - pub fn physical_disk( - self, - vendor: &str, - serial: &str, - model: &str, - ) -> PhysicalDisk<'a> { - PhysicalDisk::PrimaryKey( - Root { lookup_root: self }, - vendor.to_string(), - serial.to_string(), - model.to_string(), - ) + pub fn physical_disk(self, id: Uuid) -> PhysicalDisk<'a> { + PhysicalDisk::PrimaryKey(Root { lookup_root: self }, id) } pub fn silo_image_id(self, id: Uuid) -> SiloImage<'a> { @@ -836,11 +826,7 @@ lookup_resource! { children = [], lookup_by_name = false, soft_deletes = true, - primary_key_columns = [ - { column_name = "vendor", rust_type = String }, - { column_name = "serial", rust_type = String }, - { column_name = "model", rust_type = String } - ] + primary_key_columns = [ { column_name = "id", rust_type = Uuid } ] } lookup_resource! { diff --git a/nexus/db-queries/tests/output/authz-roles.out b/nexus/db-queries/tests/output/authz-roles.out index ee55d775f0..0482cdfd2a 100644 --- a/nexus/db-queries/tests/output/authz-roles.out +++ b/nexus/db-queries/tests/output/authz-roles.out @@ -894,7 +894,7 @@ resource: Service id "7f7bb301-5dc9-41f1-ab29-d369f4835079" silo1-proj1-viewer ✘ ✘ ✘ ✘ ✘ ✘ ✘ ✘ unauthenticated ! ! ! ! ! ! ! ! -resource: PhysicalDisk id "vendor-serial-model" +resource: PhysicalDisk id "c9f923f6-caf3-4c83-96f9-8ffe8c627dd2" USER Q R LC RP M MP CC D fleet-admin ✘ ✔ ✔ ✔ ✔ ✔ ✔ ✔ diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 5cc6b687d4..8af81d957d 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -276,7 +276,7 @@ pub fn representative() -> Representative { let disks = vec![ // Let's say we have one manufacturer for our M.2... sled_agent_client::types::InventoryDisk { - identity: sled_agent_client::types::DiskIdentity { + identity: omicron_common::disk::DiskIdentity { vendor: "macrohard".to_string(), model: "box".to_string(), serial: "XXIV".to_string(), @@ -286,7 +286,7 @@ pub fn representative() -> Representative { }, // ... 
and a couple different vendors for our U.2s sled_agent_client::types::InventoryDisk { - identity: sled_agent_client::types::DiskIdentity { + identity: omicron_common::disk::DiskIdentity { vendor: "memetendo".to_string(), model: "swatch".to_string(), serial: "0001".to_string(), @@ -295,7 +295,7 @@ pub fn representative() -> Representative { slot: 1, }, sled_agent_client::types::InventoryDisk { - identity: sled_agent_client::types::DiskIdentity { + identity: omicron_common::disk::DiskIdentity { vendor: "memetendo".to_string(), model: "swatch".to_string(), serial: "0002".to_string(), @@ -304,7 +304,7 @@ pub fn representative() -> Representative { slot: 2, }, sled_agent_client::types::InventoryDisk { - identity: sled_agent_client::types::DiskIdentity { + identity: omicron_common::disk::DiskIdentity { vendor: "tony".to_string(), model: "craystation".to_string(), serial: "5".to_string(), diff --git a/nexus/reconfigurator/execution/src/datasets.rs b/nexus/reconfigurator/execution/src/datasets.rs index 1d08f3b294..361e23b7e6 100644 --- a/nexus/reconfigurator/execution/src/datasets.rs +++ b/nexus/reconfigurator/execution/src/datasets.rs @@ -202,7 +202,7 @@ mod tests { Uuid::new_v4(), // physical_disk_id ); datastore - .zpool_upsert(zpool) + .zpool_upsert(opctx, zpool) .await .expect("failed to upsert zpool"); } @@ -271,7 +271,7 @@ mod tests { Uuid::new_v4(), // physical_disk_id ); datastore - .zpool_upsert(zpool) + .zpool_upsert(opctx, zpool) .await .expect("failed to upsert zpool"); } diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 4a4a61142e..5b85acb929 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -87,7 +87,7 @@ impl super::Nexus { Ok(db_rack) } - /// Marks the rack as initialized with a set of services. + /// Marks the rack as initialized with information supplied by RSS. /// /// This function is a no-op if the rack has already been initialized. 
pub(crate) async fn rack_initialize( @@ -96,8 +96,37 @@ impl super::Nexus { rack_id: Uuid, request: RackInitializationRequest, ) -> Result<(), Error> { + let log = &opctx.log; + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + let physical_disks: Vec<_> = request + .physical_disks + .into_iter() + .map(|disk| { + db::model::PhysicalDisk::new( + disk.id, + disk.vendor, + disk.serial, + disk.model, + disk.variant.into(), + disk.sled_id, + ) + }) + .collect(); + + let zpools: Vec<_> = request + .zpools + .into_iter() + .map(|pool| { + db::model::Zpool::new( + pool.id, + pool.sled_id, + pool.physical_disk_id, + ) + }) + .collect(); + let datasets: Vec<_> = request .datasets .into_iter() @@ -224,10 +253,7 @@ impl super::Nexus { match request.external_port_count { ExternalPortDiscovery::Auto(switch_mgmt_addrs) => { use dpd_client::Client as DpdClient; - info!( - self.log, - "Using automatic external switchport discovery" - ); + info!(log, "Using automatic external switchport discovery"); for (switch, addr) in switch_mgmt_addrs { let dpd_client = DpdClient::new( @@ -238,7 +264,7 @@ impl super::Nexus { ), dpd_client::ClientState { tag: "nexus".to_string(), - log: self.log.new(o!("component" => "DpdClient")), + log: log.new(o!("component" => "DpdClient")), }, ); @@ -247,10 +273,7 @@ impl super::Nexus { Error::internal_error(&format!("encountered error while discovering ports for {switch:#?}: {e}")) })?; - info!( - self.log, - "discovered ports for {switch}: {all_ports:#?}" - ); + info!(log, "discovered ports for {switch}: {all_ports:#?}"); let qsfp_ports: Vec = all_ports .iter() @@ -261,7 +284,7 @@ impl super::Nexus { .collect(); info!( - self.log, + log, "populating ports for {switch}: {qsfp_ports:#?}" ); @@ -276,7 +299,7 @@ impl super::Nexus { // TODO: #3602 Eliminate need for static port mappings for switch ports ExternalPortDiscovery::Static(port_mappings) => { info!( - self.log, + log, "Using static configuration for external switchports" ); for (switch, ports) in port_mappings { @@ -295,7 +318,7 @@ impl super::Nexus { // Currently calling some of the apis directly, but should we be using sagas // going forward via self.run_saga()? Note that self.create_runnable_saga and // self.execute_saga are currently not available within this scope. - info!(self.log, "Recording Rack Network Configuration"); + info!(log, "Recording Rack Network Configuration"); let address_lot_name = Name::from_str(INFRA_LOT).map_err(|e| { Error::internal_error(&format!( "unable to use `initial-infra` as `Name`: {e}" @@ -591,6 +614,8 @@ impl super::Nexus { rack_id, blueprint, services: request.services, + physical_disks, + zpools, datasets, service_ip_pool_ranges, internal_dns, diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 06e50f2ecd..4bb4d6daef 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -5,14 +5,12 @@ //! Sleds, and the hardware and services within them. 
use crate::internal_api::params::{ - PhysicalDiskDeleteRequest, PhysicalDiskPutRequest, SledAgentInfo, SledRole, - ZpoolPutRequest, + PhysicalDiskPutRequest, SledAgentInfo, SledRole, ZpoolPutRequest, }; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::lookup; -use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::model::DatasetKind; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledProvisionPolicy; @@ -200,12 +198,14 @@ impl super::Nexus { ) -> Result<(), Error> { info!( self.log, "upserting physical disk"; - "sled_id" => request.sled_id.to_string(), - "vendor" => request.vendor.to_string(), - "serial" => request.serial.to_string(), - "model" => request.model.to_string() + "physical_disk_id" => %request.id, + "sled_id" => %request.sled_id, + "vendor" => %request.vendor, + "serial" => %request.serial, + "model" => %request.model, ); let disk = db::model::PhysicalDisk::new( + request.id, request.vendor, request.serial, request.model, @@ -216,56 +216,27 @@ impl super::Nexus { Ok(()) } - /// Removes a physical disk from the database. - /// - /// TODO: Remove Zpools and datasets contained within this disk. - pub(crate) async fn delete_physical_disk( - &self, - opctx: &OpContext, - request: PhysicalDiskDeleteRequest, - ) -> Result<(), Error> { - info!( - self.log, "deleting physical disk"; - "sled_id" => request.sled_id.to_string(), - "vendor" => request.vendor.to_string(), - "serial" => request.serial.to_string(), - "model" => request.model.to_string() - ); - self.db_datastore - .physical_disk_delete( - &opctx, - request.vendor, - request.serial, - request.model, - request.sled_id, - ) - .await?; - Ok(()) - } - // Zpools (contained within sleds) /// Upserts a Zpool into the database, updating it if it already exists. 
pub(crate) async fn upsert_zpool( &self, opctx: &OpContext, - id: Uuid, - sled_id: Uuid, - info: ZpoolPutRequest, + request: ZpoolPutRequest, ) -> Result<(), Error> { - info!(self.log, "upserting zpool"; "sled_id" => sled_id.to_string(), "zpool_id" => id.to_string()); + info!( + self.log, "upserting zpool"; + "sled_id" => %request.sled_id, + "zpool_id" => %request.id, + "physical_disk_id" => %request.physical_disk_id, + ); - let (_authz_disk, db_disk) = - LookupPath::new(&opctx, &self.db_datastore) - .physical_disk( - &info.disk_vendor, - &info.disk_serial, - &info.disk_model, - ) - .fetch() - .await?; - let zpool = db::model::Zpool::new(id, sled_id, db_disk.uuid()); - self.db_datastore.zpool_upsert(zpool).await?; + let zpool = db::model::Zpool::new( + request.id, + request.sled_id, + request.physical_disk_id, + ); + self.db_datastore.zpool_upsert(&opctx, zpool).await?; Ok(()) } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 3758b5289b..6d2484c19d 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -6,11 +6,7 @@ use crate::ServerContext; -use super::params::{ - OximeterInfo, PhysicalDiskDeleteRequest, PhysicalDiskPutRequest, - PhysicalDiskPutResponse, RackInitializationRequest, SledAgentInfo, - ZpoolPutRequest, ZpoolPutResponse, -}; +use super::params::{OximeterInfo, RackInitializationRequest}; use dropshot::endpoint; use dropshot::ApiDescription; use dropshot::FreeformBody; @@ -34,6 +30,7 @@ use nexus_types::external_api::params::SledSelector; use nexus_types::external_api::params::UninitializedSledId; use nexus_types::external_api::shared::UninitializedSled; use nexus_types::external_api::views::SledPolicy; +use nexus_types::internal_api::params::SledAgentInfo; use nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; use nexus_types::internal_api::views::to_list; @@ -75,9 +72,6 @@ pub(crate) fn internal_api() -> NexusApiDescription { api.register(sled_firewall_rules_request)?; api.register(switch_put)?; api.register(rack_initialization_complete)?; - api.register(physical_disk_put)?; - api.register(physical_disk_delete)?; - api.register(zpool_put)?; api.register(cpapi_instances_put)?; api.register(cpapi_disks_put)?; api.register(cpapi_volume_remove_read_only_parent)?; @@ -257,77 +251,6 @@ async fn switch_put( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } -/// Report that a physical disk for the specified sled has come online. -#[endpoint { - method = PUT, - path = "/physical-disk", - }] -async fn physical_disk_put( - rqctx: RequestContext>, - body: TypedBody, -) -> Result, HttpError> { - let apictx = rqctx.context(); - let nexus = &apictx.nexus; - let disk = body.into_inner(); - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus.upsert_physical_disk(&opctx, disk).await?; - Ok(HttpResponseOk(PhysicalDiskPutResponse {})) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Report that a physical disk for the specified sled has gone offline. 
-#[endpoint { - method = DELETE, - path = "/physical-disk", - }] -async fn physical_disk_delete( - rqctx: RequestContext>, - body: TypedBody, -) -> Result { - let apictx = rqctx.context(); - let nexus = &apictx.nexus; - let disk = body.into_inner(); - - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus.delete_physical_disk(&opctx, disk).await?; - Ok(HttpResponseDeleted()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Path parameters for Zpool requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct ZpoolPathParam { - sled_id: Uuid, - zpool_id: Uuid, -} - -/// Report that a pool for a specified sled has come online. -#[endpoint { - method = PUT, - path = "/sled-agents/{sled_id}/zpools/{zpool_id}", - }] -async fn zpool_put( - rqctx: RequestContext>, - path_params: Path, - pool_info: TypedBody, -) -> Result, HttpError> { - let apictx = rqctx.context(); - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - let pi = pool_info.into_inner(); - - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus.upsert_zpool(&opctx, path.zpool_id, path.sled_id, pi).await?; - Ok(HttpResponseOk(ZpoolPutResponse {})) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - /// Path parameters for Instance requests (internal API) #[derive(Deserialize, JsonSchema)] struct InstancePathParam { diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index bd5a13dfd1..80c972363f 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -29,7 +29,9 @@ use internal_api::http_entrypoints::internal_api; use nexus_config::NexusConfig; use nexus_types::deployment::Blueprint; use nexus_types::external_api::views::SledProvisionPolicy; -use nexus_types::internal_api::params::ServiceKind; +use nexus_types::internal_api::params::{ + PhysicalDiskPutRequest, ServiceKind, ZpoolPutRequest, +}; use nexus_types::inventory::Collection; use omicron_common::address::IpRange; use omicron_common::api::external::Error; @@ -237,6 +239,10 @@ impl nexus_test_interface::NexusServer for Server { config: &NexusConfig, blueprint: Blueprint, services: Vec, + physical_disks: Vec< + nexus_types::internal_api::params::PhysicalDiskPutRequest, + >, + zpools: Vec, datasets: Vec, internal_dns_zone_config: nexus_types::internal_api::params::DnsConfigParams, external_dns_zone_name: &str, @@ -282,6 +288,8 @@ impl nexus_test_interface::NexusServer for Server { internal_api::params::RackInitializationRequest { blueprint, services, + physical_disks, + zpools, datasets, internal_services_ip_pool_ranges, certs, @@ -341,14 +349,26 @@ impl nexus_test_interface::NexusServer for Server { async fn upsert_crucible_dataset( &self, - id: Uuid, - zpool_id: Uuid, + physical_disk: PhysicalDiskPutRequest, + zpool: ZpoolPutRequest, + dataset_id: Uuid, address: SocketAddrV6, ) { + let opctx = self.apictx.nexus.opctx_for_internal_api(); + self.apictx + .nexus + .upsert_physical_disk(&opctx, physical_disk) + .await + .unwrap(); + + let zpool_id = zpool.id; + + self.apictx.nexus.upsert_zpool(&opctx, zpool).await.unwrap(); + self.apictx .nexus .upsert_dataset( - id, + dataset_id, zpool_id, address, nexus_db_queries::db::model::DatasetKind::Crucible, diff --git a/nexus/test-interface/src/lib.rs b/nexus/test-interface/src/lib.rs index 2e3428a1dd..54478c0876 100644 --- a/nexus/test-interface/src/lib.rs +++ b/nexus/test-interface/src/lib.rs @@ -34,6 +34,9 @@ use async_trait::async_trait; 
use nexus_config::NexusConfig; use nexus_types::deployment::Blueprint; +use nexus_types::internal_api::params::{ + PhysicalDiskPutRequest, ZpoolPutRequest, +}; use nexus_types::inventory::Collection; use omicron_common::api::external::Error; use slog::Logger; @@ -55,6 +58,8 @@ pub trait NexusServer: Send + Sync + 'static { config: &NexusConfig, blueprint: Blueprint, services: Vec, + physical_disks: Vec, + zpools: Vec, datasets: Vec, internal_dns_config: nexus_types::internal_api::params::DnsConfigParams, external_dns_zone_name: &str, @@ -75,6 +80,10 @@ pub trait NexusServer: Send + Sync + 'static { // control over dataset provisioning is shifting to Nexus. There is // a short window where RSS controls dataset provisioning, but afterwards, // Nexus should be calling the shots on "when to provision datasets". + // Furthermore, with https://github.com/oxidecomputer/omicron/pull/5172, + // physical disk and zpool provisioning has already moved into Nexus. This + // provides a "back-door" for tests to control the set of control plane + // disks that are considered active. // // For test purposes, we have many situations where we want to carve up // zpools and datasets precisely for disk-based tests. As a result, we @@ -88,8 +97,9 @@ pub trait NexusServer: Send + Sync + 'static { // However, doing so would let us remove this test-only API. async fn upsert_crucible_dataset( &self, - id: Uuid, - zpool_id: Uuid, + physical_disk: PhysicalDiskPutRequest, + zpool: ZpoolPutRequest, + dataset_id: Uuid, address: SocketAddrV6, ); diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 76ef600fbb..c124e3b58f 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -861,6 +861,8 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { // asynchronously, and we're not making any effort (currently) to // wait for them to be known to Nexus. 
vec![], + vec![], + vec![], dns_config, &external_dns_zone_name, recovery_silo, diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index b67028a996..b50a60eb8b 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -15,7 +15,6 @@ use http::StatusCode; use nexus_db_queries::db::fixed_data::silo::DEFAULT_SILO; use nexus_test_interface::NexusServer; use nexus_types::external_api::params; -use nexus_types::external_api::params::PhysicalDiskKind; use nexus_types::external_api::params::UserId; use nexus_types::external_api::shared; use nexus_types::external_api::shared::Baseboard; @@ -37,6 +36,7 @@ use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Instance; use omicron_common::api::external::InstanceCpuCount; use omicron_common::api::external::NameOrId; +use omicron_common::disk::DiskIdentity; use omicron_sled_agent::sim::SledAgent; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_test_utils::dev::poll::CondCheckError; @@ -340,55 +340,6 @@ pub async fn create_switch( .await } -pub async fn create_physical_disk( - client: &ClientTestContext, - vendor: &str, - serial: &str, - model: &str, - variant: PhysicalDiskKind, - sled_id: Uuid, -) -> internal_params::PhysicalDiskPutResponse { - object_put( - client, - "/physical-disk", - &internal_params::PhysicalDiskPutRequest { - vendor: vendor.to_string(), - serial: serial.to_string(), - model: model.to_string(), - variant, - sled_id, - }, - ) - .await -} - -pub async fn delete_physical_disk( - client: &ClientTestContext, - vendor: &str, - serial: &str, - model: &str, - sled_id: Uuid, -) { - let body = internal_params::PhysicalDiskDeleteRequest { - vendor: vendor.to_string(), - serial: serial.to_string(), - model: model.to_string(), - sled_id, - }; - - NexusRequest::new( - RequestBuilder::new(client, http::Method::DELETE, "/physical-disk") - .body(Some(&body)) - .expect_status(Some(http::StatusCode::NO_CONTENT)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap_or_else(|_| { - panic!("failed to make \"delete\" request of physical disk") - }); -} - pub async fn create_silo( client: &ClientTestContext, silo_name: &str, @@ -781,36 +732,60 @@ impl DiskTest { cptestctx: &ControlPlaneTestContext, gibibytes: u32, ) { + // To get a dataset, we actually need to create a new simulated physical + // disk, zpool, and dataset, all contained within one another. let zpool = TestZpool { id: Uuid::new_v4(), size: ByteCount::from_gibibytes_u32(gibibytes), datasets: vec![TestDataset { id: Uuid::new_v4() }], }; + let physical_disk_id = Uuid::new_v4(); + + let disk_identity = DiskIdentity { + vendor: "test-vendor".into(), + serial: "test-serial".into(), + model: "test-model".into(), + }; + + let physical_disk_request = + nexus_types::internal_api::params::PhysicalDiskPutRequest { + id: physical_disk_id, + vendor: disk_identity.vendor.clone(), + serial: disk_identity.serial.clone(), + model: disk_identity.model.clone(), + variant: + nexus_types::external_api::params::PhysicalDiskKind::U2, + sled_id: self.sled_agent.id, + }; + + let zpool_request = + nexus_types::internal_api::params::ZpoolPutRequest { + id: zpool.id, + physical_disk_id, + sled_id: self.sled_agent.id, + }; + + // Tell the simulated sled agent to create the disk and zpool containing + // these datasets. 
+ self.sled_agent .create_external_physical_disk( - "test-vendor".into(), - "test-serial".into(), - "test-model".into(), + physical_disk_id, + disk_identity.clone(), ) .await; self.sled_agent - .create_zpool( - zpool.id, - "test-vendor".into(), - "test-serial".into(), - "test-model".into(), - zpool.size.to_bytes(), - ) + .create_zpool(zpool.id, physical_disk_id, zpool.size.to_bytes()) .await; for dataset in &zpool.datasets { + // Sled Agent side: Create the Dataset, make sure regions can be + // created immediately if Nexus requests anything. let address = self .sled_agent .create_crucible_dataset(zpool.id, dataset.id) .await; - - // By default, regions are created immediately. let crucible = self .sled_agent .get_crucible_dataset(zpool.id, dataset.id) @@ -819,6 +794,9 @@ impl DiskTest { .set_create_callback(Box::new(|_| RegionState::Created)) .await; + // Nexus side: Notify Nexus of the physical disk/zpool/dataset + // combination that exists. + let address = match address { std::net::SocketAddr::V6(addr) => addr, _ => panic!("Unsupported address type: {address} "), @@ -826,7 +804,12 @@ impl DiskTest { cptestctx .server - .upsert_crucible_dataset(dataset.id, zpool.id, address) + .upsert_crucible_dataset( + physical_disk_request.clone(), + zpool_request.clone(), + dataset.id, + address, + ) .await; } diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index 804694c0b2..80a5534790 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -50,7 +50,6 @@ mod vpc_firewall; mod vpc_routers; mod vpc_subnets; mod vpcs; -mod zpools; // This module is used only for shared data, not test cases. mod endpoints; diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index 743a76be17..b6ed9183a3 100644 --- a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -6,17 +6,17 @@ use camino::Utf8Path; use dropshot::test_util::ClientTestContext; +use nexus_db_model::PhysicalDisk as DbPhysicalDisk; +use nexus_db_model::PhysicalDiskKind as DbPhysicalDiskKind; +use nexus_db_queries::context::OpContext; use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::create_default_ip_pool; use nexus_test_utils::resource_helpers::create_instance; -use nexus_test_utils::resource_helpers::create_physical_disk; use nexus_test_utils::resource_helpers::create_project; -use nexus_test_utils::resource_helpers::delete_physical_disk; use nexus_test_utils::resource_helpers::objects_list_page_authz; use nexus_test_utils::start_sled_agent; use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; -use nexus_types::external_api::params::PhysicalDiskKind; use nexus_types::external_api::views::SledInstance; use nexus_types::external_api::views::{PhysicalDisk, Sled}; use omicron_sled_agent::sim; @@ -95,7 +95,6 @@ async fn test_physical_disk_create_list_delete( cptestctx: &ControlPlaneTestContext, ) { let external_client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; // Verify that there are two sleds to begin with. 
let sleds_url = "/v1/system/hardware/sleds"; @@ -106,17 +105,26 @@ async fn test_physical_disk_create_list_delete( format!("/v1/system/hardware/sleds/{SLED_AGENT_UUID}/disks"); let disks_initial = physical_disks_list(&external_client, &disks_url).await; - // Insert a new disk using the internal API, observe it in the external API + // Inject a disk into the database, observe it in the external API + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); let sled_id = Uuid::from_str(&SLED_AGENT_UUID).unwrap(); - create_physical_disk( - &internal_client, - "v", - "s", - "m", - PhysicalDiskKind::U2, + let physical_disk = DbPhysicalDisk::new( + Uuid::new_v4(), + "v".into(), + "s".into(), + "m".into(), + DbPhysicalDiskKind::U2, sled_id, - ) - .await; + ); + + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + let _disk_id = datastore + .physical_disk_upsert(&opctx, physical_disk.clone()) + .await + .expect("Failed to upsert physical disk"); + let disks = physical_disks_list(&external_client, &disks_url).await; assert_eq!(disks.len(), disks_initial.len() + 1); let _new_disk = disks @@ -129,7 +137,17 @@ async fn test_physical_disk_create_list_delete( .expect("did not find the new disk"); // Delete that disk using the internal API, observe it in the external API - delete_physical_disk(&internal_client, "v", "s", "m", sled_id).await; + datastore + .physical_disk_delete( + &opctx, + "v".into(), + "s".into(), + "m".into(), + sled_id, + ) + .await + .expect("Failed to upsert physical disk"); + assert_eq!( physical_disks_list(&external_client, &disks_url).await, disks_initial diff --git a/nexus/tests/integration_tests/switches.rs b/nexus/tests/integration_tests/switches.rs index f56d42f6d1..d665d6ff8e 100644 --- a/nexus/tests/integration_tests/switches.rs +++ b/nexus/tests/integration_tests/switches.rs @@ -6,15 +6,11 @@ use dropshot::test_util::ClientTestContext; use nexus_test_interface::NexusServer; -use nexus_test_utils::resource_helpers::create_physical_disk; -use nexus_test_utils::resource_helpers::delete_physical_disk; use nexus_test_utils::resource_helpers::objects_list_page_authz; use nexus_test_utils::start_sled_agent; use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; -use nexus_types::external_api::views::{ - PhysicalDisk, PhysicalDiskType, Sled, -}; +use nexus_types::external_api::views::Sled; use nexus_types::internal_api::params as internal_params; use omicron_sled_agent::sim; use std::str::FromStr; @@ -75,42 +71,3 @@ async fn test_switches_list(cptestctx: &ControlPlaneTestContext) { sa.http_server.close().await.unwrap(); } } - -#[nexus_test] -async fn test_physical_disk_create_list_delete( - cptestctx: &ControlPlaneTestContext, -) { - let external_client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - - // Verify that there is one sled to begin with. - let switches_url = "/v1/system/hardware/switches"; - assert_eq!(switches_list(&external_client, &switches_url).await.len(), 1); - - // Verify that there are no disks. 
- let disks_url = - format!("/v1/system/hardware/switches/{SLED_AGENT_UUID}/disks"); - assert!(physical_disks_list(&external_client, &disks_url).await.is_empty()); - - // Insert a new disk using the internal API, observe it in the external API - let sled_id = Uuid::from_str(&SLED_AGENT_UUID).unwrap(); - create_physical_disk( - &internal_client, - "v", - "s", - "m", - internal_params::PhysicalDiskKind::U2, - sled_id, - ) - .await; - let disks = physical_disks_list(&external_client, &disks_url).await; - assert_eq!(disks.len(), 1); - assert_eq!(disks[0].vendor, "v"); - assert_eq!(disks[0].serial, "s"); - assert_eq!(disks[0].model, "m"); - assert_eq!(disks[0].disk_type, PhysicalDiskType::External); - - // Delete that disk using the internal API, observe it in the external API - delete_physical_disk(&internal_client, "v", "s", "m", sled_id).await; - assert!(physical_disks_list(&external_client, &disks_url).await.is_empty()); -} diff --git a/nexus/tests/integration_tests/zpools.rs b/nexus/tests/integration_tests/zpools.rs deleted file mode 100644 index 8e058f9349..0000000000 --- a/nexus/tests/integration_tests/zpools.rs +++ /dev/null @@ -1,128 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use dropshot::test_util::ClientTestContext; -use http::method::Method; -use http::StatusCode; -use nexus_types::external_api::params::PhysicalDiskKind; -use nexus_types::internal_api::params::PhysicalDiskPutRequest; -use nexus_types::internal_api::params::ZpoolPutRequest; -use omicron_common::api::external::ByteCount; -use uuid::Uuid; - -use nexus_test_utils::SLED_AGENT_UUID; -use nexus_test_utils_macros::nexus_test; - -type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext; - -const VENDOR: &str = "test-vendor"; -const SERIAL: &str = "test-serial"; -const MODEL: &str = "test-model"; - -async fn create_test_physical_disk(client: &ClientTestContext) { - let request = PhysicalDiskPutRequest { - vendor: VENDOR.into(), - serial: SERIAL.into(), - model: MODEL.into(), - variant: PhysicalDiskKind::U2, - sled_id: SLED_AGENT_UUID.parse().unwrap(), - }; - let physical_disk_put_url = "/physical-disk"; - client - .make_request( - Method::PUT, - &physical_disk_put_url, - Some(request), - StatusCode::OK, - ) - .await - .unwrap(); -} - -// Tests the "normal" case of zpool_put: inserting a known Zpool. -// -// This will typically be invoked by the Sled Agent, after performing inventory. -#[nexus_test] -async fn test_zpool_put_success(cptestctx: &ControlPlaneTestContext) { - let client = &cptestctx.internal_client; - create_test_physical_disk(&client).await; - - let zpool_id = Uuid::new_v4(); - let zpool_put_url = - format!("/sled-agents/{}/zpools/{}", SLED_AGENT_UUID, zpool_id); - - let request = ZpoolPutRequest { - size: ByteCount::from_gibibytes_u32(1), - disk_vendor: VENDOR.into(), - disk_serial: SERIAL.into(), - disk_model: MODEL.into(), - }; - client - .make_request( - Method::PUT, - &zpool_put_url, - Some(request), - StatusCode::OK, - ) - .await - .unwrap(); -} - -// Tests a failure case of zpool_put: Inserting a zpool into a sled agent that -// does not exist. -#[nexus_test] -async fn test_zpool_put_bad_sled_returns_not_found( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.internal_client; - create_test_physical_disk(&client).await; - - // A sled with the "nil" UUID should not exist. 
- let sled_id = Uuid::nil(); - let zpool_id = Uuid::new_v4(); - let zpool_put_url = format!("/sled_agents/{}/zpools/{}", sled_id, zpool_id); - - let request = ZpoolPutRequest { - size: ByteCount::from_gibibytes_u32(1), - disk_vendor: VENDOR.into(), - disk_serial: SERIAL.into(), - disk_model: MODEL.into(), - }; - client - .make_request_error_body( - Method::PUT, - &zpool_put_url, - request, - StatusCode::NOT_FOUND, - ) - .await; -} - -// Tests a failure case of zpool_put: Inserting a zpool into a sled agent that -// exists, but into a disk that does not exist -#[nexus_test] -async fn test_zpool_put_bad_physical_disk_returns_not_found( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.internal_client; - let zpool_id = Uuid::new_v4(); - let zpool_put_url = - format!("/sled_agents/{}/zpools/{}", SLED_AGENT_UUID, zpool_id); - - let request = ZpoolPutRequest { - size: ByteCount::from_gibibytes_u32(1), - disk_vendor: VENDOR.into(), - disk_serial: SERIAL.into(), - disk_model: MODEL.into(), - }; - client - .make_request_error_body( - Method::PUT, - &zpool_put_url, - request, - StatusCode::NOT_FOUND, - ) - .await; -} diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index 9f80d313fd..a811106c2c 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -82,43 +82,25 @@ pub struct SwitchPutResponse {} #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct PhysicalDiskPutRequest { - pub vendor: String, - pub serial: String, - pub model: String, - - pub variant: PhysicalDiskKind, - pub sled_id: Uuid, -} - -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct PhysicalDiskPutResponse {} + pub id: Uuid, -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] -pub struct PhysicalDiskDeleteRequest { pub vendor: String, pub serial: String, pub model: String, + pub variant: PhysicalDiskKind, pub sled_id: Uuid, } -/// Sent by a sled agent on startup to Nexus to request further instruction +/// Identifies information about a Zpool that should be part of the control +/// plane. #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] pub struct ZpoolPutRequest { - /// Total size of the pool. - pub size: ByteCount, - - // Information to identify the disk to which this zpool belongs - pub disk_vendor: String, - pub disk_serial: String, - pub disk_model: String, - // TODO: We could include any other data from `ZpoolInfo` we want, - // such as "allocated/free" space and pool health? + pub id: Uuid, + pub sled_id: Uuid, + pub physical_disk_id: Uuid, } -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct ZpoolPutResponse {} - /// Describes the purpose of the dataset. #[derive( Debug, Serialize, Deserialize, JsonSchema, Clone, Copy, PartialEq, Eq, @@ -253,6 +235,13 @@ pub struct RackInitializationRequest { pub blueprint: Blueprint, /// Services on the rack which have been created by RSS. pub services: Vec, + + /// "Managed" physical disks owned by the control plane + pub physical_disks: Vec, + + /// Zpools created within the physical disks created by the control plane. + pub zpools: Vec, + /// Datasets on the rack which have been provisioned by RSS. 
pub datasets: Vec, /// Ranges of the service IP pool which may be used for internal services, diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 40da26047b..bf2fd16971 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -360,7 +360,7 @@ pub struct PhysicalDisk { impl From for PhysicalDisk { fn from(disk: sled_agent_client::types::InventoryDisk) -> PhysicalDisk { PhysicalDisk { - identity: disk.identity.into(), + identity: disk.identity, variant: disk.variant.into(), slot: disk.slot, } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index db3199833e..fee389dfdc 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -919,65 +919,6 @@ } } }, - "/physical-disk": { - "put": { - "summary": "Report that a physical disk for the specified sled has come online.", - "operationId": "physical_disk_put", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/PhysicalDiskPutRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/PhysicalDiskPutResponse" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - }, - "delete": { - "summary": "Report that a physical disk for the specified sled has gone offline.", - "operationId": "physical_disk_delete", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/PhysicalDiskDeleteRequest" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "successful deletion" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/probes/{sled}": { "get": { "summary": "Get all the probes associated with a given sled.", @@ -1277,60 +1218,6 @@ } } }, - "/sled-agents/{sled_id}/zpools/{zpool_id}": { - "put": { - "summary": "Report that a pool for a specified sled has come online.", - "operationId": "zpool_put", - "parameters": [ - { - "in": "path", - "name": "sled_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - }, - { - "in": "path", - "name": "zpool_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ZpoolPutRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ZpoolPutResponse" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/sleds/add": { "post": { "summary": "Add sled to initialized rack", @@ -5980,30 +5867,6 @@ "collector_id" ] }, - "PhysicalDiskDeleteRequest": { - "type": "object", - "properties": { - "model": { - "type": "string" - }, - "serial": { - "type": "string" - }, - "sled_id": { - "type": "string", - "format": "uuid" - }, - "vendor": { - "type": "string" - } - }, - "required": [ - "model", - "serial", - "sled_id", - "vendor" - ] - }, "PhysicalDiskKind": { "description": "Describes the form factor of physical disks.", "type": "string", @@ -6015,6 +5878,10 @@ "PhysicalDiskPutRequest": { "type": "object", "properties": { + "id": { + 
"type": "string", + "format": "uuid" + }, "model": { "type": "string" }, @@ -6033,6 +5900,7 @@ } }, "required": [ + "id", "model", "serial", "sled_id", @@ -6040,9 +5908,6 @@ "vendor" ] }, - "PhysicalDiskPutResponse": { - "type": "object" - }, "PortConfigV1": { "type": "object", "properties": { @@ -6468,6 +6333,13 @@ "$ref": "#/components/schemas/IpRange" } }, + "physical_disks": { + "description": "\"Managed\" physical disks owned by the control plane", + "type": "array", + "items": { + "$ref": "#/components/schemas/PhysicalDiskPutRequest" + } + }, "rack_network_config": { "description": "Initial rack network configuration", "allOf": [ @@ -6490,6 +6362,13 @@ "items": { "$ref": "#/components/schemas/ServicePutRequest" } + }, + "zpools": { + "description": "Zpools created within the physical disks created by the control plane.", + "type": "array", + "items": { + "$ref": "#/components/schemas/ZpoolPutRequest" + } } }, "required": [ @@ -6500,9 +6379,11 @@ "external_port_count", "internal_dns_zone_config", "internal_services_ip_pool_ranges", + "physical_disks", "rack_network_config", "recovery_silo", - "services" + "services", + "zpools" ] }, "RackNetworkConfigV1": { @@ -7667,37 +7548,28 @@ "type": "string" }, "ZpoolPutRequest": { - "description": "Sent by a sled agent on startup to Nexus to request further instruction", + "description": "Identifies information about a Zpool that should be part of the control plane.", "type": "object", "properties": { - "disk_model": { - "type": "string" - }, - "disk_serial": { - "type": "string" + "id": { + "type": "string", + "format": "uuid" }, - "disk_vendor": { - "type": "string" + "physical_disk_id": { + "type": "string", + "format": "uuid" }, - "size": { - "description": "Total size of the pool.", - "allOf": [ - { - "$ref": "#/components/schemas/ByteCount" - } - ] + "sled_id": { + "type": "string", + "format": "uuid" } }, "required": [ - "disk_model", - "disk_serial", - "disk_vendor", - "size" + "id", + "physical_disk_id", + "sled_id" ] }, - "ZpoolPutResponse": { - "type": "object" - }, "SemverVersion": { "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index e5b3a1c56f..07a42b461f 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -623,6 +623,60 @@ } } }, + "/omicron-physical-disks": { + "get": { + "operationId": "omicron_physical_disks_get", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "operationId": "omicron_physical_disks_put", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DisksManagementResult" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/omicron-zones": { "get": { "operationId": "omicron_zones_get", @@ -3571,6 +3625,112 @@ "vendor" ] }, + 
"DiskManagementError": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "not_found" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "zpool_uuid_mismatch" + ] + }, + "value": { + "type": "object", + "properties": { + "expected": { + "type": "string", + "format": "uuid" + }, + "observed": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "expected", + "observed" + ] + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "key_manager" + ] + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "other" + ] + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ] + } + ] + }, + "DiskManagementStatus": { + "description": "Identifies how a single disk management operation may have succeeded or failed.", + "type": "object", + "properties": { + "err": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/DiskManagementError" + } + ] + }, + "identity": { + "$ref": "#/components/schemas/DiskIdentity" + } + }, + "required": [ + "identity" + ] + }, "DiskRequest": { "description": "DiskRequest\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"device\", \"name\", \"read_only\", \"slot\", \"volume_construction_request\" ], \"properties\": { \"device\": { \"type\": \"string\" }, \"name\": { \"type\": \"string\" }, \"read_only\": { \"type\": \"boolean\" }, \"slot\": { \"$ref\": \"#/components/schemas/Slot\" }, \"volume_construction_request\": { \"$ref\": \"#/components/schemas/VolumeConstructionRequest\" } } } ```
", "type": "object", @@ -3911,6 +4071,21 @@ "M2" ] }, + "DisksManagementResult": { + "description": "The result from attempting to manage underlying disks.\n\nThis is more complex than a simple \"Error\" type because it's possible for some disks to be initialized correctly, while others can fail.\n\nThis structure provides a mechanism for callers to learn about partial failures, and handle them appropriately on a per-disk basis.", + "type": "object", + "properties": { + "status": { + "type": "array", + "items": { + "$ref": "#/components/schemas/DiskManagementStatus" + } + } + }, + "required": [ + "status" + ] + }, "Duration": { "type": "object", "properties": { @@ -5817,6 +5992,50 @@ } ] }, + "OmicronPhysicalDiskConfig": { + "type": "object", + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "identity": { + "$ref": "#/components/schemas/DiskIdentity" + }, + "pool_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "id", + "identity", + "pool_id" + ] + }, + "OmicronPhysicalDisksConfig": { + "type": "object", + "properties": { + "disks": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OmicronPhysicalDiskConfig" + } + }, + "generation": { + "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + } + }, + "required": [ + "disks", + "generation" + ] + }, "OmicronZoneConfig": { "description": "Describes one Omicron-managed zone running on a sled", "type": "object", diff --git a/schema/omicron-physical-disks.json b/schema/omicron-physical-disks.json new file mode 100644 index 0000000000..efc1b2cdd2 --- /dev/null +++ b/schema/omicron-physical-disks.json @@ -0,0 +1,74 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "OmicronPhysicalDisksConfig", + "type": "object", + "required": [ + "disks", + "generation" + ], + "properties": { + "disks": { + "type": "array", + "items": { + "$ref": "#/definitions/OmicronPhysicalDiskConfig" + } + }, + "generation": { + "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). 
It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.", + "allOf": [ + { + "$ref": "#/definitions/Generation" + } + ] + } + }, + "definitions": { + "DiskIdentity": { + "description": "Uniquely identifies a disk.", + "type": "object", + "required": [ + "model", + "serial", + "vendor" + ], + "properties": { + "model": { + "type": "string" + }, + "serial": { + "type": "string" + }, + "vendor": { + "type": "string" + } + } + }, + "Generation": { + "description": "Generation numbers stored in the database, used for optimistic concurrency control", + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "OmicronPhysicalDiskConfig": { + "type": "object", + "required": [ + "id", + "identity", + "pool_id" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "identity": { + "$ref": "#/definitions/DiskIdentity" + }, + "pool_id": { + "type": "string", + "format": "uuid" + } + } + } + } +} \ No newline at end of file diff --git a/schema/rss-service-plan-v3.json b/schema/rss-service-plan-v3.json new file mode 100644 index 0000000000..fcc672a93b --- /dev/null +++ b/schema/rss-service-plan-v3.json @@ -0,0 +1,848 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Plan", + "type": "object", + "required": [ + "dns_config", + "services" + ], + "properties": { + "dns_config": { + "$ref": "#/definitions/DnsConfigParams" + }, + "services": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/SledConfig" + } + } + }, + "definitions": { + "DiskIdentity": { + "description": "Uniquely identifies a disk.", + "type": "object", + "required": [ + "model", + "serial", + "vendor" + ], + "properties": { + "model": { + "type": "string" + }, + "serial": { + "type": "string" + }, + "vendor": { + "type": "string" + } + } + }, + "DnsConfigParams": { + "description": "DnsConfigParams\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"generation\", \"time_created\", \"zones\" ], \"properties\": { \"generation\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"time_created\": { \"type\": \"string\", \"format\": \"date-time\" }, \"zones\": { \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/DnsConfigZone\" } } } } ```
", + "type": "object", + "required": [ + "generation", + "time_created", + "zones" + ], + "properties": { + "generation": { + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "time_created": { + "type": "string", + "format": "date-time" + }, + "zones": { + "type": "array", + "items": { + "$ref": "#/definitions/DnsConfigZone" + } + } + } + }, + "DnsConfigZone": { + "description": "DnsConfigZone\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"records\", \"zone_name\" ], \"properties\": { \"records\": { \"type\": \"object\", \"additionalProperties\": { \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/DnsRecord\" } } }, \"zone_name\": { \"type\": \"string\" } } } ```
", + "type": "object", + "required": [ + "records", + "zone_name" + ], + "properties": { + "records": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "$ref": "#/definitions/DnsRecord" + } + } + }, + "zone_name": { + "type": "string" + } + } + }, + "DnsRecord": { + "description": "DnsRecord\n\n
JSON schema\n\n```json { \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"type\": \"string\", \"format\": \"ipv4\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"A\" ] } } }, { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"type\": \"string\", \"format\": \"ipv6\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"AAAA\" ] } } }, { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"$ref\": \"#/components/schemas/Srv\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"SRV\" ] } } } ] } ```
", + "oneOf": [ + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "type": "string", + "format": "ipv4" + }, + "type": { + "type": "string", + "enum": [ + "A" + ] + } + } + }, + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "type": "string", + "format": "ipv6" + }, + "type": { + "type": "string", + "enum": [ + "AAAA" + ] + } + } + }, + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "$ref": "#/definitions/Srv" + }, + "type": { + "type": "string", + "enum": [ + "SRV" + ] + } + } + } + ] + }, + "Generation": { + "description": "Generation numbers stored in the database, used for optimistic concurrency control", + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "IpNet": { + "oneOf": [ + { + "title": "v4", + "allOf": [ + { + "$ref": "#/definitions/Ipv4Net" + } + ] + }, + { + "title": "v6", + "allOf": [ + { + "$ref": "#/definitions/Ipv6Net" + } + ] + } + ] + }, + "Ipv4Net": { + "title": "An IPv4 subnet", + "description": "An IPv4 subnet, including prefix and subnet mask", + "examples": [ + "192.168.1.0/24" + ], + "type": "string", + "pattern": "^(([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])/([0-9]|1[0-9]|2[0-9]|3[0-2])$" + }, + "Ipv6Net": { + "title": "An IPv6 subnet", + "description": "An IPv6 subnet, including prefix and subnet mask", + "examples": [ + "fd12:3456::/64" + ], + "type": "string", + "pattern": "^([fF][dD])[0-9a-fA-F]{2}:(([0-9a-fA-F]{1,4}:){6}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,6}:)([0-9a-fA-F]{1,4})?\\/([0-9]|[1-9][0-9]|1[0-1][0-9]|12[0-8])$" + }, + "MacAddr": { + "title": "A MAC address", + "description": "A Media Access Control address, in EUI-48 format", + "examples": [ + "ff:ff:ff:ff:ff:ff" + ], + "type": "string", + "maxLength": 17, + "minLength": 5, + "pattern": "^([0-9a-fA-F]{0,2}:){5}[0-9a-fA-F]{0,2}$" + }, + "Name": { + "title": "A name unique within the parent collection", + "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. 
Names cannot be a UUID though they may contain a UUID.", + "type": "string", + "maxLength": 63, + "minLength": 1, + "pattern": "^(?![0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$)^[a-z]([a-zA-Z0-9-]*[a-zA-Z0-9]+)?$" + }, + "NetworkInterface": { + "description": "Information required to construct a virtual network interface", + "type": "object", + "required": [ + "id", + "ip", + "kind", + "mac", + "name", + "primary", + "slot", + "subnet", + "vni" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "ip": { + "type": "string", + "format": "ip" + }, + "kind": { + "$ref": "#/definitions/NetworkInterfaceKind" + }, + "mac": { + "$ref": "#/definitions/MacAddr" + }, + "name": { + "$ref": "#/definitions/Name" + }, + "primary": { + "type": "boolean" + }, + "slot": { + "type": "integer", + "format": "uint8", + "minimum": 0.0 + }, + "subnet": { + "$ref": "#/definitions/IpNet" + }, + "vni": { + "$ref": "#/definitions/Vni" + } + } + }, + "NetworkInterfaceKind": { + "description": "The type of network interface", + "oneOf": [ + { + "description": "A vNIC attached to a guest instance", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "instance" + ] + } + } + }, + { + "description": "A vNIC associated with an internal service", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "service" + ] + } + } + }, + { + "description": "A vNIC associated with a probe", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "probe" + ] + } + } + } + ] + }, + "OmicronPhysicalDiskConfig": { + "type": "object", + "required": [ + "id", + "identity", + "pool_id" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "identity": { + "$ref": "#/definitions/DiskIdentity" + }, + "pool_id": { + "type": "string", + "format": "uuid" + } + } + }, + "OmicronPhysicalDisksConfig": { + "type": "object", + "required": [ + "disks", + "generation" + ], + "properties": { + "disks": { + "type": "array", + "items": { + "$ref": "#/definitions/OmicronPhysicalDiskConfig" + } + }, + "generation": { + "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). 
It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.", + "allOf": [ + { + "$ref": "#/definitions/Generation" + } + ] + } + } + }, + "OmicronZoneConfig": { + "description": "Describes one Omicron-managed zone running on a sled", + "type": "object", + "required": [ + "id", + "underlay_address", + "zone_type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "underlay_address": { + "type": "string", + "format": "ipv6" + }, + "zone_type": { + "$ref": "#/definitions/OmicronZoneType" + } + } + }, + "OmicronZoneDataset": { + "description": "Describes a persistent ZFS dataset associated with an Omicron zone", + "type": "object", + "required": [ + "pool_name" + ], + "properties": { + "pool_name": { + "$ref": "#/definitions/ZpoolName" + } + } + }, + "OmicronZoneType": { + "description": "Describes what kind of zone this is (i.e., what component is running in it) as well as any type-specific configuration", + "oneOf": [ + { + "type": "object", + "required": [ + "address", + "dns_servers", + "nic", + "ntp_servers", + "snat_cfg", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dns_servers": { + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "domain": { + "type": [ + "string", + "null" + ] + }, + "nic": { + "description": "The service vNIC providing outbound connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "ntp_servers": { + "type": "array", + "items": { + "type": "string" + } + }, + "snat_cfg": { + "description": "The SNAT configuration for outbound connections.", + "allOf": [ + { + "$ref": "#/definitions/SourceNatConfig" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "boundary_ntp" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_keeper" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "cockroach_db" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "crucible" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "crucible_pantry" + ] + } + } + }, + { + "type": "object", + "required": [ + "dataset", + "dns_address", + "http_address", + "nic", + "type" + ], + "properties": { + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "dns_address": { + "description": "The address at which the external DNS server is reachable.", + "type": "string" + }, + "http_address": { + "description": "The 
address at which the external DNS server API is reachable.", + "type": "string" + }, + "nic": { + "description": "The service vNIC providing external connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "external_dns" + ] + } + } + }, + { + "type": "object", + "required": [ + "dataset", + "dns_address", + "gz_address", + "gz_address_index", + "http_address", + "type" + ], + "properties": { + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "dns_address": { + "type": "string" + }, + "gz_address": { + "description": "The addresses in the global zone which should be created\n\nFor the DNS service, which exists outside the sleds's typical subnet - adding an address in the GZ is necessary to allow inter-zone traffic routing.", + "type": "string", + "format": "ipv6" + }, + "gz_address_index": { + "description": "The address is also identified with an auxiliary bit of information to ensure that the created global zone address can have a unique name.", + "type": "integer", + "format": "uint32", + "minimum": 0.0 + }, + "http_address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "internal_dns" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dns_servers", + "ntp_servers", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dns_servers": { + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "domain": { + "type": [ + "string", + "null" + ] + }, + "ntp_servers": { + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "type": "string", + "enum": [ + "internal_ntp" + ] + } + } + }, + { + "type": "object", + "required": [ + "external_dns_servers", + "external_ip", + "external_tls", + "internal_address", + "nic", + "type" + ], + "properties": { + "external_dns_servers": { + "description": "External DNS servers Nexus can use to resolve external hosts.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "external_ip": { + "description": "The address at which the external nexus server is reachable.", + "type": "string", + "format": "ip" + }, + "external_tls": { + "description": "Whether Nexus's external endpoint should use TLS", + "type": "boolean" + }, + "internal_address": { + "description": "The address at which the internal nexus server is reachable.", + "type": "string" + }, + "nic": { + "description": "The service vNIC providing external connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "nexus" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "oximeter" + ] + } + } + } + ] + }, + "SledConfig": { + "type": "object", + "required": [ + "disks", + "zones" + ], + "properties": { + "disks": { + "description": "Control plane disks configured for this sled", + "allOf": [ + { + "$ref": "#/definitions/OmicronPhysicalDisksConfig" + } + ] + }, + "zones": { + "description": "zones configured for this sled", + "type": "array", + "items": { + "$ref": "#/definitions/OmicronZoneConfig" + } + } + } + }, + "SourceNatConfig": { + "description": "An IP address and port range used for source NAT, i.e., making outbound network connections from guests or services.", + "type": "object", + "required": [ + "first_port", + "ip", + "last_port" + 
], + "properties": { + "first_port": { + "description": "The first port used for source NAT, inclusive.", + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "ip": { + "description": "The external address provided to the instance or service.", + "type": "string", + "format": "ip" + }, + "last_port": { + "description": "The last port used for source NAT, also inclusive.", + "type": "integer", + "format": "uint16", + "minimum": 0.0 + } + } + }, + "Srv": { + "description": "Srv\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"port\", \"prio\", \"target\", \"weight\" ], \"properties\": { \"port\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 }, \"prio\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 }, \"target\": { \"type\": \"string\" }, \"weight\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 } } } ```
", + "type": "object", + "required": [ + "port", + "prio", + "target", + "weight" + ], + "properties": { + "port": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "prio": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "target": { + "type": "string" + }, + "weight": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + } + } + }, + "Vni": { + "description": "A Geneve Virtual Network Identifier", + "type": "integer", + "format": "uint32", + "minimum": 0.0 + }, + "ZpoolName": { + "title": "The name of a Zpool", + "description": "Zpool names are of the format ox{i,p}_. They are either Internal or External, and should be unique", + "type": "string", + "pattern": "^ox[ip]_[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$" + } + } +} \ No newline at end of file diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index c941ee2625..734055b9e5 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -106,7 +106,7 @@ tempfile.workspace = true tokio-stream.workspace = true tokio-util.workspace = true -illumos-utils = { workspace = true, features = ["testing", "tmp_keypath"] } +illumos-utils = { workspace = true, features = ["testing"] } sled-storage = { workspace = true, features = ["testing"] } # diff --git a/sled-agent/src/bootstrap/bootstore_setup.rs b/sled-agent/src/bootstrap/bootstore_setup.rs index e5079b978e..ee9a321474 100644 --- a/sled-agent/src/bootstrap/bootstore_setup.rs +++ b/sled-agent/src/bootstrap/bootstore_setup.rs @@ -15,7 +15,7 @@ use omicron_ddm_admin_client::Client as DdmAdminClient; use sled_hardware_types::underlay::BootstrapInterface; use sled_hardware_types::Baseboard; use sled_storage::dataset::CLUSTER_DATASET; -use sled_storage::resources::StorageResources; +use sled_storage::resources::AllDisks; use slog::Logger; use std::collections::BTreeSet; use std::net::Ipv6Addr; @@ -26,7 +26,7 @@ const BOOTSTORE_FSM_STATE_FILE: &str = "bootstore-fsm-state.json"; const BOOTSTORE_NETWORK_CONFIG_FILE: &str = "bootstore-network-config.json"; pub fn new_bootstore_config( - storage_resources: &StorageResources, + all_disks: &AllDisks, baseboard: Baseboard, global_zone_bootstrap_ip: Ipv6Addr, ) -> Result { @@ -37,17 +37,17 @@ pub fn new_bootstore_config( learn_timeout: Duration::from_secs(5), rack_init_timeout: Duration::from_secs(300), rack_secret_request_timeout: Duration::from_secs(5), - fsm_state_ledger_paths: bootstore_fsm_state_paths(&storage_resources)?, + fsm_state_ledger_paths: bootstore_fsm_state_paths(&all_disks)?, network_config_ledger_paths: bootstore_network_config_paths( - &storage_resources, + &all_disks, )?, }) } fn bootstore_fsm_state_paths( - storage: &StorageResources, + all_disks: &AllDisks, ) -> Result, StartError> { - let paths: Vec<_> = storage + let paths: Vec<_> = all_disks .all_m2_mountpoints(CLUSTER_DATASET) .into_iter() .map(|p| p.join(BOOTSTORE_FSM_STATE_FILE)) @@ -60,9 +60,9 @@ fn bootstore_fsm_state_paths( } fn bootstore_network_config_paths( - storage: &StorageResources, + all_disks: &AllDisks, ) -> Result, StartError> { - let paths: Vec<_> = storage + let paths: Vec<_> = all_disks .all_m2_mountpoints(CLUSTER_DATASET) .into_iter() .map(|p| p.join(BOOTSTORE_NETWORK_CONFIG_FILE)) diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 38bedf921c..0657004b72 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -20,7 +20,6 @@ use crate::long_running_tasks::{ use 
crate::services::ServiceManager; use crate::services::TimeSyncConfig; use crate::sled_agent::SledAgent; -use crate::storage_monitor::UnderlayAccess; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; use futures::stream; @@ -54,7 +53,6 @@ pub(super) struct BootstrapAgentStartup { pub(super) service_manager: ServiceManager, pub(super) long_running_task_handles: LongRunningTaskHandles, pub(super) sled_agent_started_tx: oneshot::Sender, - pub(super) underlay_available_tx: oneshot::Sender, } impl BootstrapAgentStartup { @@ -126,7 +124,6 @@ impl BootstrapAgentStartup { long_running_task_handles, sled_agent_started_tx, service_manager_ready_tx, - underlay_available_tx, ) = spawn_all_longrunning_tasks( &base_log, sled_mode, @@ -172,7 +169,6 @@ impl BootstrapAgentStartup { service_manager, long_running_task_handles, sled_agent_started_tx, - underlay_available_tx, }) } } diff --git a/sled-agent/src/bootstrap/server.rs b/sled-agent/src/bootstrap/server.rs index bca3350696..6f61e87663 100644 --- a/sled-agent/src/bootstrap/server.rs +++ b/sled-agent/src/bootstrap/server.rs @@ -26,7 +26,6 @@ use crate::long_running_tasks::LongRunningTaskHandles; use crate::server::Server as SledAgentServer; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use crate::storage_monitor::UnderlayAccess; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use cancel_safe_futures::TryStreamExt; @@ -179,7 +178,6 @@ impl Server { service_manager, long_running_task_handles, sled_agent_started_tx, - underlay_available_tx, } = BootstrapAgentStartup::run(config).await?; // Do we have a StartSledAgentRequest stored in the ledger? @@ -242,7 +240,6 @@ impl Server { &config, start_sled_agent_request, long_running_task_handles.clone(), - underlay_available_tx, service_manager.clone(), &ddm_admin_localhost_client, &base_log, @@ -264,10 +261,7 @@ impl Server { sled_agent.load_services().await; SledAgentState::ServerStarted(sled_agent_server) } else { - SledAgentState::Bootstrapping( - Some(sled_agent_started_tx), - Some(underlay_available_tx), - ) + SledAgentState::Bootstrapping(Some(sled_agent_started_tx)) }; // Spawn our inner task that handles any future hardware updates and any @@ -310,10 +304,7 @@ impl Server { // bootstrap server). enum SledAgentState { // We're still in the bootstrapping phase, waiting for a sled-agent request. - Bootstrapping( - Option>, - Option>, - ), + Bootstrapping(Option>), // ... or the sled agent server is running. 
ServerStarted(SledAgentServer), } @@ -357,7 +348,6 @@ async fn start_sled_agent( config: &SledConfig, request: StartSledAgentRequest, long_running_task_handles: LongRunningTaskHandles, - underlay_available_tx: oneshot::Sender, service_manager: ServiceManager, ddmd_client: &DdmAdminClient, base_log: &Logger, @@ -429,7 +419,6 @@ async fn start_sled_agent( request.clone(), long_running_task_handles.clone(), service_manager, - underlay_available_tx, ) .await .map_err(SledAgentServerStartError::FailedStartingServer)?; @@ -495,7 +484,7 @@ impl From for SledAgentServerStartError { async fn sled_config_paths( storage: &StorageHandle, ) -> Result, MissingM2Paths> { - let resources = storage.get_latest_resources().await; + let resources = storage.get_latest_disks().await; let paths: Vec<_> = resources .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -573,10 +562,7 @@ impl Inner { log: &Logger, ) { match &mut self.state { - SledAgentState::Bootstrapping( - sled_agent_started_tx, - underlay_available_tx, - ) => { + SledAgentState::Bootstrapping(sled_agent_started_tx) => { let request_id = request.body.id; // Extract from options to satisfy the borrow checker. @@ -587,14 +573,11 @@ impl Inner { // See https://github.com/oxidecomputer/omicron/issues/4494 let sled_agent_started_tx = sled_agent_started_tx.take().unwrap(); - let underlay_available_tx = - underlay_available_tx.take().unwrap(); let response = match start_sled_agent( &self.config, request, self.long_running_task_handles.clone(), - underlay_available_tx, self.service_manager.clone(), &self.ddm_admin_localhost_client, &self.base_log, @@ -664,7 +647,7 @@ impl Inner { let config_dirs = self .long_running_task_handles .storage_manager - .get_latest_resources() + .get_latest_disks() .await .all_m2_mountpoints(CONFIG_DATASET) .into_iter(); diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs index 058f343e2a..d084f5f546 100644 --- a/sled-agent/src/config.rs +++ b/sled-agent/src/config.rs @@ -12,7 +12,6 @@ use illumos_utils::dladm::Dladm; use illumos_utils::dladm::FindPhysicalLinkError; use illumos_utils::dladm::PhysicalLink; use illumos_utils::dladm::CHELSIO_LINK_PREFIX; -use illumos_utils::zpool::ZpoolName; use omicron_common::vlan::VlanID; use serde::Deserialize; use sled_hardware::is_gimlet; @@ -65,8 +64,8 @@ pub struct Config { pub swap_device_size_gb: Option, /// Optional VLAN ID to be used for tagging guest VNICs. pub vlan: Option, - /// Optional list of zpools to be used as "discovered disks". - pub zpools: Option>, + /// Optional list of virtual devices to be used as "discovered disks". 
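
The `vdevs` knob introduced just below replaces the old `zpools` list in the non-gimlet config, handing control over "when zpools actually get placed on these devices" back to RSS/Nexus. As a rough sketch of how such a list could be deserialized (the path-valued element type is an assumption of this sketch, as are the `serde`/`toml` crates and the struct name):

```rust
// Rough sketch only: deserializing a `vdevs` list from TOML. Entries are
// assumed to be filesystem paths to file-backed virtual devices; this is not
// the actual sled-agent `Config` type.
use serde::Deserialize;
use std::path::PathBuf;

#[derive(Deserialize, Debug)]
struct SledConfigSketch {
    vdevs: Option<Vec<PathBuf>>,
}

fn main() -> Result<(), toml::de::Error> {
    let raw = r#"
        vdevs = ["m2_helping.vdev", "u2_under_test.vdev"]
    "#;
    let cfg: SledConfigSketch = toml::from_str(raw)?;
    println!("configured vdevs: {:?}", cfg.vdevs);
    Ok(())
}
```
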
+ pub vdevs: Option>, /// Optionally skip waiting for time synchronization pub skip_timesync: Option, diff --git a/sled-agent/src/dump_setup.rs b/sled-agent/src/dump_setup.rs index bdbc008ccb..4717f8b49e 100644 --- a/sled-agent/src/dump_setup.rs +++ b/sled-agent/src/dump_setup.rs @@ -89,13 +89,12 @@ use illumos_utils::dumpadm::{DumpAdm, DumpContentType}; use illumos_utils::zone::ZONE_PREFIX; use illumos_utils::zpool::{ZpoolHealth, ZpoolName}; use illumos_utils::ExecutionError; -use omicron_common::disk::DiskIdentity; use sled_hardware::DiskVariant; +use sled_storage::config::MountConfig; use sled_storage::dataset::{CRASH_DATASET, DUMP_DATASET}; use sled_storage::disk::Disk; -use sled_storage::pool::Pool; use slog::Logger; -use std::collections::{BTreeMap, HashSet}; +use std::collections::HashSet; use std::ffi::OsString; use std::path::{Path, PathBuf}; use std::sync::{Arc, Weak}; @@ -119,32 +118,50 @@ struct DebugDataset(Utf8PathBuf); struct CoreDataset(Utf8PathBuf); #[derive(AsRef, Clone, From)] -pub(super) struct CoreZpool(pub ZpoolName); +struct CoreZpool { + mount_config: MountConfig, + name: ZpoolName, +} + #[derive(AsRef, Clone, From)] -pub(super) struct DebugZpool(pub ZpoolName); +struct DebugZpool { + mount_config: MountConfig, + name: ZpoolName, +} impl GetMountpoint for DebugZpool { type NewType = DebugDataset; const MOUNTPOINT: &'static str = DUMP_DATASET; + fn mount_config(&self) -> &MountConfig { + &self.mount_config + } } impl GetMountpoint for CoreZpool { type NewType = CoreDataset; const MOUNTPOINT: &'static str = CRASH_DATASET; + fn mount_config(&self) -> &MountConfig { + &self.mount_config + } } // only want to access these directories after they're mounted! trait GetMountpoint: AsRef { type NewType: From; const MOUNTPOINT: &'static str; + + fn mount_config(&self) -> &MountConfig; + fn mountpoint( &self, invoker: &dyn ZfsInvoker, ) -> Result, ZfsGetError> { if invoker.zfs_get_prop(&self.as_ref().to_string(), "mounted")? 
== "yes" { - Ok(Some(Self::NewType::from( - invoker.mountpoint(self.as_ref(), Self::MOUNTPOINT), - ))) + Ok(Some(Self::NewType::from(invoker.mountpoint( + self.mount_config(), + self.as_ref(), + Self::MOUNTPOINT, + )))) } else { Ok(None) } @@ -172,12 +189,13 @@ struct DumpSetupWorker { pub struct DumpSetup { worker: Arc>, + mount_config: MountConfig, _poller: std::thread::JoinHandle<()>, log: Logger, } impl DumpSetup { - pub fn new(log: &Logger) -> Self { + pub fn new(log: &Logger, mount_config: MountConfig) -> Self { let worker = Arc::new(std::sync::Mutex::new(DumpSetupWorker::new( Box::new(RealCoreDumpAdm {}), Box::new(RealZfs {}), @@ -190,18 +208,19 @@ impl DumpSetup { Self::poll_file_archival(worker_weak, log_poll) }); let log = log.new(o!("component" => "DumpSetup")); - Self { worker, _poller, log } + Self { worker, mount_config, _poller, log } } pub(crate) async fn update_dumpdev_setup( &self, - disks: &BTreeMap, + disks: impl Iterator, ) { let log = &self.log; let mut m2_dump_slices = Vec::new(); let mut u2_debug_datasets = Vec::new(); let mut m2_core_datasets = Vec::new(); - for (_id, (disk, _)) in disks.iter() { + let mount_config = self.mount_config.clone(); + for disk in disks { if disk.is_synthetic() { // We only setup dump devices on real disks continue; @@ -222,8 +241,10 @@ impl DumpSetup { illumos_utils::zpool::Zpool::get_info(&name.to_string()) { if info.health() == ZpoolHealth::Online { - m2_core_datasets - .push(CoreZpool::from(name.clone())); + m2_core_datasets.push(CoreZpool { + mount_config: mount_config.clone(), + name: name.clone(), + }); } else { warn!(log, "Zpool {name:?} not online, won't attempt to save process core dumps there"); } @@ -235,8 +256,10 @@ impl DumpSetup { illumos_utils::zpool::Zpool::get_info(&name.to_string()) { if info.health() == ZpoolHealth::Online { - u2_debug_datasets - .push(DebugZpool::from(name.clone())); + u2_debug_datasets.push(DebugZpool { + mount_config: mount_config.clone(), + name: name.clone(), + }); } else { warn!(log, "Zpool {name:?} not online, won't attempt to save kernel core dumps there"); } @@ -349,6 +372,7 @@ trait ZfsInvoker { fn mountpoint( &self, + mount_config: &MountConfig, zpool: &ZpoolName, mountpoint: &'static str, ) -> Utf8PathBuf; @@ -458,10 +482,11 @@ impl ZfsInvoker for RealZfs { fn mountpoint( &self, + mount_config: &MountConfig, zpool: &ZpoolName, mountpoint: &'static str, ) -> Utf8PathBuf { - zpool.dataset_mountpoint(mountpoint) + zpool.dataset_mountpoint(&mount_config.root, mountpoint) } } @@ -1120,6 +1145,7 @@ mod tests { fn mountpoint( &self, + _mount_config: &MountConfig, zpool: &ZpoolName, mountpoint: &'static str, ) -> Utf8PathBuf { @@ -1174,8 +1200,10 @@ mod tests { assert_eq!(worker.chosen_core_dir, None); // nothing when only a disk that's not ready - let non_mounted_zpool = - CoreZpool(ZpoolName::from_str(NOT_MOUNTED_INTERNAL).unwrap()); + let non_mounted_zpool = CoreZpool { + mount_config: MountConfig::default(), + name: ZpoolName::from_str(NOT_MOUNTED_INTERNAL).unwrap(), + }; worker.update_disk_loadout(vec![], vec![], vec![non_mounted_zpool]); assert_eq!(worker.chosen_core_dir, None); logctx.cleanup_successful(); @@ -1191,11 +1219,18 @@ mod tests { const MOUNTED_INTERNAL: &str = "oxi_474e554e-6174-616c-6965-4e677579656e"; const ERROR_INTERNAL: &str = "oxi_4861636b-2054-6865-2050-6c616e657421"; - let mounted_zpool = - CoreZpool(ZpoolName::from_str(MOUNTED_INTERNAL).unwrap()); - let non_mounted_zpool = - CoreZpool(ZpoolName::from_str(NOT_MOUNTED_INTERNAL).unwrap()); - let err_zpool = 
CoreZpool(ZpoolName::from_str(ERROR_INTERNAL).unwrap()); + let mounted_zpool = CoreZpool { + mount_config: MountConfig::default(), + name: ZpoolName::from_str(MOUNTED_INTERNAL).unwrap(), + }; + let non_mounted_zpool = CoreZpool { + mount_config: MountConfig::default(), + name: ZpoolName::from_str(NOT_MOUNTED_INTERNAL).unwrap(), + }; + let err_zpool = CoreZpool { + mount_config: MountConfig::default(), + name: ZpoolName::from_str(ERROR_INTERNAL).unwrap(), + }; const ZPOOL_MNT: &str = "/path/to/internal/zpool"; let mut worker = DumpSetupWorker::new( Box::::default(), @@ -1364,8 +1399,10 @@ mod tests { let tempdir = TempDir::new().unwrap(); let (occupied, _) = populate_tempdir_with_fake_dumps(&tempdir); - let mounted_zpool = - DebugZpool(ZpoolName::from_str(MOUNTED_EXTERNAL).unwrap()); + let mounted_zpool = DebugZpool { + mount_config: MountConfig::default(), + name: ZpoolName::from_str(MOUNTED_EXTERNAL).unwrap(), + }; worker.update_disk_loadout( vec![occupied.clone()], vec![mounted_zpool], @@ -1447,10 +1484,14 @@ mod tests { ) .unwrap(); - let mounted_core_zpool = - CoreZpool(ZpoolName::from_str(MOUNTED_INTERNAL).unwrap()); - let mounted_debug_zpool = - DebugZpool(ZpoolName::from_str(MOUNTED_EXTERNAL).unwrap()); + let mounted_core_zpool = CoreZpool { + mount_config: MountConfig::default(), + name: ZpoolName::from_str(MOUNTED_INTERNAL).unwrap(), + }; + let mounted_debug_zpool = DebugZpool { + mount_config: MountConfig::default(), + name: ZpoolName::from_str(MOUNTED_EXTERNAL).unwrap(), + }; worker.update_disk_loadout( vec![], diff --git a/sled-agent/src/hardware_monitor.rs b/sled-agent/src/hardware_monitor.rs index cbd3134cf0..3708a642f3 100644 --- a/sled-agent/src/hardware_monitor.rs +++ b/sled-agent/src/hardware_monitor.rs @@ -177,10 +177,27 @@ impl HardwareMonitor { } } HardwareUpdate::DiskAdded(disk) => { - self.storage_manager.upsert_disk(disk.into()).await; + // We notify the storage manager of the hardware, but do not need to + // wait for the result to be fully processed. + // + // Here and below, we're "dropping a future" rather than + // awaiting it. That's intentional - the hardware monitor + // doesn't care when this work is finished, just when it's + // enqueued. + #[allow(clippy::let_underscore_future)] + let _ = self + .storage_manager + .detected_raw_disk(disk.into()) + .await; } HardwareUpdate::DiskRemoved(disk) => { - self.storage_manager.delete_disk(disk.into()).await; + // We notify the storage manager of the hardware, but do not need to + // wait for the result to be fully processed. + #[allow(clippy::let_underscore_future)] + let _ = self + .storage_manager + .detected_raw_disk_removal(disk.into()) + .await; } }, Err(broadcast::error::RecvError::Lagged(count)) => { @@ -251,7 +268,11 @@ impl HardwareMonitor { self.deactivate_switch().await; } - self.storage_manager + // We notify the storage manager of the hardware, but do not need to + // wait for the result to be fully processed. 
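
The comments in this hunk describe awaiting only the enqueue step and then dropping the completion future. A self-contained sketch of that idiom, with hypothetical types rather than the real `StorageHandle`/`StorageManager` API (assumes `tokio`):

```rust
// Sketch of "await the enqueue, optionally drop the completion future".
// Hypothetical types; not the real StorageManager/StorageHandle API.
use std::future::Future;
use tokio::sync::{mpsc, oneshot};

struct Handle {
    tx: mpsc::Sender<(String, oneshot::Sender<()>)>,
}

impl Handle {
    // Awaiting this method only waits for the request to be enqueued; the
    // future it returns resolves once the worker has finished processing.
    async fn detected_raw_disk(&self, disk: String) -> impl Future<Output = ()> {
        let (done_tx, done_rx) = oneshot::channel();
        // Ignore send errors for brevity (the worker could have exited).
        let _ = self.tx.send((disk, done_tx)).await;
        async move {
            let _ = done_rx.await;
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(16);
    let handle = Handle { tx };

    // Worker task: processes each request, then signals completion.
    tokio::spawn(async move {
        while let Some((disk, done)) = rx.recv().await {
            println!("processing {disk}");
            let _ = done.send(());
        }
    });

    // A caller that only cares the work was enqueued drops the returned future.
    #[allow(clippy::let_underscore_future)]
    let _ = handle.detected_raw_disk("u2_under_test.vdev".to_string()).await;

    // A caller that needs the work finished awaits it a second time.
    handle.detected_raw_disk("m2_helping.vdev".to_string()).await.await;
}
```
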
+ #[allow(clippy::let_underscore_future)] + let _ = self + .storage_manager .ensure_using_exactly_these_disks( self.hardware_manager.disks().into_iter().map(RawDisk::from), ) diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index bf1102d897..23a1bde4d8 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -11,8 +11,8 @@ use crate::params::{ BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, Inventory, - OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRulesEnsureBody, - ZoneBundleId, ZoneBundleMetadata, Zpool, + OmicronPhysicalDisksConfig, OmicronZonesConfig, SledRole, TimeSync, + VpcFirewallRulesEnsureBody, ZoneBundleId, ZoneBundleMetadata, Zpool, }; use crate::sled_agent::Error as SledAgentError; use crate::zone_bundle; @@ -40,6 +40,7 @@ use oximeter_producer::ProducerIdPathParams; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_hardware::DiskVariant; +use sled_storage::resources::DisksManagementResult; use std::collections::BTreeMap; use uuid::Uuid; @@ -60,6 +61,8 @@ pub fn api() -> SledApiDescription { api.register(omicron_zones_get)?; api.register(omicron_zones_put)?; api.register(zones_list)?; + api.register(omicron_physical_disks_get)?; + api.register(omicron_physical_disks_put)?; api.register(zone_bundle_list)?; api.register(zone_bundle_list_all)?; api.register(zone_bundle_create)?; @@ -338,6 +341,31 @@ async fn omicron_zones_get( Ok(HttpResponseOk(sa.omicron_zones_list().await?)) } +#[endpoint { + method = PUT, + path = "/omicron-physical-disks", +}] +async fn omicron_physical_disks_put( + rqctx: RequestContext, + body: TypedBody, +) -> Result, HttpError> { + let sa = rqctx.context(); + let body_args = body.into_inner(); + let result = sa.omicron_physical_disks_ensure(body_args).await?; + Ok(HttpResponseOk(result)) +} + +#[endpoint { + method = GET, + path = "/omicron-physical-disks", +}] +async fn omicron_physical_disks_get( + rqctx: RequestContext, +) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) +} + #[endpoint { method = PUT, path = "/omicron-zones", @@ -839,8 +867,8 @@ async fn host_os_write_start( // Find our corresponding disk. let maybe_disk_path = - sa.storage().get_latest_resources().await.disks().values().find_map( - |(disk, _pool)| { + sa.storage().get_latest_disks().await.iter_managed().find_map( + |(_identity, disk)| { // Synthetic disks panic if asked for their `slot()`, so filter // them out first; additionally, filter out any non-M2 disks. 
if disk.is_synthetic() || disk.variant() != DiskVariant::M2 { diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index b859c08a94..d016715591 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1340,7 +1340,7 @@ impl InstanceRunner { let mut rng = rand::rngs::StdRng::from_entropy(); let root = self .storage - .get_latest_resources() + .get_latest_disks() .await .all_u2_mountpoints(ZONE_DATASET) .choose(&mut rng) @@ -1520,17 +1520,15 @@ impl InstanceRunner { } } -#[cfg(test)] +#[cfg(all(test, target_os = "illumos"))] mod tests { use super::*; use crate::fakes::nexus::{FakeNexusServer, ServerContext}; - use crate::nexus::NexusClient; use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::CleanupContext; use camino_tempfile::Utf8TempDir; - use dns_server::dns_server::ServerHandle as DnsServerHandle; - use dropshot::test_util::LogContext; - use dropshot::{HandlerTaskMode, HttpServer}; + use dns_server::TransientServer; + use dropshot::HttpServer; use illumos_utils::dladm::MockDladm; use illumos_utils::dladm::__mock_MockDladm::__create_vnic::Context as MockDladmCreateVnicContext; use illumos_utils::dladm::__mock_MockDladm::__delete_vnic::Context as MockDladmDeleteVnicContext; @@ -1539,15 +1537,13 @@ mod tests { use illumos_utils::zone::MockZones; use illumos_utils::zone::__mock_MockZones::__boot::Context as MockZonesBootContext; use illumos_utils::zone::__mock_MockZones::__id::Context as MockZonesIdContext; - use illumos_utils::zpool::ZpoolName; use internal_dns::resolver::Resolver; - use internal_dns::ServiceName; use omicron_common::api::external::{ ByteCount, Generation, Hostname, InstanceCpuCount, InstanceState, }; use omicron_common::api::internal::nexus::InstanceProperties; - use sled_storage::disk::{RawDisk, SyntheticDisk}; - use sled_storage::manager::FakeStorageManager; + use omicron_common::FileKv; + use sled_storage::manager_test_harness::StorageManagerTestHarness; use std::net::Ipv6Addr; use std::str::FromStr; use tokio::sync::watch::Receiver; @@ -1584,26 +1580,42 @@ mod tests { } struct FakeNexusParts { - nexus_client: NexusClient, - nexus_server: HttpServer, + nexus_client: NexusClientWithResolver, + _nexus_server: HttpServer, state_rx: Receiver, + _dns_server: TransientServer, } impl FakeNexusParts { - fn new(logctx: &LogContext) -> Self { + async fn new(log: &Logger) -> Self { let (state_tx, state_rx) = tokio::sync::watch::channel(ReceivedInstanceState::None); - let nexus_server = crate::fakes::nexus::start_test_server( - logctx.log.new(o!("component" => "FakeNexusServer")), + let _nexus_server = crate::fakes::nexus::start_test_server( + log.new(o!("component" => "FakeNexusServer")), Box::new(NexusServer { observed_runtime_state: state_tx }), ); - let nexus_client = NexusClient::new( - &format!("http://{}", nexus_server.local_addr()), - logctx.log.new(o!("component" => "NexusClient")), + + let _dns_server = + crate::fakes::nexus::start_dns_server(&log, &_nexus_server) + .await; + + let resolver = Arc::new( + Resolver::new_from_addrs( + log.clone(), + &[_dns_server.dns_server.local_address()], + ) + .unwrap(), ); - Self { nexus_client, nexus_server, state_rx } + let nexus_client = + NexusClientWithResolver::new_from_resolver_with_port( + &log, + resolver, + _nexus_server.local_addr().port(), + ); + + Self { nexus_client, _nexus_server, state_rx, _dns_server } } } @@ -1639,65 +1651,6 @@ mod tests { (boot_ctx, wait_ctx, zone_id_ctx) } - async fn dns_server( - logctx: &LogContext, - nexus_server: &HttpServer, - ) -> 
(DnsServerHandle, Arc, Utf8TempDir) { - let storage_path = - Utf8TempDir::new().expect("Failed to create temporary directory"); - let config_store = dns_server::storage::Config { - keep_old_generations: 3, - storage_path: storage_path.path().to_owned(), - }; - - let (dns_server, dns_dropshot) = dns_server::start_servers( - logctx.log.new(o!("component" => "DnsServer")), - dns_server::storage::Store::new( - logctx.log.new(o!("component" => "DnsStore")), - &config_store, - ) - .unwrap(), - &dns_server::dns_server::Config { - bind_address: "[::1]:0".parse().unwrap(), - }, - &dropshot::ConfigDropshot { - bind_address: "[::1]:0".parse().unwrap(), - request_body_max_bytes: 8 * 1024, - default_handler_task_mode: HandlerTaskMode::Detached, - }, - ) - .await - .expect("starting DNS server"); - - let dns_dropshot_client = dns_service_client::Client::new( - &format!("http://{}", dns_dropshot.local_addr()), - logctx.log.new(o!("component" => "DnsDropshotClient")), - ); - let mut dns_config = internal_dns::DnsConfigBuilder::new(); - let IpAddr::V6(nexus_ip_addr) = nexus_server.local_addr().ip() else { - panic!("IPv6 address required for nexus_server") - }; - let zone = dns_config.host_zone(Uuid::new_v4(), nexus_ip_addr).unwrap(); - dns_config - .service_backend_zone( - ServiceName::Nexus, - &zone, - nexus_server.local_addr().port(), - ) - .unwrap(); - let dns_config = dns_config.build_full_config_for_initial_generation(); - dns_dropshot_client.dns_config_put(&dns_config).await.unwrap(); - - let resolver = Arc::new( - Resolver::new_from_addrs( - logctx.log.new(o!("component" => "Resolver")), - &[dns_server.local_address()], - ) - .unwrap(), - ); - (dns_server, resolver, storage_path) - } - // note the "mock" here is different from the vnic/zone contexts above. // this is actually running code for a dropshot server from propolis. 
// (might we want a locally-defined fake whose behavior we can control @@ -1736,19 +1689,22 @@ mod tests { (srv, client) } - // make a FakeStorageManager with a "U2" upserted - async fn fake_storage_manager_with_u2() -> StorageHandle { - let (storage_manager, storage_handle) = FakeStorageManager::new(); - tokio::spawn(storage_manager.run()); - let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let external_disk: RawDisk = - SyntheticDisk::new(external_zpool_name, 0).into(); - storage_handle.upsert_disk(external_disk).await; - storage_handle + async fn setup_storage_manager(log: &Logger) -> StorageManagerTestHarness { + let mut harness = StorageManagerTestHarness::new(log).await; + let raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + harness.handle().key_manager_ready().await; + let config = harness.make_config(1, &raw_disks); + let _ = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring disks should work after key manager is ready"); + harness } async fn instance_struct( - logctx: &LogContext, + log: &Logger, propolis_addr: SocketAddr, nexus_client_with_resolver: NexusClientWithResolver, storage_handle: StorageHandle, @@ -1763,7 +1719,7 @@ mod tests { fake_instance_initial_state(propolis_id, propolis_addr); let services = fake_instance_manager_services( - logctx, + log, storage_handle, nexus_client_with_resolver, temp_dir, @@ -1775,7 +1731,7 @@ mod tests { }; Instance::new( - logctx.log.new(o!("component" => "Instance")), + log.new(o!("component" => "Instance")), id, propolis_id, ticket, @@ -1833,7 +1789,7 @@ mod tests { } fn fake_instance_manager_services( - logctx: &LogContext, + log: &Logger, storage_handle: StorageHandle, nexus_client_with_resolver: NexusClientWithResolver, temp_dir: &String, @@ -1841,13 +1797,13 @@ mod tests { let vnic_allocator = VnicAllocator::new("Foo", Etherstub("mystub".to_string())); let port_manager = PortManager::new( - logctx.log.new(o!("component" => "PortManager")), + log.new(o!("component" => "PortManager")), Ipv6Addr::new(0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01), ); let cleanup_context = CleanupContext::default(); let zone_bundler = ZoneBundler::new( - logctx.log.new(o!("component" => "ZoneBundler")), + log.new(o!("component" => "ZoneBundler")), storage_handle.clone(), cleanup_context, ); @@ -1867,27 +1823,24 @@ mod tests { let logctx = omicron_test_utils::dev::test_setup_log( "test_instance_create_events_normal", ); + let log = logctx.log.new(o!(FileKv)); - let (propolis_server, _propolis_client) = - propolis_mock_server(&logctx.log); + let (propolis_server, _propolis_client) = propolis_mock_server(&log); let propolis_addr = propolis_server.local_addr(); // automock'd things used during this test let _mock_vnic_contexts = mock_vnic_contexts(); let _mock_zone_contexts = mock_zone_contexts(); - let FakeNexusParts { nexus_client, nexus_server, mut state_rx } = - FakeNexusParts::new(&logctx); - - let (_dns_server, resolver, _dns_config_dir) = - timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) - .await - .expect("timed out making DNS server and Resolver"); - - let nexus_client_with_resolver = - NexusClientWithResolver::new_with_client(nexus_client, resolver); + let FakeNexusParts { + nexus_client, + mut state_rx, + _dns_server, + _nexus_server, + } = FakeNexusParts::new(&log).await; - let storage_handle = fake_storage_manager_with_u2().await; + let mut storage_harness = setup_storage_manager(&log).await; + let storage_handle = 
storage_harness.handle().clone(); let temp_guard = Utf8TempDir::new().unwrap(); let temp_dir = temp_guard.path().to_string(); @@ -1895,9 +1848,9 @@ mod tests { let inst = timeout( TIMEOUT_DURATION, instance_struct( - &logctx, + &log, propolis_addr, - nexus_client_with_resolver, + nexus_client, storage_handle, &temp_dir, ), @@ -1935,6 +1888,7 @@ mod tests { .expect("timed out waiting for InstanceState::Running in FakeNexus") .expect("failed to receive FakeNexus' InstanceState"); + storage_harness.cleanup().await; logctx.cleanup_successful(); } @@ -1944,23 +1898,21 @@ mod tests { let logctx = omicron_test_utils::dev::test_setup_log( "test_instance_create_timeout_while_starting_propolis", ); + let log = logctx.log.new(o!(FileKv)); // automock'd things used during this test let _mock_vnic_contexts = mock_vnic_contexts(); let _mock_zone_contexts = mock_zone_contexts(); - let FakeNexusParts { nexus_client, nexus_server, state_rx } = - FakeNexusParts::new(&logctx); - - let (_dns_server, resolver, _dns_config_dir) = - timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) - .await - .expect("timed out making DNS server and Resolver"); - - let nexus_client_with_resolver = - NexusClientWithResolver::new_with_client(nexus_client, resolver); + let FakeNexusParts { + nexus_client, + state_rx, + _dns_server, + _nexus_server, + } = FakeNexusParts::new(&log).await; - let storage_handle = fake_storage_manager_with_u2().await; + let mut storage_harness = setup_storage_manager(&logctx.log).await; + let storage_handle = storage_harness.handle().clone(); let temp_guard = Utf8TempDir::new().unwrap(); let temp_dir = temp_guard.path().to_string(); @@ -1968,10 +1920,10 @@ mod tests { let inst = timeout( TIMEOUT_DURATION, instance_struct( - &logctx, + &log, // we want to test propolis not ever coming up SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), - nexus_client_with_resolver, + nexus_client, storage_handle, &temp_dir, ), @@ -2007,6 +1959,7 @@ mod tests { panic!("Nexus's InstanceState should never have reached running if zone creation timed out"); } + storage_harness.cleanup().await; logctx.cleanup_successful(); } @@ -2015,6 +1968,7 @@ mod tests { let logctx = omicron_test_utils::dev::test_setup_log( "test_instance_create_timeout_while_creating_zone", ); + let log = logctx.log.new(o!(FileKv)); // automock'd things used during this test let _mock_vnic_contexts = mock_vnic_contexts(); @@ -2032,18 +1986,15 @@ mod tests { let zone_id_ctx = MockZones::id_context(); zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1))); - let FakeNexusParts { nexus_client, nexus_server, state_rx } = - FakeNexusParts::new(&logctx); - - let (_dns_server, resolver, _dns_config_dir) = - timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) - .await - .expect("timed out making DNS server and Resolver"); - - let nexus_client_with_resolver = - NexusClientWithResolver::new_with_client(nexus_client, resolver); + let FakeNexusParts { + nexus_client, + state_rx, + _dns_server, + _nexus_server, + } = FakeNexusParts::new(&log).await; - let storage_handle = fake_storage_manager_with_u2().await; + let mut storage_harness = setup_storage_manager(&logctx.log).await; + let storage_handle = storage_harness.handle().clone(); let temp_guard = Utf8TempDir::new().unwrap(); let temp_dir = temp_guard.path().to_string(); @@ -2051,10 +2002,10 @@ mod tests { let inst = timeout( TIMEOUT_DURATION, instance_struct( - &logctx, + &log, // isn't running because the "zone" never "boots" 
SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), - nexus_client_with_resolver, + nexus_client, storage_handle, &temp_dir, ), @@ -2090,6 +2041,7 @@ mod tests { panic!("Nexus's InstanceState should never have reached running if zone creation timed out"); } + storage_harness.cleanup().await; logctx.cleanup_successful(); } @@ -2098,23 +2050,21 @@ mod tests { let logctx = omicron_test_utils::dev::test_setup_log( "test_instance_manager_creation", ); + let log = logctx.log.new(o!(FileKv)); // automock'd things used during this test let _mock_vnic_contexts = mock_vnic_contexts(); let _mock_zone_contexts = mock_zone_contexts(); - let storage_handle = fake_storage_manager_with_u2().await; + let mut storage_harness = setup_storage_manager(&logctx.log).await; + let storage_handle = storage_harness.handle().clone(); - let FakeNexusParts { nexus_client, nexus_server, mut state_rx } = - FakeNexusParts::new(&logctx); - - let (_dns_server, resolver, _dns_config_dir) = - timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) - .await - .expect("timed out making DNS server and Resolver"); - - let nexus_client_with_resolver = - NexusClientWithResolver::new_with_client(nexus_client, resolver); + let FakeNexusParts { + nexus_client, + mut state_rx, + _dns_server, + _nexus_server, + } = FakeNexusParts::new(&log).await; let temp_guard = Utf8TempDir::new().unwrap(); let temp_dir = temp_guard.path().to_string(); @@ -2127,9 +2077,9 @@ mod tests { zone_bundler, zone_builder_factory, } = fake_instance_manager_services( - &logctx, + &log, storage_handle, - nexus_client_with_resolver, + nexus_client, &temp_dir, ); @@ -2196,6 +2146,7 @@ mod tests { .expect("timed out waiting for InstanceState::Running in FakeNexus") .expect("failed to receive FakeNexus' InstanceState"); + storage_harness.cleanup().await; logctx.cleanup_successful(); } } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 2c9780b3ce..cf6563b117 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -750,7 +750,7 @@ impl InstanceTicket { InstanceTicket { id, terminate_tx: Some(terminate_tx) } } - #[cfg(test)] + #[cfg(all(test, target_os = "illumos"))] pub(crate) fn new_without_manager_for_test(id: Uuid) -> Self { Self { id, terminate_tx: None } } diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 3b29bdda60..9b0ea7ac6c 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -20,12 +20,13 @@ use crate::config::Config; use crate::hardware_monitor::HardwareMonitor; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use crate::storage_monitor::{StorageMonitor, UnderlayAccess}; +use crate::storage_monitor::StorageMonitor; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; use sled_hardware::{HardwareManager, SledMode}; -use sled_storage::disk::SyntheticDisk; +use sled_storage::config::MountConfig; +use sled_storage::disk::RawSyntheticDisk; use sled_storage::manager::{StorageHandle, StorageManager}; use slog::{info, Logger}; use std::net::Ipv6Addr; @@ -65,14 +66,12 @@ pub async fn spawn_all_longrunning_tasks( LongRunningTaskHandles, oneshot::Sender, oneshot::Sender, - oneshot::Sender, ) { let storage_key_requester = spawn_key_manager(log); let mut storage_manager = spawn_storage_manager(log, storage_key_requester.clone()); - let underlay_available_tx = - 
spawn_storage_monitor(log, storage_manager.clone()); + spawn_storage_monitor(log, storage_manager.clone()); let hardware_manager = spawn_hardware_manager(log, sled_mode).await; @@ -81,7 +80,7 @@ pub async fn spawn_all_longrunning_tasks( spawn_hardware_monitor(log, &hardware_manager, &storage_manager); // Add some synthetic disks if necessary. - upsert_synthetic_zpools_if_needed(&log, &storage_manager, &config).await; + upsert_synthetic_disks_if_needed(&log, &storage_manager, &config).await; // Wait for the boot disk so that we can work with any ledgers, // such as those needed by the bootstore and sled-agent @@ -109,7 +108,6 @@ pub async fn spawn_all_longrunning_tasks( }, sled_agent_started_tx, service_manager_ready_tx, - underlay_available_tx, ) } @@ -127,24 +125,21 @@ fn spawn_storage_manager( key_requester: StorageKeyRequester, ) -> StorageHandle { info!(log, "Starting StorageManager"); - let (manager, handle) = StorageManager::new(log, key_requester); + let (manager, handle) = + StorageManager::new(log, MountConfig::default(), key_requester); tokio::spawn(async move { manager.run().await; }); handle } -fn spawn_storage_monitor( - log: &Logger, - storage_handle: StorageHandle, -) -> oneshot::Sender { +fn spawn_storage_monitor(log: &Logger, storage_handle: StorageHandle) { info!(log, "Starting StorageMonitor"); - let (storage_monitor, underlay_available_tx) = - StorageMonitor::new(log, storage_handle); + let storage_monitor = + StorageMonitor::new(log, MountConfig::default(), storage_handle); tokio::spawn(async move { storage_monitor.run().await; }); - underlay_available_tx } async fn spawn_hardware_manager( @@ -188,9 +183,9 @@ async fn spawn_bootstore_tasks( hardware_manager: &HardwareManager, global_zone_bootstrap_ip: Ipv6Addr, ) -> bootstore::NodeHandle { - let storage_resources = storage_handle.get_latest_resources().await; + let iter_all = storage_handle.get_latest_disks().await; let config = new_bootstore_config( - &storage_resources, + &iter_all, hardware_manager.baseboard(), global_zone_bootstrap_ip, ) @@ -222,21 +217,22 @@ fn spawn_zone_bundler_tasks( ZoneBundler::new(log, storage_handle.clone(), CleanupContext::default()) } -async fn upsert_synthetic_zpools_if_needed( +async fn upsert_synthetic_disks_if_needed( log: &Logger, storage_manager: &StorageHandle, config: &Config, ) { - if let Some(pools) = &config.zpools { - for (i, pool) in pools.iter().enumerate() { + if let Some(vdevs) = &config.vdevs { + for (i, vdev) in vdevs.iter().enumerate() { info!( log, - "Upserting synthetic zpool to Storage Manager: {}", - pool.to_string() + "Upserting synthetic device to Storage Manager"; + "vdev" => vdev.to_string(), ); - let disk = - SyntheticDisk::new(pool.clone(), i.try_into().unwrap()).into(); - storage_manager.upsert_disk(disk).await; + let disk = RawSyntheticDisk::load(vdev, i.try_into().unwrap()) + .expect("Failed to parse synthetic disk") + .into(); + storage_manager.detected_raw_disk(disk).await.await.unwrap(); } } } diff --git a/sled-agent/src/nexus.rs b/sled-agent/src/nexus.rs index 3f24c6a806..12fcc05ce3 100644 --- a/sled-agent/src/nexus.rs +++ b/sled-agent/src/nexus.rs @@ -60,16 +60,6 @@ impl NexusClientWithResolver { } } - // for when we have a NexusClient constructed from a FakeNexusServer - // (no need to expose this function outside of tests) - #[cfg(test)] - pub(crate) fn new_with_client( - client: NexusClient, - resolver: Arc, - ) -> Self { - Self { client, resolver } - } - /// Access the progenitor-based Nexus Client. 
pub fn client(&self) -> &NexusClient { &self.client diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index c9e0211690..12c2907f49 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -294,6 +294,11 @@ impl std::fmt::Display for ZoneType { } } +pub type OmicronPhysicalDiskConfig = + sled_storage::disk::OmicronPhysicalDiskConfig; +pub type OmicronPhysicalDisksConfig = + sled_storage::disk::OmicronPhysicalDisksConfig; + /// Describes the set of Omicron-managed zones running on a sled #[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index 8481dc4b79..16559039a2 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -206,7 +206,7 @@ impl ProbeManagerInner { let mut rng = rand::rngs::StdRng::from_entropy(); let root = self .storage - .get_latest_resources() + .get_latest_disks() .await .all_u2_mountpoints(ZONE_DATASET) .choose(&mut rng) diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 153031a545..9e0a2941c5 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -5,7 +5,10 @@ //! Plan generation for "where should services be initialized". use crate::bootstrap::params::StartSledAgentRequest; -use crate::params::{OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType}; +use crate::params::{ + OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, OmicronZoneConfig, + OmicronZoneDataset, OmicronZoneType, +}; use crate::rack_setup::config::SetupServiceConfig as Config; use camino::Utf8PathBuf; use dns_service_client::types::DnsConfigParams; @@ -18,7 +21,7 @@ use omicron_common::address::{ MGD_PORT, MGS_PORT, NEXUS_REDUNDANCY, NTP_PORT, NUM_SOURCE_NAT_PORTS, RSS_RESERVED_ADDRESSES, SLED_PREFIX, }; -use omicron_common::api::external::{MacAddr, Vni}; +use omicron_common::api::external::{Generation, MacAddr, Vni}; use omicron_common::api::internal::shared::{ NetworkInterface, NetworkInterfaceKind, SourceNatConfig, }; @@ -59,7 +62,7 @@ const CLICKHOUSE_COUNT: usize = 1; const CLICKHOUSE_KEEPER_COUNT: usize = 0; // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove. // when Nexus provisions Crucible. -const MINIMUM_U2_ZPOOL_COUNT: usize = 3; +const MINIMUM_U2_COUNT: usize = 3; // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove. // when Nexus provisions the Pantry. 
const PANTRY_COUNT: usize = 3; @@ -94,10 +97,16 @@ pub enum PlanError { #[error("Found only v1 service plan")] FoundV1, + + #[error("Found only v2 service plan")] + FoundV2, } #[derive(Clone, Debug, Default, Serialize, Deserialize, JsonSchema)] pub struct SledConfig { + /// Control plane disks configured for this sled + pub disks: OmicronPhysicalDisksConfig, + /// zones configured for this sled pub zones: Vec, } @@ -115,7 +124,8 @@ impl Ledgerable for Plan { fn generation_bump(&mut self) {} } const RSS_SERVICE_PLAN_V1_FILENAME: &str = "rss-service-plan.json"; -const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan-v2.json"; +const RSS_SERVICE_PLAN_V2_FILENAME: &str = "rss-service-plan-v2.json"; +const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan-v3.json"; impl Plan { pub async fn load( @@ -123,7 +133,7 @@ impl Plan { storage_manager: &StorageHandle, ) -> Result, PlanError> { let paths: Vec = storage_manager - .get_latest_resources() + .get_latest_disks() .await .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -167,6 +177,14 @@ impl Plan { // support a condition that we do not believe can ever happen in any // system. Err(PlanError::FoundV1) + } else if Self::has_v2(storage_manager).await.map_err(|err| { + // Same as the comment above, but for version 2. + PlanError::Io { + message: String::from("looking for v2 RSS plan"), + err, + } + })? { + Err(PlanError::FoundV2) } else { Ok(None) } @@ -176,7 +194,7 @@ impl Plan { storage_manager: &StorageHandle, ) -> Result { let paths = storage_manager - .get_latest_resources() + .get_latest_disks() .await .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -191,6 +209,25 @@ impl Plan { Ok(false) } + async fn has_v2( + storage_manager: &StorageHandle, + ) -> Result { + let paths = storage_manager + .get_latest_disks() + .await + .all_m2_mountpoints(CONFIG_DATASET) + .into_iter() + .map(|p| p.join(RSS_SERVICE_PLAN_V2_FILENAME)); + + for p in paths { + if p.try_exists()? { + return Ok(true); + } + } + + Ok(false) + } + async fn is_sled_scrimlet( log: &Logger, address: SocketAddrV6, @@ -214,11 +251,10 @@ impl Plan { } } - // Gets zpool UUIDs from U.2 devices on the sled. 
- async fn get_u2_zpools_from_sled( + async fn get_inventory( log: &Logger, address: SocketAddrV6, - ) -> Result, PlanError> { + ) -> Result { let dur = std::time::Duration::from_secs(60); let client = reqwest::ClientBuilder::new() .connect_timeout(dur) @@ -231,52 +267,47 @@ impl Plan { log.new(o!("SledAgentClient" => address.to_string())), ); - let get_u2_zpools = || async { - let zpools: Vec = client - .zpools_get() + let get_inventory = || async { + let inventory = client + .inventory() .await - .map(|response| { - response - .into_inner() - .into_iter() - .filter_map(|zpool| match zpool.disk_type { - SledAgentTypes::DiskType::U2 => { - Some(ZpoolName::new_external(zpool.id)) - } - SledAgentTypes::DiskType::M2 => None, - }) - .collect() - }) + .map(|response| response.into_inner()) .map_err(|err| { BackoffError::transient(PlanError::SledApi(err)) })?; - if zpools.len() < MINIMUM_U2_ZPOOL_COUNT { + if inventory + .disks + .iter() + .filter(|disk| { + matches!(disk.variant, SledAgentTypes::DiskVariant::U2) + }) + .count() + < MINIMUM_U2_COUNT + { return Err(BackoffError::transient( - PlanError::SledInitialization( - "Awaiting zpools".to_string(), - ), + PlanError::SledInitialization("Awaiting disks".to_string()), )); } - Ok(zpools) + Ok(inventory) }; - let log_failure = |error, call_count, total_duration| { + let log_failure = |error: PlanError, call_count, total_duration| { if call_count == 0 { - info!(log, "failed to get zpools from {address}"; "error" => ?error); + info!(log, "failed to get inventory from {address}"; "error" => ?error); } else if total_duration > std::time::Duration::from_secs(20) { - warn!(log, "failed to get zpools from {address}"; "error" => ?error, "total duration" => ?total_duration); + warn!(log, "failed to get inventory from {address}"; "error" => ?error, "total duration" => ?total_duration); } }; - let u2_zpools = retry_notify_ext( + let inventory = retry_notify_ext( retry_policy_internal_service_aggressive(), - get_u2_zpools, + get_inventory, log_failure, ) .await?; - Ok(u2_zpools) + Ok(inventory) } pub fn create_transient( @@ -307,6 +338,37 @@ impl Plan { .unwrap(); } + // Set up storage early, as it'll be necessary for placement of + // many subsequent services. + // + // Our policy at RSS time is currently "adopt all the U.2 disks we can see". + for sled_info in sled_info.iter_mut() { + let disks = sled_info + .inventory + .disks + .iter() + .filter(|disk| { + matches!(disk.variant, SledAgentTypes::DiskVariant::U2) + }) + .map(|disk| OmicronPhysicalDiskConfig { + identity: disk.identity.clone(), + id: Uuid::new_v4(), + pool_id: Uuid::new_v4(), + }) + .collect(); + sled_info.request.disks = OmicronPhysicalDisksConfig { + generation: Generation::new(), + disks, + }; + sled_info.u2_zpools = sled_info + .request + .disks + .disks + .iter() + .map(|disk| ZpoolName::new_external(disk.pool_id)) + .collect(); + } + // We'll stripe most services across all available Sleds, round-robin // style. In development and CI, this might only be one Sled. 
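An aside on the adoption step in the hunk above: it is effectively a pure function from a sled's inventory to a disk config (plus the zpool ids derived from it). The sketch below is illustrative only; `InventoryDisk`, `DiskConfig`, and `DisksConfig` are local stand-ins rather than the real `SledAgentTypes`/`sled_storage` types, and the only behavior assumed is what the hunk shows: keep the U.2s, mint a fresh disk id and pool id for each, and start at the initial generation.

```rust
use uuid::Uuid;

// Local stand-ins for the inventory/config types used in this plan
// (illustrative only; not the real crate types).
#[derive(Clone)]
struct DiskIdentity { vendor: String, serial: String, model: String }

#[derive(Clone, Copy, PartialEq)]
enum DiskVariant { U2, M2 }

struct InventoryDisk { identity: DiskIdentity, variant: DiskVariant }

struct DiskConfig {
    identity: DiskIdentity,
    id: Uuid,      // control-plane id for the physical disk
    pool_id: Uuid, // id of the zpool that will be placed on it
}

struct DisksConfig { generation: u64, disks: Vec<DiskConfig> }

/// The RSS-time policy sketched above: adopt every U.2 the sled reports,
/// assigning each one a fresh disk id and zpool id.
fn adopt_all_u2s(inventory: &[InventoryDisk]) -> DisksConfig {
    let disks = inventory
        .iter()
        .filter(|d| d.variant == DiskVariant::U2)
        .map(|d| DiskConfig {
            identity: d.identity.clone(),
            id: Uuid::new_v4(),
            pool_id: Uuid::new_v4(),
        })
        .collect();
    // Generation 1: the first disk configuration a sled ever receives.
    DisksConfig { generation: 1, disks }
}
```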
We'll // only report `NotEnoughSleds` below if there are zero Sleds or if we @@ -708,16 +770,15 @@ impl Plan { |sled_request| async { let subnet = sled_request.body.subnet; let sled_address = get_sled_address(subnet); - let u2_zpools = - Self::get_u2_zpools_from_sled(log, sled_address) - .await?; + let inventory = + Self::get_inventory(log, sled_address).await?; let is_scrimlet = Self::is_sled_scrimlet(log, sled_address).await?; Ok(SledInfo::new( sled_request.body.id, subnet, sled_address, - u2_zpools, + inventory, is_scrimlet, )) }, @@ -730,7 +791,7 @@ impl Plan { // Once we've constructed a plan, write it down to durable storage. let paths: Vec = storage_manager - .get_latest_resources() + .get_latest_disks() .await .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -771,7 +832,9 @@ pub struct SledInfo { subnet: Ipv6Subnet, /// the address of the Sled Agent on the sled's subnet pub sled_address: SocketAddrV6, - /// the list of zpools on the Sled + /// the inventory returned by the Sled + inventory: SledAgentTypes::Inventory, + /// The Zpools available for usage by services u2_zpools: Vec, /// spreads components across a Sled's zpools u2_zpool_allocators: @@ -789,14 +852,15 @@ impl SledInfo { sled_id: Uuid, subnet: Ipv6Subnet, sled_address: SocketAddrV6, - u2_zpools: Vec, + inventory: SledAgentTypes::Inventory, is_scrimlet: bool, ) -> SledInfo { SledInfo { sled_id, subnet, sled_address, - u2_zpools, + inventory, + u2_zpools: vec![], u2_zpool_allocators: HashMap::new(), is_scrimlet, addr_alloc: AddressBumpAllocator::new(subnet), @@ -1207,10 +1271,10 @@ mod tests { } #[test] - fn test_rss_service_plan_v2_schema() { + fn test_rss_service_plan_v3_schema() { let schema = schemars::schema_for!(Plan); expectorate::assert_contents( - "../schema/rss-service-plan-v2.json", + "../schema/rss-service-plan-v3.json", &serde_json::to_string_pretty(&schema).unwrap(), ); } diff --git a/sled-agent/src/rack_setup/plan/sled.rs b/sled-agent/src/rack_setup/plan/sled.rs index efdd86d2f9..a3fd57369a 100644 --- a/sled-agent/src/rack_setup/plan/sled.rs +++ b/sled-agent/src/rack_setup/plan/sled.rs @@ -59,7 +59,7 @@ impl Plan { storage: &StorageHandle, ) -> Result, PlanError> { let paths: Vec = storage - .get_latest_resources() + .get_latest_disks() .await .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -126,7 +126,7 @@ impl Plan { // Once we've constructed a plan, write it down to durable storage. let paths: Vec = storage_manager - .get_latest_resources() + .get_latest_disks() .await .all_m2_mountpoints(CONFIG_DATASET) .into_iter() diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 587625fe7b..5ff6074249 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -17,7 +17,7 @@ //! state files that get generated as RSS executes: //! //! - /pool/int/UUID/config/rss-sled-plan.json (Sled Plan) -//! - /pool/int/UUID/config/rss-service-plan-v2.json (Service Plan) +//! - /pool/int/UUID/config/rss-service-plan-v3.json (Service Plan) //! - /pool/int/UUID/config/rss-plan-completed.marker (Plan Execution Complete) //! //! These phases are described below. 
As each phase completes, a corresponding @@ -115,6 +115,7 @@ use std::collections::{btree_map, BTreeMap, BTreeSet}; use std::collections::{HashMap, HashSet}; use std::iter; use std::net::{Ipv6Addr, SocketAddrV6}; +use std::time::Duration; use thiserror::Error; use uuid::Uuid; @@ -276,6 +277,125 @@ impl ServiceInner { ServiceInner { log } } + // Ensures that all storage for a particular generation is configured. + // + // This will either return: + // - Ok if the requests are all successful (where "successful" also + // includes any of the sleds having a storage configuration more recent than + // what we've requested), or + // - An error from attempting to configure storage on the underlying sleds + async fn ensure_storage_config_at_least( + &self, + plan: &ServicePlan, + ) -> Result<(), SetupServiceError> { + cancel_safe_futures::future::join_all_then_try( + plan.services.iter().map(|(sled_address, config)| async move { + self.initialize_storage_on_sled( + *sled_address, + SledAgentTypes::OmicronPhysicalDisksConfig { + generation: config.disks.generation, + disks: config + .disks + .disks + .iter() + .map(|disk| { + SledAgentTypes::OmicronPhysicalDiskConfig { + identity: disk.identity.clone(), + id: disk.id, + pool_id: disk.pool_id, + } + }) + .collect(), + }, + ) + .await + }), + ) + .await?; + Ok(()) + } + + /// Requests that the specified sled configure storage as described + /// by `storage_config`. + /// + /// This function succeeds if either the configuration is supplied, or if + /// the configuration on the target sled is newer than what we're supplying. + // This function shares a lot of implementation details with + // [Self::initialize_zones_on_sled]. Although it has a different meaning, + // the usage (and expectations around generation numbers) are similar. + async fn initialize_storage_on_sled( + &self, + sled_address: SocketAddrV6, + storage_config: SledAgentTypes::OmicronPhysicalDisksConfig, + ) -> Result<(), SetupServiceError> { + let dur = std::time::Duration::from_secs(60); + let client = reqwest::ClientBuilder::new() + .connect_timeout(dur) + .build() + .map_err(SetupServiceError::HttpClient)?; + let log = self.log.new(o!("sled_address" => sled_address.to_string())); + let client = SledAgentClient::new_with_client( + &format!("http://{}", sled_address), + client, + log.clone(), + ); + + let storage_put = || async { + info!( + log, + "attempting to set up sled's storage: {:?}", storage_config, + ); + let result = client + .omicron_physical_disks_put(&storage_config.clone()) + .await; + let Err(error) = result else { + return Ok::< + (), + BackoffError>, + >(()); + }; + + if let sled_agent_client::Error::ErrorResponse(response) = &error { + if response.status() == http::StatusCode::CONFLICT { + warn!( + log, + "ignoring attempt to initialize storage because \ + the server seems to be newer"; + "attempted_generation" => i64::from(&storage_config.generation), + "req_id" => &response.request_id, + "server_message" => &response.message, + ); + + // If we attempt to initialize storage at generation X, and + // the server refuses because it's at some generation newer + // than X, then we treat that as success. See the doc + // comment on this function. + return Ok(()); + } + } + + // TODO Many other codes here should not be retried. See + // omicron#4578. 
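To restate the rule the comments above describe, the retry loop only has three outcomes to distinguish. The sketch below is a hedged summary with hypothetical names (`PutOutcome`, `storage_config_settled`), not the generated sled-agent client; it assumes only what the surrounding code states: success and 409 CONFLICT both terminate the loop, everything else is retried as transient.

```rust
/// Hypothetical summary of one PUT attempt against a sled; these names are
/// illustrative, not the generated client types.
enum PutOutcome {
    /// The sled accepted and ledgered our disk config.
    Accepted,
    /// 409 CONFLICT: the sled already holds a config with a newer generation.
    NewerGenerationOnSled,
    /// Anything else (connection refused, 5xx, ...); retried with backoff.
    TransientError,
}

/// The decision rule implemented above: a conflict means the control plane
/// has already moved the sled past the generation we were trying to install,
/// which this caller counts as success.
fn storage_config_settled(outcome: &PutOutcome) -> bool {
    match outcome {
        PutOutcome::Accepted => true,
        PutOutcome::NewerGenerationOnSled => true,
        PutOutcome::TransientError => false,
    }
}
```

In the real loop this decision sits inside `retry_notify` with the aggressive internal-service retry policy, as the hunk shows.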
+ return Err(BackoffError::transient(error)); + }; + let log_failure = |error, delay| { + warn!( + log, + "failed to initialize Omicron storage"; + "error" => #%error, + "retry_after" => ?delay, + ); + }; + retry_notify( + retry_policy_internal_service_aggressive(), + storage_put, + log_failure, + ) + .await?; + + Ok(()) + } + /// Requests that the specified sled configure zones as described by /// `zones_config` /// @@ -345,7 +465,7 @@ impl ServiceInner { warn!( log, "failed to initialize Omicron zones"; - "error" => ?error, + "error" => #%error, "retry_after" => ?delay, ); }; @@ -564,8 +684,16 @@ impl ServiceInner { info!(self.log, "Nexus address: {}", nexus_address.to_string()); - let nexus_client = NexusClient::new( + const CLIENT_TIMEOUT: Duration = Duration::from_secs(60); + let client = reqwest::Client::builder() + .connect_timeout(CLIENT_TIMEOUT) + .timeout(CLIENT_TIMEOUT) + .build() + .map_err(SetupServiceError::HttpClient)?; + + let nexus_client = NexusClient::new_with_client( &format!("http://{}", nexus_address), + client, self.log.new(o!("component" => "NexusClient")), ); @@ -687,9 +815,44 @@ impl ServiceInner { info!(self.log, "rack_network_config: {:#?}", rack_network_config); + let physical_disks: Vec<_> = service_plan + .services + .iter() + .flat_map(|(addr, config)| { + let sled_id = id_map.get(addr).expect("Missing sled"); + config.disks.disks.iter().map(|config| { + NexusTypes::PhysicalDiskPutRequest { + id: config.id, + vendor: config.identity.vendor.clone(), + serial: config.identity.serial.clone(), + model: config.identity.model.clone(), + variant: NexusTypes::PhysicalDiskKind::U2, + sled_id: *sled_id, + } + }) + }) + .collect(); + + let zpools = service_plan + .services + .iter() + .flat_map(|(addr, config)| { + let sled_id = id_map.get(addr).expect("Missing sled"); + config.disks.disks.iter().map(|config| { + NexusTypes::ZpoolPutRequest { + id: config.pool_id, + physical_disk_id: config.id, + sled_id: *sled_id, + } + }) + }) + .collect(); + let request = NexusTypes::RackInitializationRequest { blueprint, services, + physical_disks, + zpools, datasets, internal_services_ip_pool_ranges, certs: config.external_certificates.clone(), @@ -789,7 +952,7 @@ impl ServiceInner { warn!( self.log, "Failed to initialize CockroachDB"; - "error" => ?error, + "error" => #%error, "retry_after" => ?delay ); }; @@ -839,7 +1002,7 @@ impl ServiceInner { )?; let marker_paths: Vec = storage_manager - .get_latest_resources() + .get_latest_disks() .await .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -1004,6 +1167,10 @@ impl ServiceInner { .await? }; + // Before we can ask for any services, we need to ensure that storage is + // operational. + self.ensure_storage_config_at_least(&service_plan).await?; + // Set up internal DNS services first and write the initial // DNS configuration to the internal DNS servers. 
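For the handoff records assembled above, the useful invariant is that every adopted disk yields exactly one `PhysicalDiskPutRequest` and one `ZpoolPutRequest`, linked by the ids chosen at planning time. A minimal sketch with local stand-in types (not the real `NexusTypes`; vendor/serial/model and the disk variant are elided):

```rust
use uuid::Uuid;

// Illustrative stand-ins for the handoff records built above.
struct PhysicalDiskPut { id: Uuid, sled_id: Uuid }
struct ZpoolPut { id: Uuid, physical_disk_id: Uuid, sled_id: Uuid }

// One adopted disk from a sled's plan: the ids chosen at planning time.
struct AdoptedDisk { id: Uuid, pool_id: Uuid }

/// Each adopted disk contributes exactly one physical-disk record and one
/// zpool record; the zpool points back at its disk via `physical_disk_id`.
fn handoff_records(
    sled_id: Uuid,
    disks: &[AdoptedDisk],
) -> (Vec<PhysicalDiskPut>, Vec<ZpoolPut>) {
    let physical_disks = disks
        .iter()
        .map(|d| PhysicalDiskPut { id: d.id, sled_id })
        .collect();
    let zpools = disks
        .iter()
        .map(|d| ZpoolPut { id: d.pool_id, physical_disk_id: d.id, sled_id })
        .collect();
    (physical_disks, zpools)
}
```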
let v1generator = OmicronZonesConfigGenerator::initial_version( @@ -1301,57 +1468,65 @@ mod test { params::OmicronZoneType, rack_setup::plan::service::{Plan as ServicePlan, SledInfo}, }; - use illumos_utils::zpool::ZpoolName; - use omicron_common::{address::Ipv6Subnet, api::external::Generation}; + use omicron_common::{ + address::{get_sled_address, Ipv6Subnet, SLED_PREFIX}, + api::external::{ByteCount, Generation}, + disk::DiskIdentity, + }; + use sled_agent_client::types as SledAgentTypes; + use uuid::Uuid; + + fn make_sled_info( + sled_id: Uuid, + subnet: Ipv6Subnet, + u2_count: usize, + ) -> SledInfo { + let sled_agent_address = get_sled_address(subnet); + SledInfo::new( + sled_id, + subnet, + sled_agent_address, + SledAgentTypes::Inventory { + sled_id, + sled_agent_address: sled_agent_address.to_string(), + sled_role: SledAgentTypes::SledRole::Scrimlet, + baseboard: SledAgentTypes::Baseboard::Unknown, + usable_hardware_threads: 32, + usable_physical_ram: ByteCount::from_gibibytes_u32(16), + reservoir_size: ByteCount::from_gibibytes_u32(0), + disks: (0..u2_count) + .map(|i| SledAgentTypes::InventoryDisk { + identity: DiskIdentity { + vendor: "test-manufacturer".to_string(), + serial: format!("test-{sled_id}-#{i}"), + model: "v1".to_string(), + }, + variant: SledAgentTypes::DiskVariant::U2, + slot: i.try_into().unwrap(), + }) + .collect(), + zpools: vec![], + }, + true, + ) + } fn make_test_service_plan() -> ServicePlan { let rss_config = crate::bootstrap::params::test_config(); let fake_sleds = vec![ - SledInfo::new( - "d4ba4bbe-8542-4907-bc8f-48df53eb5089".parse().unwrap(), - Ipv6Subnet::new("fd00:1122:3344:101::1".parse().unwrap()), - "[fd00:1122:3344:101::1]:80".parse().unwrap(), - vec![ - ZpoolName::new_internal( - "c5885278-0ae2-4f1e-9223-07f2ada818e1".parse().unwrap(), - ), - ZpoolName::new_internal( - "57465977-8275-43aa-a320-b6cd5cb20ca6".parse().unwrap(), - ), - ZpoolName::new_external( - "886f9fe7-bf70-4ddd-ae92-764dc3ed14ab".parse().unwrap(), - ), - ZpoolName::new_external( - "4c9061b1-345b-4985-8cbd-a2a899f15b68".parse().unwrap(), - ), - ZpoolName::new_external( - "b2bd488e-b187-42a0-b157-9ab0f70d91a8".parse().unwrap(), - ), - ], - true, + make_sled_info( + Uuid::new_v4(), + Ipv6Subnet::::new( + "fd00:1122:3344:101::1".parse().unwrap(), + ), + 5, ), - SledInfo::new( - "b4359dea-665d-41ca-a681-f55912f2d5d0".parse().unwrap(), - Ipv6Subnet::new("fd00:1122:3344:102::1".parse().unwrap()), - "[fd00:1122:3344:102::1]:80".parse().unwrap(), - vec![ - ZpoolName::new_internal( - "34d6b5e5-a09f-4e96-a599-fa306ce6d983".parse().unwrap(), - ), - ZpoolName::new_internal( - "e9b8d1ea-da29-4b61-a493-c0ed319098da".parse().unwrap(), - ), - ZpoolName::new_external( - "37f8e903-2adb-4613-b78c-198122c289f0".parse().unwrap(), - ), - ZpoolName::new_external( - "b50f787c-97b3-4b91-a5bd-99d11fc86fb8".parse().unwrap(), - ), - ZpoolName::new_external( - "809e50c8-930e-413a-950c-69a540b688e2".parse().unwrap(), - ), - ], - true, + make_sled_info( + Uuid::new_v4(), + Ipv6Subnet::::new( + "fd00:1122:3344:102::1".parse().unwrap(), + ), + 5, ), ]; let service_plan = diff --git a/sled-agent/src/server.rs b/sled-agent/src/server.rs index b93ad0721c..f702e4c67d 100644 --- a/sled-agent/src/server.rs +++ b/sled-agent/src/server.rs @@ -11,12 +11,10 @@ use crate::bootstrap::params::StartSledAgentRequest; use crate::long_running_tasks::LongRunningTaskHandles; use crate::nexus::NexusClientWithResolver; use crate::services::ServiceManager; -use crate::storage_monitor::UnderlayAccess; use 
internal_dns::resolver::Resolver; use slog::Logger; use std::net::SocketAddr; use std::sync::Arc; -use tokio::sync::oneshot; use uuid::Uuid; /// Packages up a [`SledAgent`], running the sled agent API under a Dropshot @@ -42,7 +40,6 @@ impl Server { request: StartSledAgentRequest, long_running_tasks_handles: LongRunningTaskHandles, services: ServiceManager, - underlay_available_tx: oneshot::Sender, ) -> Result { info!(log, "setting up sled agent server"); @@ -65,7 +62,6 @@ impl Server { request, services, long_running_tasks_handles, - underlay_available_tx, ) .await .map_err(|e| e.to_string())?; diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index e23cdf58b9..bfc0b91a71 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -94,6 +94,7 @@ use sled_hardware::underlay; use sled_hardware::SledMode; use sled_hardware_types::underlay::BOOTSTRAP_PREFIX; use sled_hardware_types::Baseboard; +use sled_storage::config::MountConfig; use sled_storage::dataset::{ DatasetKind, DatasetName, CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET, }; @@ -661,7 +662,7 @@ pub(crate) enum TimeSyncConfig { // Skips timesync unconditionally. Skip, // Fails timesync unconditionally. - #[cfg(test)] + #[cfg(all(test, target_os = "illumos"))] Fail, } @@ -734,12 +735,12 @@ impl ServiceManager { } } - #[cfg(test)] + #[cfg(all(test, target_os = "illumos"))] fn override_ledger_directory(&self, path: Utf8PathBuf) { self.inner.ledger_directory_override.set(path).unwrap(); } - #[cfg(test)] + #[cfg(all(test, target_os = "illumos"))] fn override_image_directory(&self, path: Utf8PathBuf) { self.inner.image_directory_override.set(path).unwrap(); } @@ -752,7 +753,7 @@ impl ServiceManager { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(SERVICES_LEDGER_FILENAME)]; } - let resources = self.inner.storage.get_latest_resources().await; + let resources = self.inner.storage.get_latest_disks().await; resources .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -764,7 +765,7 @@ impl ServiceManager { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(ZONES_LEDGER_FILENAME)]; } - let resources = self.inner.storage.get_latest_resources().await; + let resources = self.inner.storage.get_latest_disks().await; resources .all_m2_mountpoints(CONFIG_DATASET) .into_iter() @@ -1508,11 +1509,12 @@ impl ServiceManager { // If the boot disk exists, look for the image in the "install" dataset // there too. - if let Some((_, boot_zpool)) = - self.inner.storage.get_latest_resources().await.boot_disk() - { - zone_image_paths - .push(boot_zpool.dataset_mountpoint(INSTALL_DATASET)); + let all_disks = self.inner.storage.get_latest_disks().await; + if let Some((_, boot_zpool)) = all_disks.boot_disk() { + zone_image_paths.push(boot_zpool.dataset_mountpoint( + &all_disks.mount_config.root, + INSTALL_DATASET, + )); } let zone_type_str = match &request { @@ -2906,6 +2908,7 @@ impl ServiceManager { // storage configuration against the reality of the current sled. async fn start_omicron_zone( &self, + mount_config: &MountConfig, zone: &OmicronZoneConfig, time_is_synchronized: bool, all_u2_pools: &Vec, @@ -2924,7 +2927,11 @@ impl ServiceManager { // Ensure that this zone's storage is ready. 
let root = self - .validate_storage_and_pick_mountpoint(&zone, &all_u2_pools) + .validate_storage_and_pick_mountpoint( + mount_config, + &zone, + &all_u2_pools, + ) .await?; let config = OmicronZoneConfigLocal { zone: zone.clone(), root }; @@ -2953,6 +2960,7 @@ impl ServiceManager { // to start. async fn start_omicron_zones( &self, + mount_config: &MountConfig, requests: impl Iterator + Clone, time_is_synchronized: bool, all_u2_pools: &Vec, @@ -2969,6 +2977,7 @@ impl ServiceManager { let futures = requests.map(|zone| async move { self.start_omicron_zone( + mount_config, &zone, time_is_synchronized, all_u2_pools, @@ -3192,7 +3201,8 @@ impl ServiceManager { } // Collect information that's necessary to start new zones - let storage = self.inner.storage.get_latest_resources().await; + let storage = self.inner.storage.get_latest_disks().await; + let mount_config = &storage.mount_config; let all_u2_pools = storage.all_u2_zpools(); let time_is_synchronized = match self.timesync_get_locked(&existing_zones).await { @@ -3205,6 +3215,7 @@ impl ServiceManager { // Concurrently boot all new zones let StartZonesResult { new_zones, errors } = self .start_omicron_zones( + mount_config, zones_to_be_added, time_is_synchronized, &all_u2_pools, @@ -3305,6 +3316,7 @@ impl ServiceManager { // is valid. async fn validate_storage_and_pick_mountpoint( &self, + mount_config: &MountConfig, zone: &OmicronZoneConfig, all_u2_pools: &Vec, ) -> Result { @@ -3363,14 +3375,16 @@ impl ServiceManager { device: format!("zpool: {data_pool}"), }); } - data_pool.dataset_mountpoint(ZONE_DATASET) + data_pool.dataset_mountpoint(&mount_config.root, ZONE_DATASET) } else { // If the zone it not coupled to other datsets, we pick one // arbitrarily. let mut rng = rand::thread_rng(); all_u2_pools .choose(&mut rng) - .map(|pool| pool.dataset_mountpoint(ZONE_DATASET)) + .map(|pool| { + pool.dataset_mountpoint(&mount_config.root, ZONE_DATASET) + }) .ok_or_else(|| Error::U2NotFound)? 
.clone() }; @@ -3477,7 +3491,7 @@ impl ServiceManager { let skip_timesync = match &self.inner.time_sync_config { TimeSyncConfig::Normal => false, TimeSyncConfig::Skip => true, - #[cfg(test)] + #[cfg(all(test, target_os = "illumos"))] TimeSyncConfig::Fail => { info!(self.inner.log, "Configured to fail timesync checks"); return Err(Error::TimeNotSynchronized); @@ -4128,10 +4142,9 @@ impl ServiceManager { } } -#[cfg(test)] +#[cfg(all(test, target_os = "illumos"))] mod test { use super::*; - use illumos_utils::zpool::ZpoolName; use illumos_utils::{ dladm::{ Etherstub, MockDladm, BOOTSTRAP_ETHERSTUB_NAME, @@ -4140,9 +4153,8 @@ mod test { svc, zone::MockZones, }; - use sled_storage::disk::{RawDisk, SyntheticDisk}; - use sled_storage::manager::{FakeStorageManager, StorageHandle}; + use sled_storage::manager_test_harness::StorageManagerTestHarness; use std::net::{Ipv6Addr, SocketAddrV6}; use std::os::unix::process::ExitStatusExt; use uuid::Uuid; @@ -4366,18 +4378,21 @@ mod test { ) -> Result<(), Error> { let zone_prefix = format!("oxz_{}", zone_type.zone_type_str()); let _expectations = expect_new_service(&zone_prefix); - mgr.ensure_all_omicron_zones_persistent( - OmicronZonesConfig { - generation, - zones: vec![OmicronZoneConfig { - id, - underlay_address: Ipv6Addr::LOCALHOST, - zone_type, - }], - }, - Some(&tmp_dir), - ) - .await + let r = mgr + .ensure_all_omicron_zones_persistent( + OmicronZonesConfig { + generation, + zones: vec![OmicronZoneConfig { + id, + underlay_address: Ipv6Addr::LOCALHOST, + zone_type, + }], + }, + Some(&tmp_dir), + ) + .await; + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + r } // Prepare to call "ensure" for a service which already exists. We should @@ -4460,31 +4475,25 @@ mod test { } } - async fn setup_storage() -> StorageHandle { - let (manager, handle) = FakeStorageManager::new(); - - // Spawn the storage manager as done by sled-agent - tokio::spawn(async move { - manager.run().await; - }); - - let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); - let internal_disk: RawDisk = - SyntheticDisk::new(internal_zpool_name, 0).into(); - handle.upsert_disk(internal_disk).await; - let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let external_disk: RawDisk = - SyntheticDisk::new(external_zpool_name, 1).into(); - handle.upsert_disk(external_disk).await; - - handle + async fn setup_storage(log: &Logger) -> StorageManagerTestHarness { + let mut harness = StorageManagerTestHarness::new(&log).await; + let raw_disks = + harness.add_vdevs(&["u2_test.vdev", "m2_test.vdev"]).await; + harness.handle().key_manager_ready().await; + let config = harness.make_config(1, &raw_disks); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Failed to ensure disks"); + assert!(!result.has_error(), "{:?}", result); + harness } - #[derive(Clone)] struct LedgerTestHelper<'a> { log: slog::Logger, ddmd_client: DdmAdminClient, - storage_handle: StorageHandle, + storage_test_harness: StorageManagerTestHarness, zone_bundler: ZoneBundler, test_config: &'a TestConfig, } @@ -4495,41 +4504,45 @@ mod test { test_config: &'a TestConfig, ) -> LedgerTestHelper { let ddmd_client = DdmAdminClient::localhost(&log).unwrap(); - let storage_handle = setup_storage().await; + let storage_test_harness = setup_storage(&log).await; let zone_bundler = ZoneBundler::new( log.clone(), - storage_handle.clone(), + storage_test_harness.handle().clone(), Default::default(), ); LedgerTestHelper { log, ddmd_client, - storage_handle, + 
storage_test_harness, zone_bundler, test_config, } } - fn new_service_manager(self) -> ServiceManager { + async fn cleanup(&mut self) { + self.storage_test_harness.cleanup().await; + } + + fn new_service_manager(&self) -> ServiceManager { self.new_service_manager_with_timesync(TimeSyncConfig::Skip) } fn new_service_manager_with_timesync( - self, + &self, time_sync_config: TimeSyncConfig, ) -> ServiceManager { let log = &self.log; let mgr = ServiceManager::new( log, - self.ddmd_client, + self.ddmd_client.clone(), make_bootstrap_networking_config(), SledMode::Auto, time_sync_config, SidecarRevision::Physical("rev-test".to_string()), vec![], - self.storage_handle, - self.zone_bundler, + self.storage_test_harness.handle().clone(), + self.zone_bundler.clone(), ); self.test_config.override_paths(&mgr); mgr @@ -4563,7 +4576,7 @@ mod test { let logctx = omicron_test_utils::dev::test_setup_log("test_ensure_service"); let test_config = TestConfig::new().await; - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; let mgr = helper.new_service_manager(); LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); @@ -4592,6 +4605,7 @@ mod test { drop_service_manager(mgr); + helper.cleanup().await; logctx.cleanup_successful(); } @@ -4601,7 +4615,7 @@ mod test { "test_ensure_service_before_timesync", ); let test_config = TestConfig::new().await; - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; let mgr = @@ -4666,6 +4680,7 @@ mod test { .unwrap(); drop_service_manager(mgr); + helper.cleanup().await; logctx.cleanup_successful(); } @@ -4675,7 +4690,7 @@ mod test { "test_ensure_service_which_already_exists", ); let test_config = TestConfig::new().await; - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; let mgr = helper.new_service_manager(); LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); @@ -4694,6 +4709,7 @@ mod test { drop_service_manager(mgr); + helper.cleanup().await; logctx.cleanup_successful(); } @@ -4703,12 +4719,12 @@ mod test { "test_services_are_recreated_on_reboot", ); let test_config = TestConfig::new().await; - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; // First, spin up a ServiceManager, create a new zone, and then tear // down the ServiceManager. - let mgr = helper.clone().new_service_manager(); + let mgr = helper.new_service_manager(); LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); let v2 = Generation::new().next(); @@ -4727,6 +4743,7 @@ mod test { let _expectations = expect_new_service(EXPECTED_ZONE_NAME_PREFIX); let mgr = helper.new_service_manager(); LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let found = mgr.omicron_zones_list().await.expect("failed to list zones"); @@ -4736,6 +4753,7 @@ mod test { drop_service_manager(mgr); + helper.cleanup().await; logctx.cleanup_successful(); } @@ -4745,12 +4763,12 @@ mod test { "test_services_do_not_persist_without_config", ); let test_config = TestConfig::new().await; - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; // First, spin up a ServiceManager, create a new zone, and then tear // down the ServiceManager. 
- let mgr = helper.clone().new_service_manager(); + let mgr = helper.new_service_manager(); LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); let v1 = Generation::new(); @@ -4783,6 +4801,7 @@ mod test { drop_service_manager(mgr); + helper.cleanup().await; logctx.cleanup_successful(); } @@ -4792,7 +4811,7 @@ mod test { let logctx = omicron_test_utils::dev::test_setup_log("test_bad_generations"); let test_config = TestConfig::new().await; - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; let mgr = helper.new_service_manager(); LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); @@ -4900,6 +4919,8 @@ mod test { drop_service_manager(mgr); + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + helper.cleanup().await; logctx.cleanup_successful(); } @@ -4921,9 +4942,9 @@ mod test { .expect("failed to copy example old-format services ledger into place"); // Now start the service manager. - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; - let mgr = helper.clone().new_service_manager(); + let mgr = helper.new_service_manager(); LedgerTestHelper::sled_agent_started(&logctx.log, &test_config, &mgr); // Trigger the migration code. (Yes, it's hokey that we create this @@ -4964,6 +4985,7 @@ mod test { assert_eq!(found, expected_config); drop_service_manager(mgr); + helper.cleanup().await; logctx.cleanup_successful(); } @@ -4973,7 +4995,7 @@ mod test { "test_old_ledger_migration_bad", ); let test_config = TestConfig::new().await; - let helper = + let mut helper = LedgerTestHelper::new(logctx.log.clone(), &test_config).await; // Before we start things, stuff a broken ledger into place. For this @@ -5001,6 +5023,7 @@ mod test { format!("{:#}", error) ); + helper.cleanup().await; logctx.cleanup_successful(); } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index c3c92eb6fe..7d0d513a14 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -11,7 +11,7 @@ use crate::params::{ DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, Inventory, - OmicronZonesConfig, VpcFirewallRulesEnsureBody, + OmicronPhysicalDisksConfig, OmicronZonesConfig, VpcFirewallRulesEnsureBody, }; use dropshot::endpoint; use dropshot::ApiDescription; @@ -31,6 +31,7 @@ use omicron_common::api::internal::shared::RackNetworkConfig; use omicron_common::api::internal::shared::SwitchPorts; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use sled_storage::resources::DisksManagementResult; use std::net::{Ipv4Addr, Ipv6Addr}; use std::sync::Arc; use uuid::Uuid; @@ -60,6 +61,8 @@ pub fn api() -> SledApiDescription { api.register(read_network_bootstore_config)?; api.register(write_network_bootstore_config)?; api.register(inventory)?; + api.register(omicron_physical_disks_get)?; + api.register(omicron_physical_disks_put)?; api.register(omicron_zones_get)?; api.register(omicron_zones_put)?; @@ -441,6 +444,31 @@ async fn inventory( )) } +#[endpoint { + method = PUT, + path = "/omicron-physical-disks", +}] +async fn omicron_physical_disks_put( + rqctx: RequestContext>, + body: TypedBody, +) -> Result, HttpError> { + let sa = rqctx.context(); + let body_args = body.into_inner(); + let result = sa.omicron_physical_disks_ensure(body_args).await?; + Ok(HttpResponseOk(result)) +} + +#[endpoint { + method = 
GET, + path = "/omicron-physical-disks", +}] +async fn omicron_physical_disks_get( + rqctx: RequestContext>, +) -> Result, HttpError> { + let sa = rqctx.context(); + Ok(HttpResponseOk(sa.omicron_physical_disks_list().await?)) +} + #[endpoint { method = GET, path = "/omicron-zones", diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index dc770d179d..3a0ab2484a 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -31,6 +31,7 @@ use omicron_common::api::external::Vni; use omicron_common::backoff::{ retry_notify, retry_policy_internal_service_aggressive, BackoffError, }; +use omicron_common::disk::DiskIdentity; use omicron_common::FileKv; use slog::{info, Drain, Logger}; use std::collections::BTreeMap; @@ -163,20 +164,24 @@ impl Server { // Crucible dataset for each. This emulates the setup we expect to have // on the physical rack. for zpool in &config.storage.zpools { + let physical_disk_id = Uuid::new_v4(); let zpool_id = Uuid::new_v4(); let vendor = "synthetic-vendor".to_string(); let serial = format!("synthetic-serial-{zpool_id}"); let model = "synthetic-model".to_string(); sled_agent .create_external_physical_disk( - vendor.clone(), - serial.clone(), - model.clone(), + physical_disk_id, + DiskIdentity { + vendor: vendor.clone(), + serial: serial.clone(), + model: model.clone(), + }, ) .await; sled_agent - .create_zpool(zpool_id, vendor, serial, model, zpool.size) + .create_zpool(zpool_id, physical_disk_id, zpool.size) .await; let dataset_id = Uuid::new_v4(); let address = @@ -470,12 +475,14 @@ pub async fn run_standalone_server( }; let mut datasets = vec![]; - for zpool_id in server.sled_agent.get_zpools().await { + let physical_disks = server.sled_agent.get_all_physical_disks().await; + let zpools = server.sled_agent.get_zpools().await; + for zpool in &zpools { for (dataset_id, address) in - server.sled_agent.get_datasets(zpool_id).await + server.sled_agent.get_datasets(zpool.id).await { datasets.push(NexusTypes::DatasetCreateRequest { - zpool_id, + zpool_id: zpool.id, dataset_id, request: NexusTypes::DatasetPutRequest { address: address.to_string(), @@ -490,10 +497,11 @@ pub async fn run_standalone_server( None => vec![], }; + let disks = server.sled_agent.omicron_physical_disks_list().await?; let services = zones.iter().map(|z| z.to_nexus_service_req(config.id)).collect(); let mut sled_configs = BTreeMap::new(); - sled_configs.insert(config.id, SledConfig { zones }); + sled_configs.insert(config.id, SledConfig { disks, zones }); let rack_init_request = NexusTypes::RackInitializationRequest { blueprint: build_initial_blueprint_from_sled_configs( @@ -501,6 +509,8 @@ pub async fn run_standalone_server( internal_dns_version, ), services, + physical_disks, + zpools, datasets, internal_services_ip_pool_ranges, certs, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 1edde622a1..455c2988d3 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -15,13 +15,13 @@ use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, Inventory, - OmicronZonesConfig, SledRole, + OmicronPhysicalDisksConfig, OmicronZonesConfig, SledRole, }; use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; use anyhow::bail; use anyhow::Context; -use dropshot::HttpServer; +use dropshot::{HttpError, HttpServer}; use 
futures::lock::Mutex; use illumos_utils::opte::params::{ DeleteVirtualNetworkInterfaceHost, SetVirtualNetworkInterfaceHost, @@ -35,10 +35,12 @@ use omicron_common::api::internal::nexus::{ use omicron_common::api::internal::nexus::{ InstanceRuntimeState, VmmRuntimeState, }; +use omicron_common::disk::DiskIdentity; use propolis_client::{ types::VolumeConstructionRequest, Client as PropolisClient, }; use propolis_mock_server::Context as PropolisContext; +use sled_storage::resources::DisksManagementResult; use slog::Logger; use std::collections::{HashMap, HashSet}; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; @@ -156,7 +158,6 @@ impl SledAgent { )), storage: Mutex::new(Storage::new( id, - Arc::clone(&nexus_client), config.storage.ip, storage_log, )), @@ -521,19 +522,26 @@ impl SledAgent { /// Adds a Physical Disk to the simulated sled agent. pub async fn create_external_physical_disk( &self, - vendor: String, - serial: String, - model: String, + id: Uuid, + identity: DiskIdentity, ) { let variant = sled_hardware::DiskVariant::U2; self.storage .lock() .await - .insert_physical_disk(vendor, serial, model, variant) + .insert_physical_disk(id, identity, variant) .await; } - pub async fn get_zpools(&self) -> Vec { + pub async fn get_all_physical_disks( + &self, + ) -> Vec { + self.storage.lock().await.get_all_physical_disks() + } + + pub async fn get_zpools( + &self, + ) -> Vec { self.storage.lock().await.get_all_zpools() } @@ -548,15 +556,13 @@ impl SledAgent { pub async fn create_zpool( &self, id: Uuid, - vendor: String, - serial: String, - model: String, + physical_disk_id: Uuid, size: u64, ) { self.storage .lock() .await - .insert_zpool(id, vendor, serial, model, size) + .insert_zpool(id, physical_disk_id, size) .await; } @@ -780,9 +786,9 @@ impl SledAgent { .context("reservoir_size")?, disks: storage .physical_disks() - .iter() - .map(|(identity, info)| crate::params::InventoryDisk { - identity: identity.clone(), + .values() + .map(|info| crate::params::InventoryDisk { + identity: info.identity.clone(), variant: info.variant, slot: info.slot, }) @@ -800,6 +806,19 @@ impl SledAgent { }) } + pub async fn omicron_physical_disks_list( + &self, + ) -> Result { + self.storage.lock().await.omicron_physical_disks_list().await + } + + pub async fn omicron_physical_disks_ensure( + &self, + config: OmicronPhysicalDisksConfig, + ) -> Result { + self.storage.lock().await.omicron_physical_disks_ensure(config).await + } + pub async fn omicron_zones_list(&self) -> OmicronZonesConfig { self.fake_zones.lock().await.clone() } diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 8fb362c5b7..13c3da4fd0 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -8,7 +8,7 @@ //! than the representation of "virtual disks" which would be presented //! through Nexus' external API. 
-use crate::nexus::NexusClient; +use crate::params::OmicronPhysicalDisksConfig; use crate::sim::http_entrypoints_pantry::ExpectedDigest; use crate::sim::SledAgent; use anyhow::{self, bail, Result}; @@ -19,12 +19,11 @@ use crucible_agent_client::types::{ use dropshot::HandlerTaskMode; use dropshot::HttpError; use futures::lock::Mutex; -use nexus_client::types::{ - ByteCount, PhysicalDiskKind, PhysicalDiskPutRequest, ZpoolPutRequest, -}; use omicron_common::disk::DiskIdentity; use propolis_client::types::VolumeConstructionRequest; use sled_hardware::DiskVariant; +use sled_storage::resources::DiskManagementStatus; +use sled_storage::resources::DisksManagementResult; use slog::Logger; use std::collections::HashMap; use std::collections::HashSet; @@ -474,18 +473,21 @@ impl CrucibleServer { } pub(crate) struct PhysicalDisk { + pub(crate) identity: DiskIdentity, pub(crate) variant: DiskVariant, pub(crate) slot: i64, } pub(crate) struct Zpool { - datasets: HashMap, + id: Uuid, + physical_disk_id: Uuid, total_size: u64, + datasets: HashMap, } impl Zpool { - fn new(total_size: u64) -> Self { - Zpool { datasets: HashMap::new(), total_size } + fn new(id: Uuid, physical_disk_id: Uuid, total_size: u64) -> Self { + Zpool { id, physical_disk_id, total_size, datasets: HashMap::new() } } fn insert_dataset( @@ -541,9 +543,9 @@ impl Zpool { /// Simulated representation of all storage on a sled. pub struct Storage { sled_id: Uuid, - nexus_client: Arc, log: Logger, - physical_disks: HashMap, + config: Option, + physical_disks: HashMap, next_disk_slot: i64, zpools: HashMap, crucible_ip: IpAddr, @@ -551,16 +553,11 @@ pub struct Storage { } impl Storage { - pub fn new( - sled_id: Uuid, - nexus_client: Arc, - crucible_ip: IpAddr, - log: Logger, - ) -> Self { + pub fn new(sled_id: Uuid, crucible_ip: IpAddr, log: Logger) -> Self { Self { sled_id, - nexus_client, log, + config: None, physical_disks: HashMap::new(), next_disk_slot: 0, zpools: HashMap::new(), @@ -570,68 +567,70 @@ impl Storage { } /// Returns an immutable reference to all (currently known) physical disks - pub fn physical_disks(&self) -> &HashMap { + pub fn physical_disks(&self) -> &HashMap { &self.physical_disks } + pub async fn omicron_physical_disks_list( + &mut self, + ) -> Result { + let Some(config) = self.config.as_ref() else { + return Err(HttpError::for_not_found( + None, + "No control plane disks".into(), + )); + }; + Ok(config.clone()) + } + + pub async fn omicron_physical_disks_ensure( + &mut self, + config: OmicronPhysicalDisksConfig, + ) -> Result { + if let Some(stored_config) = self.config.as_ref() { + if stored_config.generation < config.generation { + return Err(HttpError::for_client_error( + None, + http::StatusCode::BAD_REQUEST, + "Generation number too old".to_string(), + )); + } + } + self.config.replace(config.clone()); + + Ok(DisksManagementResult { + status: config + .disks + .into_iter() + .map(|config| DiskManagementStatus { + identity: config.identity, + err: None, + }) + .collect(), + }) + } + pub async fn insert_physical_disk( &mut self, - vendor: String, - serial: String, - model: String, + id: Uuid, + identity: DiskIdentity, variant: DiskVariant, ) { - let identifier = DiskIdentity { - vendor: vendor.clone(), - serial: serial.clone(), - model: model.clone(), - }; let slot = self.next_disk_slot; self.next_disk_slot += 1; - self.physical_disks.insert(identifier, PhysicalDisk { variant, slot }); - - let variant = match variant { - DiskVariant::U2 => PhysicalDiskKind::U2, - DiskVariant::M2 => PhysicalDiskKind::M2, - }; - - // 
Notify Nexus - let request = PhysicalDiskPutRequest { - vendor, - serial, - model, - variant, - sled_id: self.sled_id, - }; - self.nexus_client - .physical_disk_put(&request) - .await - .expect("Failed to notify Nexus about new Physical Disk"); + self.physical_disks + .insert(id, PhysicalDisk { identity, variant, slot }); } - /// Adds a Zpool to the sled's simulated storage and notifies Nexus. + /// Adds a Zpool to the sled's simulated storage. pub async fn insert_zpool( &mut self, zpool_id: Uuid, - disk_vendor: String, - disk_serial: String, - disk_model: String, + disk_id: Uuid, size: u64, ) { // Update our local data - self.zpools.insert(zpool_id, Zpool::new(size)); - - // Notify Nexus - let request = ZpoolPutRequest { - size: ByteCount(size), - disk_vendor, - disk_serial, - disk_model, - }; - self.nexus_client - .zpool_put(&self.sled_id, &zpool_id, &request) - .await - .expect("Failed to notify Nexus about new Zpool"); + self.zpools.insert(zpool_id, Zpool::new(zpool_id, disk_id, size)); } /// Returns an immutable reference to all zpools @@ -661,8 +660,42 @@ impl Storage { dataset.address() } - pub fn get_all_zpools(&self) -> Vec { - self.zpools.keys().cloned().collect() + pub fn get_all_physical_disks( + &self, + ) -> Vec { + self.physical_disks + .iter() + .map(|(id, disk)| { + let variant = match disk.variant { + DiskVariant::U2 => { + nexus_client::types::PhysicalDiskKind::U2 + } + DiskVariant::M2 => { + nexus_client::types::PhysicalDiskKind::M2 + } + }; + + nexus_client::types::PhysicalDiskPutRequest { + id: *id, + vendor: disk.identity.vendor.clone(), + serial: disk.identity.serial.clone(), + model: disk.identity.model.clone(), + variant, + sled_id: self.sled_id, + } + }) + .collect() + } + + pub fn get_all_zpools(&self) -> Vec { + self.zpools + .values() + .map(|pool| nexus_client::types::ZpoolPutRequest { + id: pool.id, + sled_id: self.sled_id, + physical_disk_id: pool.physical_disk_id, + }) + .collect() } pub fn get_all_datasets(&self, zpool_id: Uuid) -> Vec<(Uuid, SocketAddr)> { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index cbda32bbe1..e42f708006 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -22,12 +22,11 @@ use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, Inventory, - OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRule, - ZoneBundleMetadata, Zpool, + OmicronPhysicalDisksConfig, OmicronZonesConfig, SledRole, TimeSync, + VpcFirewallRule, ZoneBundleMetadata, Zpool, }; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; -use crate::storage_monitor::UnderlayAccess; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::vmm_reservoir::{ReservoirMode, VmmReservoirManager}; use crate::zone_bundle; @@ -70,11 +69,11 @@ use sled_hardware::{underlay, HardwareManager}; use sled_hardware_types::underlay::BootstrapInterface; use sled_hardware_types::Baseboard; use sled_storage::manager::StorageHandle; +use sled_storage::resources::DisksManagementResult; use slog::Logger; use std::collections::BTreeMap; use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::sync::Arc; -use tokio::sync::oneshot; use uuid::Uuid; use illumos_utils::running_zone::ZoneBuilderFactory; @@ -161,8 +160,9 @@ pub enum Error { impl From for omicron_common::api::external::Error { fn from(err: Error) -> Self { match err { - // Service errors can 
convert themselves into the external error + // Some errors can convert themselves into the external error Error::Services(err) => err.into(), + Error::Storage(err) => err.into(), _ => omicron_common::api::external::Error::InternalError { internal_message: err.to_string(), }, @@ -342,7 +342,6 @@ impl SledAgent { request: StartSledAgentRequest, services: ServiceManager, long_running_task_handles: LongRunningTaskHandles, - underlay_available_tx: oneshot::Sender, ) -> Result { // Pass the "parent_log" to all subcomponents that want to set their own // "component" value. @@ -357,7 +356,7 @@ impl SledAgent { let storage_manager = &long_running_task_handles.storage_manager; let boot_disk = storage_manager - .get_latest_resources() + .get_latest_disks() .await .boot_disk() .ok_or_else(|| Error::BootDiskNotFound)?; @@ -461,16 +460,6 @@ impl SledAgent { *sled_address.ip(), ); - // Inform the `StorageMonitor` that the underlay is available so that - // it can try to contact nexus. - underlay_available_tx - .send(UnderlayAccess { - nexus_client: nexus_client.clone(), - sled_id: request.body.id, - }) - .map_err(|_| ()) - .expect("Failed to send to StorageMonitor"); - // Configure the VMM reservoir as either a percentage of DRAM or as an // exact size in MiB. let reservoir_mode = ReservoirMode::from_config( @@ -802,6 +791,28 @@ impl SledAgent { self.inner.zone_bundler.cleanup().await.map_err(Error::from) } + /// Requests the set of physical disks currently managed by the Sled Agent. + /// + /// This should be contrasted by the set of disks in the inventory, which + /// may contain a slightly different set, if certain disks are not expected + /// to be in-use by the broader control plane. + pub async fn omicron_physical_disks_list( + &self, + ) -> Result { + Ok(self.storage().omicron_physical_disks_list().await?) + } + + /// Ensures that the specific set of Omicron Physical Disks are running + /// on this sled, and that no other disks are being used by the control + /// plane (with the exception of M.2s, which are always automatically + /// in-use). + pub async fn omicron_physical_disks_ensure( + &self, + config: OmicronPhysicalDisksConfig, + ) -> Result { + Ok(self.storage().omicron_physical_disks_ensure(config).await?) 
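
To make the contract of these two methods concrete, here is a hedged sketch of the configuration a caller would send; it is not code from the patch, and the type paths assume the `sled-storage` definitions introduced later in this diff (which the sled-agent `params` module appears to re-export).

```rust
use omicron_common::api::external::Generation;
use omicron_common::disk::DiskIdentity;
use sled_storage::disk::{OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig};
use uuid::Uuid;

/// Builds a one-disk configuration of the kind RSS or Nexus would send.
/// The IDs are illustrative; the generation must be at least as new as the
/// one already ledgered on the sled, or the request is rejected.
fn example_disks_config() -> OmicronPhysicalDisksConfig {
    OmicronPhysicalDisksConfig {
        generation: Generation::new(),
        disks: vec![OmicronPhysicalDiskConfig {
            identity: DiskIdentity {
                vendor: "synthetic-vendor".to_string(),
                serial: "synthetic-serial-example".to_string(),
                model: "synthetic-model".to_string(),
            },
            id: Uuid::new_v4(),      // control-plane ID for the physical disk
            pool_id: Uuid::new_v4(), // zpool UUID the sled should format/import
        }],
    }
}

// A caller would then do something like:
//
//     let result =
//         sled_agent.omicron_physical_disks_ensure(example_disks_config()).await?;
//     for status in &result.status {
//         // Each DiskManagementStatus carries the disk identity plus an
//         // optional per-disk error.
//         println!("{:?}: err={:?}", status.identity, status.err);
//     }
```
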
+ } + /// List the Omicron zone configuration that's currently running pub async fn omicron_zones_list( &self, @@ -849,7 +860,7 @@ impl SledAgent { pub async fn zpools_get(&self) -> Vec { self.inner .storage - .get_latest_resources() + .get_latest_disks() .await .get_all_zpools() .into_iter() @@ -1105,17 +1116,33 @@ impl SledAgent { let mut disks = vec![]; let mut zpools = vec![]; - for (identity, (disk, pool)) in - self.storage().get_latest_resources().await.disks().iter() - { + let all_disks = self.storage().get_latest_disks().await; + for (identity, variant, slot) in all_disks.iter_all() { disks.push(crate::params::InventoryDisk { identity: identity.clone(), - variant: disk.variant(), - slot: disk.slot(), + variant, + slot, }); + } + for zpool in all_disks.all_u2_zpools() { + let info = + match illumos_utils::zpool::Zpool::get_info(&zpool.to_string()) + { + Ok(info) => info, + Err(err) => { + warn!( + self.log, + "Failed to access zpool info"; + "zpool" => %zpool, + "err" => %err + ); + continue; + } + }; + zpools.push(crate::params::InventoryZpool { - id: pool.name.id(), - total_size: ByteCount::try_from(pool.info.size())?, + id: zpool.id(), + total_size: ByteCount::try_from(info.size())?, }); } diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 0c9b287396..8cb63e31f8 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -3,67 +3,19 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! A task that listens for storage events from [`sled_storage::manager::StorageManager`] -//! and dispatches them to other parst of the bootstrap agent and sled agent +//! and dispatches them to other parts of the bootstrap agent and sled agent //! code. use crate::dump_setup::DumpSetup; -use crate::nexus::{ConvertInto, NexusClientWithResolver}; -use derive_more::From; -use futures::stream::FuturesOrdered; -use futures::FutureExt; -use futures::StreamExt; -use nexus_client::types::PhysicalDiskDeleteRequest; -use nexus_client::types::PhysicalDiskPutRequest; -use nexus_client::types::ZpoolPutRequest; -use omicron_common::api::external::ByteCount; -use omicron_common::backoff; -use omicron_common::disk::DiskIdentity; +use sled_storage::config::MountConfig; use sled_storage::manager::StorageHandle; -use sled_storage::pool::Pool; -use sled_storage::resources::StorageResources; +use sled_storage::resources::AllDisks; use slog::Logger; -use std::fmt::Debug; -use std::pin::Pin; -use tokio::sync::oneshot; -use uuid::Uuid; - -#[derive(From, Clone, Debug)] -enum NexusDiskRequest { - Put(PhysicalDiskPutRequest), - Delete(PhysicalDiskDeleteRequest), -} - -/// Describes the access to the underlay used by the StorageManager. 
-#[derive(Clone)] -pub struct UnderlayAccess { - pub nexus_client: NexusClientWithResolver, - pub sled_id: Uuid, -} - -impl Debug for UnderlayAccess { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("UnderlayAccess") - .field("sled_id", &self.sled_id) - .finish() - } -} pub struct StorageMonitor { log: Logger, storage_manager: StorageHandle, - // Receive a onetime notification that the underlay is available - underlay_available_rx: oneshot::Receiver, - - // A cached copy of the `StorageResources` from the last update - storage_resources: StorageResources, - - // Ability to access the underlay network - underlay: Option, - - // A queue for sending nexus notifications in order - nexus_notifications: FuturesOrdered, - // Invokes dumpadm(8) and savecore(8) when new disks are encountered dump_setup: DumpSetup, } @@ -71,24 +23,12 @@ pub struct StorageMonitor { impl StorageMonitor { pub fn new( log: &Logger, + mount_config: MountConfig, storage_manager: StorageHandle, - ) -> (StorageMonitor, oneshot::Sender) { - let (underlay_available_tx, underlay_available_rx) = oneshot::channel(); - let storage_resources = StorageResources::default(); - let dump_setup = DumpSetup::new(&log); + ) -> StorageMonitor { + let dump_setup = DumpSetup::new(&log, mount_config); let log = log.new(o!("component" => "StorageMonitor")); - ( - StorageMonitor { - log, - storage_manager, - underlay_available_rx, - storage_resources, - underlay: None, - nexus_notifications: FuturesOrdered::new(), - dump_setup, - }, - underlay_available_tx, - ) + StorageMonitor { log, storage_manager, dump_setup } } /// Run the main receive loop of the `StorageMonitor` @@ -97,277 +37,23 @@ impl StorageMonitor { pub async fn run(mut self) { loop { tokio::select! { - res = self.nexus_notifications.next(), - if !self.nexus_notifications.is_empty() => - { - match res { - Some(Ok(s)) => { - info!(self.log, "Nexus notification complete: {s}"); - } - e => error!(self.log, "Nexus notification error: {e:?}") - } - } - resources = self.storage_manager.wait_for_changes() => { + disks = self.storage_manager.wait_for_changes() => { info!( self.log, "Received storage manager update"; - "resources" => ?resources + "disks" => ?disks ); - self.handle_resource_update(resources).await; + self.handle_resource_update(disks).await; } - Ok(underlay) = &mut self.underlay_available_rx, - if self.underlay.is_none() => - { - let sled_id = underlay.sled_id; - info!( - self.log, - "Underlay Available"; "sled_id" => %sled_id - ); - self.underlay = Some(underlay); - self.notify_nexus_about_existing_resources(sled_id).await; - } - } - } - } - - /// When the underlay becomes available, we need to notify nexus about any - /// discovered disks and pools, since we don't attempt to notify until there - /// is an underlay available. - async fn notify_nexus_about_existing_resources(&mut self, sled_id: Uuid) { - let current = StorageResources::default(); - let updated = &self.storage_resources; - let nexus_updates = - compute_resource_diffs(&self.log, &sled_id, ¤t, updated); - for put in nexus_updates.disk_puts { - self.physical_disk_notify(put.into()).await; - } - for (pool, put) in nexus_updates.zpool_puts { - self.add_zpool_notify(pool, put).await; - } - } - - async fn handle_resource_update( - &mut self, - updated_resources: StorageResources, - ) { - // If the underlay isn't available, we only record the changes. Nexus - // isn't yet reachable to notify. 
- if self.underlay.is_some() { - let nexus_updates = compute_resource_diffs( - &self.log, - &self.underlay.as_ref().unwrap().sled_id, - &self.storage_resources, - &updated_resources, - ); - - for put in nexus_updates.disk_puts { - self.physical_disk_notify(put.into()).await; - } - for del in nexus_updates.disk_deletes { - self.physical_disk_notify(del.into()).await; - } - for (pool, put) in nexus_updates.zpool_puts { - self.add_zpool_notify(pool, put).await; } } - self.dump_setup.update_dumpdev_setup(updated_resources.disks()).await; - - // Save the updated `StorageResources` - self.storage_resources = updated_resources; - } - - // Adds a "notification to nexus" to `self.nexus_notifications`, informing it - // about the addition/removal of a physical disk to this sled. - async fn physical_disk_notify(&mut self, disk: NexusDiskRequest) { - let underlay = self.underlay.as_ref().unwrap().clone(); - let disk2 = disk.clone(); - let notify_nexus = move || { - let underlay = underlay.clone(); - let disk = disk.clone(); - async move { - let nexus_client = underlay.nexus_client.client().clone(); - - match &disk { - NexusDiskRequest::Put(request) => { - nexus_client - .physical_disk_put(&request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - } - NexusDiskRequest::Delete(request) => { - nexus_client - .physical_disk_delete(&request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - } - } - let msg = format!("{:?}", disk); - Ok(msg) - } - }; - - let log = self.log.clone(); - // This notification is often invoked before Nexus has started - // running, so avoid flagging any errors as concerning until some - // time has passed. - let log_post_failure = move |err, call_count, total_duration| { - if call_count == 0 { - info!(log, "failed to notify nexus about {disk2:?}"; - "err" => ?err - ); - } else if total_duration > std::time::Duration::from_secs(30) { - warn!(log, "failed to notify nexus about {disk2:?}"; - "err" => ?err, - "total duration" => ?total_duration); - } - }; - self.nexus_notifications.push_back( - backoff::retry_notify_ext( - backoff::retry_policy_internal_service_aggressive(), - notify_nexus, - log_post_failure, - ) - .boxed(), - ); } - // Adds a "notification to nexus" to `nexus_notifications`, - // informing it about the addition of `pool_id` to this sled. 
- async fn add_zpool_notify( - &mut self, - pool: Pool, - zpool_request: ZpoolPutRequest, - ) { - let pool_id = pool.name.id(); - let underlay = self.underlay.as_ref().unwrap().clone(); - - let notify_nexus = move || { - let underlay = underlay.clone(); - let zpool_request = zpool_request.clone(); - async move { - let sled_id = underlay.sled_id; - let nexus_client = underlay.nexus_client.client().clone(); - nexus_client - .zpool_put(&sled_id, &pool_id, &zpool_request) - .await - .map_err(|e| { - backoff::BackoffError::transient(e.to_string()) - })?; - let msg = format!("{:?}", zpool_request); - Ok(msg) - } - }; - - let log = self.log.clone(); - let name = pool.name.clone(); - let disk = pool.parent.clone(); - let log_post_failure = move |err, call_count, total_duration| { - if call_count == 0 { - info!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; - "err" => ?err); - } else if total_duration > std::time::Duration::from_secs(30) { - warn!(log, "failed to notify nexus about a new pool {name} on disk {disk:?}"; - "err" => ?err, - "total duration" => ?total_duration); - } - }; - self.nexus_notifications.push_back( - backoff::retry_notify_ext( - backoff::retry_policy_internal_service_aggressive(), - notify_nexus, - log_post_failure, + async fn handle_resource_update(&mut self, updated_disks: AllDisks) { + self.dump_setup + .update_dumpdev_setup( + updated_disks.iter_managed().map(|(_id, disk)| disk), ) - .boxed(), - ); + .await; } } - -// The type of a future which is used to send a notification to Nexus. -type NotifyFut = - Pin> + Send>>; - -struct NexusUpdates { - disk_puts: Vec, - disk_deletes: Vec, - zpool_puts: Vec<(Pool, ZpoolPutRequest)>, -} - -fn compute_resource_diffs( - log: &Logger, - sled_id: &Uuid, - current: &StorageResources, - updated: &StorageResources, -) -> NexusUpdates { - let mut disk_puts = vec![]; - let mut disk_deletes = vec![]; - let mut zpool_puts = vec![]; - - let mut put_pool = |disk_id: &DiskIdentity, updated_pool: &Pool| { - match ByteCount::try_from(updated_pool.info.size()) { - Ok(size) => zpool_puts.push(( - updated_pool.clone(), - ZpoolPutRequest { - size: size.into(), - disk_model: disk_id.model.clone(), - disk_serial: disk_id.serial.clone(), - disk_vendor: disk_id.vendor.clone(), - }, - )), - Err(err) => { - error!( - log, - "Error parsing pool size"; - "name" => updated_pool.name.to_string(), - "err" => ?err); - } - } - }; - - // Diff the existing resources with the update to see what has changed - // This loop finds disks and pools that were modified or deleted - for (disk_id, (disk, pool)) in current.disks().iter() { - match updated.disks().get(disk_id) { - Some((updated_disk, updated_pool)) => { - if disk != updated_disk { - disk_puts.push(PhysicalDiskPutRequest { - sled_id: *sled_id, - model: disk_id.model.clone(), - serial: disk_id.serial.clone(), - vendor: disk_id.vendor.clone(), - variant: updated_disk.variant().convert(), - }); - } - if pool != updated_pool { - put_pool(disk_id, updated_pool); - } - } - None => disk_deletes.push(PhysicalDiskDeleteRequest { - model: disk_id.model.clone(), - serial: disk_id.serial.clone(), - vendor: disk_id.vendor.clone(), - sled_id: *sled_id, - }), - } - } - - // Diff the existing resources with the update to see what has changed - // This loop finds new disks and pools - for (disk_id, (updated_disk, updated_pool)) in updated.disks().iter() { - if !current.disks().contains_key(disk_id) { - disk_puts.push(PhysicalDiskPutRequest { - sled_id: *sled_id, - model: disk_id.model.clone(), - serial: 
disk_id.serial.clone(), - vendor: disk_id.vendor.clone(), - variant: updated_disk.variant().convert(), - }); - put_pool(disk_id, updated_pool); - } - } - - NexusUpdates { disk_puts, disk_deletes, zpool_puts } -} diff --git a/sled-agent/src/vmm_reservoir.rs b/sled-agent/src/vmm_reservoir.rs index b16286f5f5..caa1d88254 100644 --- a/sled-agent/src/vmm_reservoir.rs +++ b/sled-agent/src/vmm_reservoir.rs @@ -120,7 +120,8 @@ impl VmmReservoirManagerHandle { rx.await.map_err(|_| Error::ReplySenderDropped)? } - #[cfg(test)] + /// TODO: We should be able run to tests in VMs that can use the real VmmReservoir + #[cfg(all(test, target_os = "illumos"))] pub fn stub_for_test() -> Self { let (tx, _) = flume::bounded(1); let (size_updated_tx, _) = broadcast::channel(1); diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 7b0d9b8071..57d3cb1049 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -255,7 +255,7 @@ impl Inner { // that can exist but do not, i.e., those whose parent datasets already // exist; and returns those. async fn bundle_directories(&self) -> Vec { - let resources = self.storage_handle.get_latest_resources().await; + let resources = self.storage_handle.get_latest_disks().await; let expected = resources.all_zone_bundle_directories(); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { @@ -263,6 +263,7 @@ impl Inner { out.push(each); } } + out.sort(); out } } @@ -427,7 +428,7 @@ impl ZoneBundler { ) -> Result { let inner = self.inner.lock().await; let storage_dirs = inner.bundle_directories().await; - let resources = inner.storage_handle.get_latest_resources().await; + let resources = inner.storage_handle.get_latest_disks().await; let extra_log_dirs = resources .all_u2_mountpoints(U2_DEBUG_DATASET) .into_iter() @@ -2168,26 +2169,22 @@ mod illumos_tests { use super::StorageLimit; use super::Utf8Path; use super::Utf8PathBuf; - use super::Uuid; use super::ZoneBundleCause; use super::ZoneBundleId; use super::ZoneBundleInfo; use super::ZoneBundleMetadata; use super::ZoneBundler; - use super::ZFS; use anyhow::Context; use chrono::DateTime; use chrono::TimeZone; use chrono::Timelike; use chrono::Utc; - use illumos_utils::zpool::ZpoolName; use rand::RngCore; - use sled_storage::disk::RawDisk; - use sled_storage::disk::SyntheticDisk; - use sled_storage::manager::{FakeStorageManager, StorageHandle}; + use sled_storage::manager_test_harness::StorageManagerTestHarness; use slog::Drain; use slog::Logger; - use tokio::process::Command; + use std::sync::Arc; + use tokio::sync::Mutex; /// An iterator that returns the date of consecutive days beginning with 1st /// January 2020. The time portion of each returned date will be fixed at @@ -2239,77 +2236,58 @@ mod illumos_tests { assert!(zfs_quota(&path).await.is_err()); } - struct CleanupTestContext { + struct CleanupTestContextInner { resource_wrapper: ResourceWrapper, context: CleanupContext, bundler: ZoneBundler, } + // Practically, we only expect one thread to "own" this context at a time. + // However, with the "run_test_with_zfs_dataset", it's hard to pass an + // async function as a parameter ("test") that acts on a mutable reference + // without some fancy HRTB shenanigans. + // + // Reader: If you think you can pass a "&mut CleanupTestContextInner" + // there instead of an "Arc>", I welcome you to try! 
+ #[derive(Clone)] + struct CleanupTestContext { + ctx: Arc>, + } + // A wrapper around `StorageResources`, that automatically creates dummy // directories in the provided test locations and removes them on drop. // - // I'd much prefer this to be done in $TEMPDIR. However, `StorageResources` - // is difficult to mock out or modify in such a way that the underlying - // dataset locations can be controlled. - // - // This creates completely BS disks, and fake names for the zpools on them. - // Those pools are _supposed_ to live at directories like: - // - // `/pool/int/` - // // They don't exist when you just do `StorageResources::new_for_test()`. // This type creates the datasets at the expected mountpoints, backed by the // ramdisk, and removes them on drop. This is basically a tempdir-like // system, that creates the directories implied by the `StorageResources` // expected disk structure. struct ResourceWrapper { - storage_handle: StorageHandle, + storage_test_harness: StorageManagerTestHarness, dirs: Vec, } - async fn setup_storage() -> StorageHandle { - let (manager, handle) = FakeStorageManager::new(); - - // Spawn the storage manager as done by sled-agent - tokio::spawn(async move { - manager.run().await; - }); + async fn setup_storage(log: &Logger) -> StorageManagerTestHarness { + let mut harness = StorageManagerTestHarness::new(&log).await; - // These must be internal zpools - for i in 0..2 { - let internal_zpool_name = ZpoolName::new_internal(Uuid::new_v4()); - let internal_disk: RawDisk = - SyntheticDisk::new(internal_zpool_name.clone(), i).into(); - handle.upsert_disk(internal_disk).await; - } - handle + harness.handle().key_manager_ready().await; + let _raw_disks = + harness.add_vdevs(&["m2_left.vdev", "m2_right.vdev"]).await; + harness } impl ResourceWrapper { - // Create new storage resources, and mount fake datasets at the required + // Create new storage resources, and mount datasets at the required // locations. - async fn new() -> Self { + async fn new(log: &Logger) -> Self { // Spawn the storage related tasks required for testing and insert // synthetic disks. 
- let storage_handle = setup_storage().await; - let resources = storage_handle.get_latest_resources().await; - let dirs = resources.all_zone_bundle_directories(); - for d in dirs.iter() { - let id = - d.components().nth(3).unwrap().as_str().parse().unwrap(); - create_test_dataset(&id, d).await.unwrap(); - } - Self { storage_handle, dirs } - } - } - - impl Drop for ResourceWrapper { - fn drop(&mut self) { - for d in self.dirs.iter() { - let id = - d.components().nth(3).unwrap().as_str().parse().unwrap(); - remove_test_dataset(&id).unwrap(); - } + let storage_test_harness = setup_storage(log).await; + let resources = + storage_test_harness.handle().get_latest_disks().await; + let mut dirs = resources.all_zone_bundle_directories(); + dirs.sort(); + Self { storage_test_harness, dirs } } } @@ -2325,25 +2303,34 @@ mod illumos_tests { async fn setup_fake_cleanup_task() -> anyhow::Result { let log = test_logger(); let context = CleanupContext::default(); - let resource_wrapper = ResourceWrapper::new().await; + let resource_wrapper = ResourceWrapper::new(&log).await; let bundler = ZoneBundler::new( log, - resource_wrapper.storage_handle.clone(), + resource_wrapper.storage_test_harness.handle().clone(), context, ); - Ok(CleanupTestContext { resource_wrapper, context, bundler }) + Ok(CleanupTestContext { + ctx: Arc::new(Mutex::new(CleanupTestContextInner { + resource_wrapper, + context, + bundler, + })), + }) } #[tokio::test] async fn test_context() { - let ctx = setup_fake_cleanup_task().await.unwrap(); + let context = setup_fake_cleanup_task().await.unwrap(); + let mut ctx = context.ctx.lock().await; let context = ctx.bundler.cleanup_context().await; assert_eq!(context, ctx.context, "received incorrect context"); + ctx.resource_wrapper.storage_test_harness.cleanup().await; } #[tokio::test] async fn test_update_context() { - let ctx = setup_fake_cleanup_task().await.unwrap(); + let context = setup_fake_cleanup_task().await.unwrap(); + let mut ctx = context.ctx.lock().await; let new_context = CleanupContext { period: CleanupPeriod::new(ctx.context.period.as_duration() / 2) .unwrap(), @@ -2363,6 +2350,7 @@ mod illumos_tests { .expect("failed to set context"); let context = ctx.bundler.cleanup_context().await; assert_eq!(context, new_context, "failed to update context"); + ctx.resource_wrapper.storage_test_harness.cleanup().await; } // Quota applied to test datasets. @@ -2374,59 +2362,7 @@ mod illumos_tests { // i.e., the "ashift" value. An empty dataset is unlikely to contain more // than one megabyte of overhead, so use that as a conservative test size to // avoid issues. - const TEST_QUOTA: u64 = 1024 * 1024; - - async fn create_test_dataset( - id: &Uuid, - mountpoint: &Utf8PathBuf, - ) -> anyhow::Result<()> { - let output = Command::new("/usr/bin/pfexec") - .arg(ZFS) - .arg("create") - .arg("-o") - .arg(format!("quota={TEST_QUOTA}")) - .arg("-o") - .arg(format!("mountpoint={mountpoint}")) - .arg(format!("rpool/{id}")) - .output() - .await - .context("failed to spawn zfs create operation")?; - anyhow::ensure!( - output.status.success(), - "zfs create operation failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - // Make the path operable by the test code. 
- let output = Command::new("/usr/bin/pfexec") - .arg("chmod") - .arg("a+rw") - .arg(&mountpoint) - .output() - .await - .context("failed to spawn chmod operation")?; - anyhow::ensure!( - output.status.success(), - "chmod-ing the dataset failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - Ok(()) - } - - fn remove_test_dataset(id: &Uuid) -> anyhow::Result<()> { - let output = std::process::Command::new("/usr/bin/pfexec") - .arg(ZFS) - .arg("destroy") - .arg(format!("rpool/{id}")) - .output() - .context("failed to spawn zfs destroy operation")?; - anyhow::ensure!( - output.status.success(), - "zfs destroy operation failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - Ok(()) - } + const TEST_QUOTA: usize = sled_storage::dataset::DEBUG_DATASET_QUOTA; async fn run_test_with_zfs_dataset(test: T) where @@ -2436,7 +2372,14 @@ mod illumos_tests { let context = setup_fake_cleanup_task() .await .expect("failed to create cleanup task"); - let result = test(context).await; + let result = test(context.clone()).await; + + let mut ctx = context.ctx.lock().await; + info!( + &ctx.bundler.log, + "Test completed, performing cleanup before emitting result" + ); + ctx.resource_wrapper.storage_test_harness.cleanup().await; result.expect("test failed!"); } @@ -2448,6 +2391,7 @@ mod illumos_tests { async fn test_utilization_body( ctx: CleanupTestContext, ) -> anyhow::Result<()> { + let ctx = ctx.ctx.lock().await; let utilization = ctx.bundler.utilization().await?; let paths = utilization.keys().cloned().collect::>(); @@ -2462,8 +2406,22 @@ mod illumos_tests { .values() .next() .context("no utilization information?")?; + + // If this needs to change, go modify the "add_vdevs" call in + // "setup_storage". + assert!( + TEST_QUOTA + < StorageManagerTestHarness::DEFAULT_VDEV_SIZE + .try_into() + .unwrap(), + "Quota larger than underlying device (quota: {}, device size: {})", + TEST_QUOTA, + StorageManagerTestHarness::DEFAULT_VDEV_SIZE, + ); + anyhow::ensure!( - bundle_utilization.dataset_quota == TEST_QUOTA, + bundle_utilization.dataset_quota + == u64::try_from(TEST_QUOTA).unwrap(), "computed incorrect dataset quota" ); @@ -2489,9 +2447,13 @@ mod illumos_tests { DaysOfOurBundles::new().next().unwrap(), ZoneBundleCause::ExplicitRequest, ) - .await?; + .await + .context("Failed to insert_fake_bundle")?; - let new_utilization = ctx.bundler.utilization().await?; + let new_utilization = + ctx.bundler.utilization().await.context( + "Failed to get utilization after inserting fake bundle", + )?; anyhow::ensure!( paths == new_utilization.keys().cloned().collect::>(), "paths should not change" @@ -2545,6 +2507,7 @@ mod illumos_tests { } async fn test_cleanup_body(ctx: CleanupTestContext) -> anyhow::Result<()> { + let ctx = ctx.ctx.lock().await; // Let's add a bunch of fake bundles, until we should be over the // storage limit. 
These will all be explicit requests, so the priority // should be decided based on time, i.e., the ones first added should be @@ -2560,16 +2523,18 @@ mod illumos_tests { let mut days = DaysOfOurBundles::new(); let mut info = Vec::new(); let mut utilization = ctx.bundler.utilization().await?; + let bundle_dir = &ctx.resource_wrapper.dirs[0]; loop { let us = utilization - .values() - .next() + .get(bundle_dir) .context("no utilization information")?; + if us.bytes_used > us.bytes_available { break; } + let it = insert_fake_bundle( - &ctx.resource_wrapper.dirs[0], + bundle_dir, days.next().unwrap(), ZoneBundleCause::ExplicitRequest, ) @@ -2582,15 +2547,8 @@ mod illumos_tests { let counts = ctx.bundler.cleanup().await.context("failed to run cleanup")?; - // We should have cleaned up items in the same paths that we have in the - // context. - anyhow::ensure!( - counts.keys().zip(ctx.resource_wrapper.dirs.iter()).all(|(a, b)| a == b), - "cleaned-up directories do not match the context's storage directories", - ); - // We should have cleaned up the first-inserted bundle. - let count = counts.values().next().context("no cleanup counts")?; + let count = counts.get(bundle_dir).context("no cleanup counts")?; anyhow::ensure!(count.bundles == 1, "expected to cleanup one bundle"); anyhow::ensure!( count.bytes == info[0].bytes, @@ -2621,6 +2579,7 @@ mod illumos_tests { async fn test_list_with_filter_body( ctx: CleanupTestContext, ) -> anyhow::Result<()> { + let ctx = ctx.ctx.lock().await; let mut days = DaysOfOurBundles::new(); let mut info = Vec::new(); const N_BUNDLES: usize = 3; diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index a649b205e1..adea1d182a 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -32,6 +32,14 @@ pub enum PooledDiskError { BadPartitionLayout { path: Utf8PathBuf, why: String }, #[error("Requested partition {partition:?} not found on device {path}")] NotFound { path: Utf8PathBuf, partition: Partition }, + #[error("Zpool UUID required to format this disk")] + MissingZpoolUuid, + #[error("Observed Zpool with unexpected UUID (saw: {observed}, expected: {expected})")] + UnexpectedUuid { expected: Uuid, observed: Uuid }, + #[error("Unexpected disk variant")] + UnexpectedVariant, + #[error("Zpool does not exist")] + ZpoolDoesNotExist, #[error(transparent)] ZpoolCreate(#[from] illumos_utils::zpool::CreateError), #[error("Cannot import zpool: {0}")] @@ -58,7 +66,7 @@ pub enum Partition { ZfsPool, } -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] pub struct DiskPaths { // Full path to the disk under "/devices". // Should NOT end with a ":partition_letter". @@ -69,7 +77,11 @@ pub struct DiskPaths { impl DiskPaths { // Returns the "illumos letter-indexed path" for a device. - fn partition_path(&self, index: usize, raw: bool) -> Option { + pub fn partition_path( + &self, + index: usize, + raw: bool, + ) -> Option { let index = u8::try_from(index).ok()?; let path = &self.devfs_path; @@ -125,7 +137,7 @@ impl DiskPaths { /// This exists as a distinct entity from `Disk` in `sled-storage` because it /// may be desirable to monitor for hardware in one context, and conform disks /// to partition layouts in a different context. 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] pub struct UnparsedDisk { paths: DiskPaths, slot: i64, @@ -135,7 +147,6 @@ pub struct UnparsedDisk { } impl UnparsedDisk { - #[allow(dead_code)] pub fn new( devfs_path: Utf8PathBuf, dev_path: Option, @@ -153,6 +164,10 @@ impl UnparsedDisk { } } + pub fn paths(&self) -> &DiskPaths { + &self.paths + } + pub fn devfs_path(&self) -> &Utf8PathBuf { &self.paths.devfs_path } @@ -168,6 +183,10 @@ impl UnparsedDisk { pub fn is_boot_disk(&self) -> bool { self.is_boot_disk } + + pub fn slot(&self) -> i64 { + self.slot + } } /// A physical disk that is partitioned to contain exactly one zpool @@ -197,14 +216,15 @@ impl PooledDisk { pub fn new( log: &Logger, unparsed_disk: UnparsedDisk, + zpool_id: Option, ) -> Result { let paths = &unparsed_disk.paths; let variant = unparsed_disk.variant; - let identity = unparsed_disk.identity(); + let identity = &unparsed_disk.identity; // Ensure the GPT has the right format. This does not necessarily // mean that the partitions are populated with the data we need. let partitions = - ensure_partition_layout(&log, &paths, variant, identity)?; + ensure_partition_layout(&log, &paths, variant, identity, zpool_id)?; // Find the path to the zpool which exists on this disk. // @@ -216,9 +236,10 @@ impl PooledDisk { false, )?; - let zpool_name = Self::ensure_zpool_exists(log, variant, &zpool_path)?; - Self::ensure_zpool_imported(log, &zpool_name)?; - Self::ensure_zpool_failmode_is_continue(log, &zpool_name)?; + let zpool_name = + ensure_zpool_exists(log, variant, &zpool_path, zpool_id)?; + ensure_zpool_imported(log, &zpool_name)?; + ensure_zpool_failmode_is_continue(log, &zpool_name)?; Ok(Self { paths: unparsed_disk.paths, @@ -230,83 +251,130 @@ impl PooledDisk { zpool_name, }) } +} - fn ensure_zpool_exists( - log: &Logger, - variant: DiskVariant, - zpool_path: &Utf8Path, - ) -> Result { - let zpool_name = match Fstyp::get_zpool(&zpool_path) { - Ok(zpool_name) => zpool_name, - Err(_) => { - // What happened here? - // - We saw that a GPT exists for this Disk (or we didn't, and - // made our own). - // - However, this particular partition does not appear to have - // a zpool. - // - // This can happen in situations where "zpool create" - // initialized a zpool, and "zpool destroy" removes the zpool - // but still leaves the partition table untouched. - // - // To remedy: Let's enforce that the partition exists. - info!( - log, - "GPT exists without Zpool: formatting zpool at {}", - zpool_path, - ); - // If a zpool does not already exist, create one. - let zpool_name = match variant { - DiskVariant::M2 => ZpoolName::new_internal(Uuid::new_v4()), - DiskVariant::U2 => ZpoolName::new_external(Uuid::new_v4()), - }; - Zpool::create(&zpool_name, &zpool_path)?; - zpool_name +/// Checks if the zpool exists, but makes no modifications, +/// and does not attempt to import the zpool. 
+pub fn check_if_zpool_exists( + zpool_path: &Utf8Path, +) -> Result { + let zpool_name = match Fstyp::get_zpool(&zpool_path) { + Ok(zpool_name) => zpool_name, + Err(_) => return Err(PooledDiskError::ZpoolDoesNotExist), + }; + Ok(zpool_name) +} + +pub fn ensure_zpool_exists( + log: &Logger, + variant: DiskVariant, + zpool_path: &Utf8Path, + zpool_id: Option, +) -> Result { + let zpool_name = match Fstyp::get_zpool(&zpool_path) { + Ok(zpool_name) => { + if let Some(expected) = zpool_id { + info!(log, "Checking that UUID in storage matches request"; "expected" => ?expected); + let observed = zpool_name.id(); + if expected != observed { + warn!(log, "Zpool UUID mismatch"; "expected" => ?expected, "observed" => ?observed); + return Err(PooledDiskError::UnexpectedUuid { + expected, + observed, + }); + } } - }; - Zpool::import(&zpool_name).map_err(|e| { - warn!(log, "Failed to import zpool {zpool_name}: {e}"); - PooledDiskError::ZpoolImport(e) - })?; + zpool_name + } + Err(_) => { + // What happened here? + // - We saw that a GPT exists for this Disk (or we didn't, and + // made our own). + // - However, this particular partition does not appear to have + // a zpool. + // + // This can happen in situations where "zpool create" + // initialized a zpool, and "zpool destroy" removes the zpool + // but still leaves the partition table untouched. + // + // To remedy: Let's enforce that the partition exists. + info!( + log, + "GPT exists without Zpool: formatting zpool at {}", zpool_path, + ); + let id = match zpool_id { + Some(id) => { + info!(log, "Formatting zpool with requested ID"; "id" => ?id); + id + } + None => { + let id = Uuid::new_v4(); + info!(log, "Formatting zpool with generated ID"; "id" => ?id); + id + } + }; + + // If a zpool does not already exist, create one. + let zpool_name = match variant { + DiskVariant::M2 => ZpoolName::new_internal(id), + DiskVariant::U2 => ZpoolName::new_external(id), + }; + Zpool::create(&zpool_name, &zpool_path)?; + zpool_name + } + }; + Zpool::import(&zpool_name).map_err(|e| { + warn!(log, "Failed to import zpool {zpool_name}: {e}"); + PooledDiskError::ZpoolImport(e) + })?; - Ok(zpool_name) - } + Ok(zpool_name) +} - fn ensure_zpool_imported( - log: &Logger, - zpool_name: &ZpoolName, - ) -> Result<(), PooledDiskError> { - Zpool::import(&zpool_name).map_err(|e| { - warn!(log, "Failed to import zpool {zpool_name}: {e}"); - PooledDiskError::ZpoolImport(e) - })?; - Ok(()) - } +pub fn ensure_zpool_imported( + log: &Logger, + zpool_name: &ZpoolName, +) -> Result<(), PooledDiskError> { + Zpool::import(&zpool_name).map_err(|e| { + warn!(log, "Failed to import zpool {zpool_name}: {e}"); + PooledDiskError::ZpoolImport(e) + })?; + Ok(()) +} - fn ensure_zpool_failmode_is_continue( - log: &Logger, - zpool_name: &ZpoolName, - ) -> Result<(), PooledDiskError> { - // Ensure failmode is set to `continue`. See - // https://github.com/oxidecomputer/omicron/issues/2766 for details. The - // short version is, each pool is only backed by one vdev. There is no - // recovery if one starts breaking, so if connectivity to one dies it's - // actively harmful to try to wait for it to come back; we'll be waiting - // forever and get stuck. We'd rather get the errors so we can deal with - // them ourselves. 
- Zpool::set_failmode_continue(&zpool_name).map_err(|e| { - warn!( - log, - "Failed to set failmode=continue on zpool {zpool_name}: {e}" - ); - PooledDiskError::ZpoolImport(e) - })?; - Ok(()) - } +pub fn ensure_zpool_failmode_is_continue( + log: &Logger, + zpool_name: &ZpoolName, +) -> Result<(), PooledDiskError> { + // Ensure failmode is set to `continue`. See + // https://github.com/oxidecomputer/omicron/issues/2766 for details. The + // short version is, each pool is only backed by one vdev. There is no + // recovery if one starts breaking, so if connectivity to one dies it's + // actively harmful to try to wait for it to come back; we'll be waiting + // forever and get stuck. We'd rather get the errors so we can deal with + // them ourselves. + Zpool::set_failmode_continue(&zpool_name).map_err(|e| { + warn!( + log, + "Failed to set failmode=continue on zpool {zpool_name}: {e}" + ); + PooledDiskError::ZpoolImport(e) + })?; + Ok(()) } #[derive( - Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, + Debug, + Clone, + Copy, + PartialEq, + Eq, + Hash, + Serialize, + Deserialize, + JsonSchema, + Ord, + PartialOrd, )] pub enum DiskVariant { U2, diff --git a/sled-hardware/src/illumos/partitions.rs b/sled-hardware/src/illumos/partitions.rs index 3b8e0af2ee..32debfc3e1 100644 --- a/sled-hardware/src/illumos/partitions.rs +++ b/sled-hardware/src/illumos/partitions.rs @@ -148,9 +148,10 @@ pub fn ensure_partition_layout( paths: &DiskPaths, variant: DiskVariant, identity: &DiskIdentity, + zpool_id: Option, ) -> Result, PooledDiskError> { internal_ensure_partition_layout::( - log, paths, variant, identity, + log, paths, variant, identity, zpool_id, ) } @@ -161,23 +162,26 @@ fn internal_ensure_partition_layout( paths: &DiskPaths, variant: DiskVariant, identity: &DiskIdentity, + zpool_id: Option, ) -> Result, PooledDiskError> { // Open the "Whole Disk" as a raw device to be parsed by the // libefi-illumos library. This lets us peek at the GPT before // making too many assumptions about it. let raw = true; let path = paths.whole_disk(raw); + let devfs_path_str = paths.devfs_path.as_str().to_string(); + let log = log.new(slog::o!("path" => devfs_path_str)); let gpt = match GPT::read(&path) { Ok(gpt) => { // This should be the common steady-state case - info!(log, "Disk at {} already has a GPT", paths.devfs_path); + info!(log, "Disk already has a GPT"); gpt } Err(libefi_illumos::Error::LabelNotFound) => { // Fresh U.2 disks are an example of devices where "we don't expect // a GPT to exist". - info!(log, "Disk at {} does not have a GPT", paths.devfs_path); + info!(log, "Disk does not have a GPT"); // For ZFS-implementation-specific reasons, Zpool create can only // act on devices under the "/dev" hierarchy, rather than the device @@ -193,12 +197,19 @@ fn internal_ensure_partition_layout( DiskVariant::U2 => { // First we need to check that this disk is of the proper // size and correct logical block address formatting. - ensure_size_and_formatting(log, identity)?; + ensure_size_and_formatting(&log, identity)?; + + info!( + log, + "Formatting zpool on disk"; + "uuid" => ?zpool_id, + ); + let Some(zpool_id) = zpool_id else { + return Err(PooledDiskError::MissingZpoolUuid); + }; - // If we were successful we can create a zpool on this disk. - info!(log, "Formatting zpool on disk {}", paths.devfs_path); // If a zpool does not already exist, create one. 
- let zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let zpool_name = ZpoolName::new_external(zpool_id); Zpool::create(&zpool_name, dev_path)?; return Ok(vec![Partition::ZfsPool]); } @@ -385,6 +396,7 @@ mod test { &DiskPaths { devfs_path, dev_path: None }, DiskVariant::U2, &mock_disk_identity(), + None, ); match result { Err(PooledDiskError::CannotFormatMissingDevPath { .. }) => {} @@ -419,6 +431,7 @@ mod test { }, DiskVariant::U2, &mock_disk_identity(), + Some(Uuid::new_v4()), ) .expect("Should have succeeded partitioning disk"); @@ -444,6 +457,7 @@ mod test { }, DiskVariant::M2, &mock_disk_identity(), + None, ) .is_err()); @@ -482,6 +496,7 @@ mod test { }, DiskVariant::U2, &mock_disk_identity(), + None, ) .expect("Should be able to parse disk"); @@ -525,6 +540,7 @@ mod test { }, DiskVariant::M2, &mock_disk_identity(), + None, ) .expect("Should be able to parse disk"); @@ -565,6 +581,7 @@ mod test { }, DiskVariant::M2, &mock_disk_identity(), + None, ) .expect_err("Should have failed parsing empty GPT"), PooledDiskError::BadPartitionLayout { .. } @@ -591,6 +608,7 @@ mod test { }, DiskVariant::U2, &mock_disk_identity(), + None, ) .expect_err("Should have failed parsing empty GPT"), PooledDiskError::BadPartitionLayout { .. } diff --git a/sled-hardware/src/non_illumos/mod.rs b/sled-hardware/src/non_illumos/mod.rs index e990567b7c..a47bb0d2bc 100644 --- a/sled-hardware/src/non_illumos/mod.rs +++ b/sled-hardware/src/non_illumos/mod.rs @@ -68,6 +68,7 @@ pub fn ensure_partition_layout( _paths: &DiskPaths, _variant: DiskVariant, _identity: &DiskIdentity, + _zpool_id: Option, ) -> Result, PooledDiskError> { unimplemented!("Accessing hardware unsupported on non-illumos"); } diff --git a/sled-storage/Cargo.toml b/sled-storage/Cargo.toml index cb3a790631..839908effb 100644 --- a/sled-storage/Cargo.toml +++ b/sled-storage/Cargo.toml @@ -4,11 +4,15 @@ version = "0.1.0" edition = "2021" [dependencies] +anyhow.workspace = true async-trait.workspace = true camino.workspace = true +camino-tempfile.workspace = true cfg-if.workspace = true +debug-ignore.workspace = true derive_more.workspace = true glob.workspace = true +futures.workspace = true illumos-utils.workspace = true key-manager.workspace = true omicron-common.workspace = true @@ -24,9 +28,9 @@ uuid.workspace = true omicron-workspace-hack.workspace = true [dev-dependencies] -illumos-utils = { workspace = true, features = ["tmp_keypath", "testing"] } +expectorate.workspace = true +illumos-utils = { workspace = true, features = ["testing"] } omicron-test-utils.workspace = true -camino-tempfile.workspace = true [features] # Quotas and the like can be shrunk via this feature diff --git a/sled-storage/src/config.rs b/sled-storage/src/config.rs new file mode 100644 index 0000000000..a3baf220b2 --- /dev/null +++ b/sled-storage/src/config.rs @@ -0,0 +1,39 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Describes a handful of configuration options that can be +//! used to tweak behavior under test. + +use camino::Utf8PathBuf; + +/// Options to alter the mount path of datasets. +/// +/// By default, datasets within a pool are mounted under "/pool/ext/..." and +/// "/pool/int/...". For more context, see: +/// [illumos_utils::zpool::ZpoolName::dataset_mountpoint]. 
+/// +/// However, under test, it can be desirable to have a root filesystem +/// which is isolated from other tests, and which doesn't need to exist under +/// the root filesystem. [MountConfig] provides options to tweak which path is +/// used to set up and access these datasets. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct MountConfig { + /// The root path under which datasets are located. + pub root: Utf8PathBuf, + + /// The path where synthetic disks are stored, + /// if their paths are not absolute. + pub synthetic_disk_root: Utf8PathBuf, +} + +impl Default for MountConfig { + fn default() -> Self { + Self { + root: Utf8PathBuf::from( + illumos_utils::zpool::ZPOOL_MOUNTPOINT_ROOT, + ), + synthetic_disk_root: Utf8PathBuf::from("/var/tmp"), + } + } +} diff --git a/sled-storage/src/dataset.rs b/sled-storage/src/dataset.rs index 41b77ea38b..06eea367b9 100644 --- a/sled-storage/src/dataset.rs +++ b/sled-storage/src/dataset.rs @@ -4,6 +4,7 @@ //! ZFS dataset related functionality +use crate::config::MountConfig; use crate::keyfile::KeyFile; use camino::Utf8PathBuf; use cfg_if::cfg_if; @@ -33,7 +34,7 @@ pub const M2_BACKING_DATASET: &'static str = "backing"; cfg_if! { if #[cfg(any(test, feature = "testing"))] { // Tuned for zone_bundle tests - pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 10); + pub const DEBUG_DATASET_QUOTA: usize = 1 << 20; } else { // TODO-correctness: This value of 100GiB is a pretty wild guess, and should be // tuned as needed. @@ -279,10 +280,12 @@ pub enum DatasetError { /// `None` is for the M.2s touched by the Installinator. pub(crate) async fn ensure_zpool_has_datasets( log: &Logger, + mount_config: &MountConfig, zpool_name: &ZpoolName, disk_identity: &DiskIdentity, key_requester: Option<&StorageKeyRequester>, ) -> Result<(), DatasetError> { + info!(log, "Ensuring zpool has datasets"; "zpool" => ?zpool_name, "disk_identity" => ?disk_identity); let (root, datasets) = match zpool_name.kind().into() { DiskVariant::M2 => (None, M2_EXPECTED_DATASETS.iter()), DiskVariant::U2 => (Some(CRYPT_DATASET), U2_EXPECTED_DATASETS.iter()), @@ -297,8 +300,10 @@ pub(crate) async fn ensure_zpool_has_datasets( let Some(key_requester) = key_requester else { return Err(DatasetError::MissingStorageKeyRequester); }; - let mountpoint = zpool_name.dataset_mountpoint(dataset); - let keypath: Keypath = disk_identity.into(); + let mountpoint = + zpool_name.dataset_mountpoint(&mount_config.root, dataset); + let keypath: Keypath = + illumos_utils::zfs::Keypath::new(disk_identity, &mount_config.root); let epoch = if let Ok(epoch_str) = Zfs::get_oxide_value(dataset, "epoch") @@ -324,15 +329,15 @@ pub(crate) async fn ensure_zpool_has_datasets( // other reason, but the dataset actually existed, we will // try to create the dataset below and that will fail. So // there is no harm in just loading the latest secret here. 
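
A short sketch of how `MountConfig` is intended to be used (not from the patch; the scratch paths are made up): production code keeps the default root, under which datasets land at `/pool/ext/...` and `/pool/int/...`, while tests point both roots at a disposable directory, and the `dataset_mountpoint(&mount_config.root, ...)` call in the `dataset.rs` hunk above resolves under whichever root was supplied.

```rust
use camino::Utf8PathBuf;
use sled_storage::config::MountConfig;

fn example_mount_configs() -> (MountConfig, MountConfig) {
    // Production: the default keeps the historical layout, with datasets
    // mounted under "/pool/ext/..." and "/pool/int/...".
    let prod = MountConfig::default();

    // Test: keep datasets (and relative vdev files) under a scratch root so
    // concurrent tests don't collide. The paths below are illustrative.
    let test = MountConfig {
        root: Utf8PathBuf::from("/var/tmp/omicron-test-0"),
        synthetic_disk_root: Utf8PathBuf::from("/var/tmp/omicron-test-0/vdevs"),
    };

    // Downstream code threads `&mount_config.root` into mountpoint
    // resolution, e.g. zpool_name.dataset_mountpoint(&mount_config.root,
    // dataset) as shown in the dataset.rs changes above.
    (prod, test)
}
```
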
- info!(log, "Loading latest secret"; "disk_id"=>#?disk_identity); + info!(log, "Loading latest secret"; "disk_id"=>?disk_identity); let epoch = key_requester.load_latest_secret().await?; - info!(log, "Loaded latest secret"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); + info!(log, "Loaded latest secret"; "epoch"=>%epoch, "disk_id"=>?disk_identity); epoch }; - info!(log, "Retrieving key"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); + info!(log, "Retrieving key"; "epoch"=>%epoch, "disk_id"=>?disk_identity); let key = key_requester.get_key(epoch, disk_identity.clone()).await?; - info!(log, "Got key"; "epoch"=>%epoch, "disk_id"=>#?disk_identity); + info!(log, "Got key"; "epoch"=>%epoch, "disk_id"=>?disk_identity); let mut keyfile = KeyFile::create(keypath.clone(), key.expose_secret(), log) @@ -366,7 +371,8 @@ pub(crate) async fn ensure_zpool_has_datasets( }; for dataset in datasets.into_iter() { - let mountpoint = zpool_name.dataset_mountpoint(dataset.name); + let mountpoint = + zpool_name.dataset_mountpoint(&mount_config.root, dataset.name); let name = &format!("{}/{}", zpool_name, dataset.name); // Use a value that's alive for the duration of this sled agent diff --git a/sled-storage/src/disk.rs b/sled-storage/src/disk.rs index 705b38718a..7383475cb9 100644 --- a/sled-storage/src/disk.rs +++ b/sled-storage/src/disk.rs @@ -4,19 +4,72 @@ //! Disk related types +use anyhow::bail; use camino::{Utf8Path, Utf8PathBuf}; use derive_more::From; -use illumos_utils::zpool::{Zpool, ZpoolKind, ZpoolName}; +use illumos_utils::zpool::{ZpoolKind, ZpoolName}; use key_manager::StorageKeyRequester; +use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; +use omicron_common::ledger::Ledgerable; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; use sled_hardware::{ DiskVariant, Partition, PooledDisk, PooledDiskError, UnparsedDisk, }; -use slog::Logger; -use std::fs::File; +use slog::{info, Logger}; +use uuid::Uuid; +use crate::config::MountConfig; use crate::dataset; +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, +)] +pub struct OmicronPhysicalDiskConfig { + pub identity: DiskIdentity, + pub id: Uuid, + pub pool_id: Uuid, +} + +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, +)] +pub struct OmicronPhysicalDisksConfig { + /// generation number of this configuration + /// + /// This generation number is owned by the control plane (i.e., RSS or + /// Nexus, depending on whether RSS-to-Nexus handoff has happened). It + /// should not be bumped within Sled Agent. + /// + /// Sled Agent rejects attempts to set the configuration to a generation + /// older than the one it's currently running. + pub generation: Generation, + + pub disks: Vec, +} + +impl Default for OmicronPhysicalDisksConfig { + fn default() -> Self { + Self { generation: Generation::new(), disks: vec![] } + } +} + +impl Ledgerable for OmicronPhysicalDisksConfig { + fn is_newer_than(&self, other: &OmicronPhysicalDisksConfig) -> bool { + self.generation > other.generation + } + + // No need to do this, the generation number is provided externally. 
+ fn generation_bump(&mut self) {} +} + +impl OmicronPhysicalDisksConfig { + pub fn new() -> Self { + Self { generation: Generation::new(), disks: vec![] } + } +} + #[derive(Debug, thiserror::Error)] pub enum DiskError { #[error(transparent)] @@ -25,13 +78,11 @@ pub enum DiskError { PooledDisk(#[from] sled_hardware::PooledDiskError), } -// A synthetic disk that acts as one "found" by the hardware and that is backed -// by a zpool +/// A synthetic disk which has been formatted with a zpool. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SyntheticDisk { - pub identity: DiskIdentity, - pub zpool_name: ZpoolName, - pub slot: i64, + raw: RawSyntheticDisk, + zpool_name: ZpoolName, } // By adding slots at an "offset", this acts as a barrier against synthetic @@ -43,45 +94,111 @@ pub struct SyntheticDisk { const SYNTHETIC_SLOT_OFFSET: i64 = 1024; impl SyntheticDisk { - // Create a zpool and import it for the synthetic disk - // Zpools willl be set to the min size of 64Mib - pub fn create_zpool( - dir: &Utf8Path, - zpool_name: &ZpoolName, + // "Manages" a SyntheticDisk by ensuring that it has a Zpool and importing + // it. If the zpool already exists, it is imported, but not re-created. + pub fn new( + log: &Logger, + mount_config: &MountConfig, + raw: RawSyntheticDisk, + zpool_id: Option, + ) -> Self { + let path = if raw.path.is_absolute() { + raw.path.clone() + } else { + mount_config.synthetic_disk_root.join(&raw.path) + }; + + info!( + log, + "Invoking SyntheticDisk::new"; + "identity" => ?raw.identity, + "path" => %path, + ); + + let zpool_name = sled_hardware::disk::ensure_zpool_exists( + log, + raw.variant, + &path, + zpool_id, + ) + .unwrap(); + sled_hardware::disk::ensure_zpool_imported(log, &zpool_name).unwrap(); + sled_hardware::disk::ensure_zpool_failmode_is_continue( + log, + &zpool_name, + ) + .unwrap(); + + Self { raw, zpool_name } + } +} + +// A synthetic disk that acts as one "found" by the hardware and that is backed +// by a vdev. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub struct RawSyntheticDisk { + pub path: Utf8PathBuf, + pub identity: DiskIdentity, + pub variant: DiskVariant, + pub slot: i64, +} + +impl RawSyntheticDisk { + /// Creates the file with a specified length, and also parses it as + /// a [RawSyntheticDisk]. + pub fn new_with_length>( + vdev: P, + length: u64, slot: i64, - ) -> SyntheticDisk { - // 64 MiB (min size of zpool) - const DISK_SIZE: u64 = 64 * 1024 * 1024; - let path = dir.join(zpool_name.to_string()); - let file = File::create(&path).unwrap(); - file.set_len(DISK_SIZE).unwrap(); - drop(file); - Zpool::create(zpool_name, &path).unwrap(); - Zpool::import(zpool_name).unwrap(); - Zpool::set_failmode_continue(zpool_name).unwrap(); - Self::new(zpool_name.clone(), slot) + ) -> Result { + let file = std::fs::File::create(vdev.as_ref())?; + file.set_len(length)?; + Self::load(vdev, slot) } - pub fn new(zpool_name: ZpoolName, slot: i64) -> SyntheticDisk { - let id = zpool_name.id(); + /// Treats a file at path `vdev` as a synthetic disk. The file + /// should already exist, and have the desired length. 
+ pub fn load>( + vdev: P, + slot: i64, + ) -> Result { + let path = vdev.as_ref(); + let Some(file) = path.file_name() else { + bail!("Missing file name for synthetic disk"); + }; + + let Some(file) = file.strip_suffix(".vdev") else { + bail!("Missing '.vdev' suffix for synthetic disk"); + }; + + let (serial, variant) = if let Some(serial) = file.strip_prefix("m2_") { + (serial, DiskVariant::M2) + } else if let Some(serial) = file.strip_prefix("u2_") { + (serial, DiskVariant::U2) + } else { + bail!("Unknown file prefix: {file}. Try one of {{m2_,u2_}}"); + }; + let identity = DiskIdentity { vendor: "synthetic-vendor".to_string(), - serial: format!("synthetic-serial-{id}"), - model: "synthetic-model".to_string(), + serial: format!("synthetic-serial-{serial}"), + model: format!("synthetic-model-{variant:?}"), }; - SyntheticDisk { + + Ok(Self { + path: path.into(), identity, - zpool_name, + variant, slot: slot + SYNTHETIC_SLOT_OFFSET, - } + }) } } // An [`UnparsedDisk`] disk learned about from the hardware or a wrapped zpool -#[derive(Debug, Clone, PartialEq, Eq, Hash, From)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd, From)] pub enum RawDisk { Real(UnparsedDisk), - Synthetic(SyntheticDisk), + Synthetic(RawSyntheticDisk), } impl RawDisk { @@ -90,7 +207,7 @@ impl RawDisk { Self::Real(disk) => disk.is_boot_disk(), Self::Synthetic(disk) => { // Just label any M.2 the boot disk. - disk.zpool_name.kind() == ZpoolKind::Internal + disk.variant == DiskVariant::M2 } } } @@ -105,18 +222,7 @@ impl RawDisk { pub fn variant(&self) -> DiskVariant { match self { Self::Real(disk) => disk.variant(), - Self::Synthetic(disk) => match disk.zpool_name.kind() { - ZpoolKind::External => DiskVariant::U2, - ZpoolKind::Internal => DiskVariant::M2, - }, - } - } - - #[cfg(test)] - pub fn zpool_name(&self) -> &ZpoolName { - match self { - Self::Real(_) => unreachable!(), - Self::Synthetic(disk) => &disk.zpool_name, + Self::Synthetic(disk) => disk.variant, } } @@ -131,12 +237,37 @@ impl RawDisk { !self.is_synthetic() } + pub fn u2_zpool_path(&self) -> Result { + if !matches!(self.variant(), DiskVariant::U2) { + return Err(PooledDiskError::UnexpectedVariant); + } + match self { + Self::Real(disk) => { + let paths = disk.paths(); + // This is hard-coded to be "0", but that's because we aren't + // really parsing the whole partition table before considering + // where this would be see. 
+ paths + .partition_path(0, false) + .ok_or_else(|| PooledDiskError::ZpoolDoesNotExist) + } + Self::Synthetic(raw) => Ok(raw.path.clone()), + } + } + pub fn devfs_path(&self) -> &Utf8PathBuf { match self { Self::Real(disk) => disk.devfs_path(), Self::Synthetic(_) => unreachable!(), } } + + pub fn slot(&self) -> i64 { + match self { + Self::Real(disk) => disk.slot(), + Self::Synthetic(disk) => disk.slot, + } + } } /// A physical [`PooledDisk`] or a [`SyntheticDisk`] that contains or is backed @@ -151,15 +282,23 @@ pub enum Disk { impl Disk { pub async fn new( log: &Logger, + mount_config: &MountConfig, raw_disk: RawDisk, + pool_id: Option, key_requester: Option<&StorageKeyRequester>, ) -> Result { - let disk = match raw_disk { - RawDisk::Real(disk) => PooledDisk::new(log, disk)?.into(), - RawDisk::Synthetic(disk) => Disk::Synthetic(disk), + let disk: Disk = match raw_disk { + RawDisk::Real(disk) => PooledDisk::new(log, disk, pool_id)?.into(), + RawDisk::Synthetic(disk) => Disk::Synthetic(SyntheticDisk::new( + log, + mount_config, + disk, + pool_id, + )), }; dataset::ensure_zpool_has_datasets( log, + mount_config, disk.zpool_name(), disk.identity(), key_requester, @@ -194,7 +333,7 @@ impl Disk { Self::Real(disk) => disk.is_boot_disk, Self::Synthetic(disk) => { // Just label any M.2 the boot disk. - disk.zpool_name.kind() == ZpoolKind::Internal + disk.raw.variant == DiskVariant::M2 } } } @@ -202,7 +341,7 @@ impl Disk { pub fn identity(&self) -> &DiskIdentity { match self { Self::Real(disk) => &disk.identity, - Self::Synthetic(disk) => &disk.identity, + Self::Synthetic(disk) => &disk.raw.identity, } } @@ -261,7 +400,25 @@ impl Disk { pub fn slot(&self) -> i64 { match self { Self::Real(disk) => disk.slot, - Self::Synthetic(disk) => disk.slot, + Self::Synthetic(disk) => disk.raw.slot, + } + } +} + +impl From for RawDisk { + fn from(disk: Disk) -> RawDisk { + match disk { + Disk::Real(pooled_disk) => RawDisk::Real(UnparsedDisk::new( + pooled_disk.paths.devfs_path, + pooled_disk.paths.dev_path, + pooled_disk.slot, + pooled_disk.variant, + pooled_disk.identity, + pooled_disk.is_boot_disk, + )), + Disk::Synthetic(synthetic_disk) => { + RawDisk::Synthetic(synthetic_disk.raw) + } } } } diff --git a/sled-storage/src/error.rs b/sled-storage/src/error.rs index b9f97ee428..4c5582fd79 100644 --- a/sled-storage/src/error.rs +++ b/sled-storage/src/error.rs @@ -8,6 +8,7 @@ use crate::dataset::{DatasetError, DatasetName}; use crate::disk::DiskError; use camino::Utf8PathBuf; use omicron_common::api::external::ByteCountRangeError; +use omicron_common::api::external::Generation; use uuid::Uuid; #[derive(thiserror::Error, Debug)] @@ -49,9 +50,6 @@ pub enum Error { #[error(transparent)] ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), - #[error("No U.2 Zpools found")] - NoU2Zpool, - #[error("Failed to parse UUID from {path}: {err}")] ParseUuid { path: Utf8PathBuf, @@ -76,6 +74,50 @@ pub enum Error { err: uuid::Error, }, + #[error("Not ready to manage U.2s (key manager is not ready)")] + KeyManagerNotReady, + + #[error("Physical disk configuration out-of-date (asked for {requested}, but latest is {current})")] + PhysicalDiskConfigurationOutdated { + requested: Generation, + current: Generation, + }, + + #[error("Failed to update ledger in internal storage")] + Ledger(#[from] omicron_common::ledger::Error), + + #[error("No ledger found on internal storage")] + LedgerNotFound, + #[error("Zpool Not Found: {0}")] ZpoolNotFound(String), } + +impl From for omicron_common::api::external::Error { + fn 
from(err: Error) -> Self { + use omicron_common::api::external::Error as ExternalError; + use omicron_common::api::external::LookupType; + use omicron_common::api::external::ResourceType; + + match err { + Error::LedgerNotFound => ExternalError::ObjectNotFound { + type_name: ResourceType::SledLedger, + lookup_type: LookupType::ByOther( + "Could not find record on M.2s".to_string(), + ), + }, + Error::ZpoolNotFound(name) => ExternalError::ObjectNotFound { + type_name: ResourceType::Zpool, + lookup_type: LookupType::ByName(name), + }, + Error::KeyManagerNotReady => ExternalError::ServiceUnavailable { + internal_message: + "Not ready to manage disks, try again after trust quorum" + .to_string(), + }, + _ => omicron_common::api::external::Error::InternalError { + internal_message: err.to_string(), + }, + } + } +} diff --git a/sled-storage/src/keyfile.rs b/sled-storage/src/keyfile.rs index 48e5d9a528..2c0524aec7 100644 --- a/sled-storage/src/keyfile.rs +++ b/sled-storage/src/keyfile.rs @@ -27,6 +27,7 @@ impl KeyFile { key: &[u8; 32], log: &Logger, ) -> std::io::Result { + info!(log, "About to create keyfile"; "path" => ?path); // We want to overwrite any existing contents. let mut file = tokio::fs::OpenOptions::new() .create(true) @@ -34,7 +35,7 @@ impl KeyFile { .open(&path.0) .await?; file.write_all(key).await?; - info!(log, "Created keyfile {}", path); + info!(log, "Created keyfile"; "path" => ?path); Ok(KeyFile { path, file, diff --git a/sled-storage/src/lib.rs b/sled-storage/src/lib.rs index d4b64c55a5..681f003b52 100644 --- a/sled-storage/src/lib.rs +++ b/sled-storage/src/lib.rs @@ -8,10 +8,13 @@ //! hardware partitions from the `sled-hardware` crate. It utilizes the //! `illumos-utils` crate to actually perform ZFS related OS calls. +pub mod config; pub mod dataset; pub mod disk; pub mod error; pub(crate) mod keyfile; pub mod manager; +#[cfg(any(feature = "testing", test))] +pub mod manager_test_harness; pub mod pool; pub mod resources; diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index bb749cc366..2cd79e6556 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -6,17 +6,24 @@ use std::collections::HashSet; -use crate::dataset::{DatasetError, DatasetName}; -use crate::disk::{Disk, DiskError, RawDisk}; +use crate::config::MountConfig; +use crate::dataset::{DatasetName, CONFIG_DATASET}; +use crate::disk::{ + OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, RawDisk, +}; use crate::error::Error; -use crate::resources::{AddDiskResult, StorageResources}; +use crate::resources::{AllDisks, DisksManagementResult, StorageResources}; use camino::Utf8PathBuf; +use debug_ignore::DebugIgnore; +use futures::future::FutureExt; use illumos_utils::zfs::{Mountpoint, Zfs}; use illumos_utils::zpool::ZpoolName; use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; +use omicron_common::ledger::Ledger; use sled_hardware::DiskVariant; -use slog::{error, info, o, warn, Logger}; +use slog::{info, o, warn, Logger}; +use std::future::Future; use tokio::sync::{mpsc, oneshot, watch}; use tokio::time::{interval, Duration, MissedTickBehavior}; use uuid::Uuid; @@ -48,80 +55,199 @@ use uuid::Uuid; // large messages. // // Here we start relatively small so that we can evaluate our choice over time. 
-const QUEUE_SIZE: usize = 256; +pub(crate) const QUEUE_SIZE: usize = 256; + +const SYNCHRONIZE_INTERVAL: Duration = Duration::from_secs(10); + +// The filename of the ledger storing physical disk info +const DISKS_LEDGER_FILENAME: &str = "omicron-physical-disks.json"; #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum StorageManagerState { +enum StorageManagerState { + // We know that any attempts to manage disks will fail, as the key manager + // is not ready yet. WaitingForKeyManager, - QueueingDisks, - Normal, + + // This state is used to indicate that the set of "control plane" physical + // disks and the set of "observed" disks may be out-of-sync. + // + // This can happen when: + // - The sled boots, and the ledger of "control plane disks" is initially + // loaded. + // - A U.2 is added to the disk after initial boot. + // + // In both of these cases, if trust quorum hasn't been established, it's + // possible that the request to [Self::manage_disks] will need to retry. + SynchronizationNeeded, + + // This state indicates the key manager is ready, and the storage manager + // believes that the set of control plane disks is in-sync with the set of + // observed disks. + Synchronized, } #[derive(Debug)] -struct NewFilesystemRequest { +pub(crate) struct NewFilesystemRequest { dataset_id: Uuid, dataset_name: DatasetName, - responder: oneshot::Sender>, + responder: DebugIgnore>>, } #[derive(Debug)] -enum StorageRequest { - AddDisk(RawDisk), - RemoveDisk(RawDisk), - DisksChanged(HashSet), +pub(crate) enum StorageRequest { + // Requests to manage which devices the sled considers active. + // These are manipulated by hardware management. + DetectedRawDisk { + raw_disk: RawDisk, + tx: DebugIgnore>>, + }, + DetectedRawDiskRemoval { + raw_disk: RawDisk, + tx: DebugIgnore>>, + }, + DetectedRawDisksChanged { + raw_disks: HashSet, + tx: DebugIgnore>>, + }, + + // Requests to explicitly manage or stop managing a set of devices + OmicronPhysicalDisksEnsure { + config: OmicronPhysicalDisksConfig, + tx: DebugIgnore>>, + }, + + // Reads the last set of physical disks that were successfully ensured. + OmicronPhysicalDisksList { + tx: DebugIgnore< + oneshot::Sender>, + >, + }, + + // Requests the creation of a new dataset within a managed disk. NewFilesystem(NewFilesystemRequest), + KeyManagerReady, + /// This will always grab the latest state after any new updates, as it /// serializes through the `StorageManager` task after all prior requests. /// This serialization is particularly useful for tests. - GetLatestResources(oneshot::Sender), - - /// Get the internal task state of the manager - GetManagerState(oneshot::Sender), -} - -/// Data managed internally to the StorageManagerTask that can be useful -/// to clients for debugging purposes, and that isn't exposed in other ways. -#[derive(Debug, Clone)] -pub struct StorageManagerData { - pub state: StorageManagerState, - pub queued_u2_drives: HashSet, + GetLatestResources(DebugIgnore>), } /// A mechanism for interacting with the [`StorageManager`] #[derive(Clone)] pub struct StorageHandle { tx: mpsc::Sender, - resource_updates: watch::Receiver, + disk_updates: watch::Receiver, } impl StorageHandle { + pub(crate) fn new( + tx: mpsc::Sender, + disk_updates: watch::Receiver, + ) -> Self { + Self { tx, disk_updates } + } + /// Adds a disk and associated zpool to the storage manager. 
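`StorageHandle` and `StorageManager` communicate through a bounded mpsc queue of requests, each carrying a oneshot responder for the reply. The following is a minimal sketch of that request/response shape, assuming only the tokio crate; `Request`, `Handle`, and `run_manager` are simplified stand-ins, not the real types.

use tokio::sync::{mpsc, oneshot};

enum Request {
    Get { tx: oneshot::Sender<u64> },
}

struct Handle {
    tx: mpsc::Sender<Request>,
}

impl Handle {
    async fn get(&self) -> u64 {
        let (tx, rx) = oneshot::channel();
        self.tx.send(Request::Get { tx }).await.unwrap();
        rx.await.unwrap()
    }
}

async fn run_manager(mut rx: mpsc::Receiver<Request>) {
    let state = 42u64;
    while let Some(req) = rx.recv().await {
        match req {
            Request::Get { tx } => {
                // Ignore send errors: the caller may have dropped its receiver.
                let _ = tx.send(state);
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel(256); // bounded, analogous to QUEUE_SIZE
    let handle = Handle { tx };
    tokio::spawn(run_manager(rx));
    assert_eq!(handle.get().await, 42);
}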
- pub async fn upsert_disk(&self, disk: RawDisk) { - self.tx.send(StorageRequest::AddDisk(disk)).await.unwrap(); + /// + /// Returns a future which completes once the notification has been + /// processed. Awaiting this future is optional. + pub async fn detected_raw_disk( + &self, + raw_disk: RawDisk, + ) -> impl Future> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(StorageRequest::DetectedRawDisk { raw_disk, tx: tx.into() }) + .await + .unwrap(); + + rx.map(|result| result.unwrap()) } /// Removes a disk, if it's tracked by the storage manager, as well /// as any associated zpools. - pub async fn delete_disk(&self, disk: RawDisk) { - self.tx.send(StorageRequest::RemoveDisk(disk)).await.unwrap(); + /// + /// Returns a future which completes once the notification has been + /// processed. Awaiting this future is optional. + pub async fn detected_raw_disk_removal( + &self, + raw_disk: RawDisk, + ) -> impl Future> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(StorageRequest::DetectedRawDiskRemoval { + raw_disk, + tx: tx.into(), + }) + .await + .unwrap(); + + rx.map(|result| result.unwrap()) } /// Ensures that the storage manager tracks exactly the provided disks. /// - /// This acts similar to a batch [Self::upsert_disk] for all new disks, and - /// [Self::delete_disk] for all removed disks. + /// This acts similar to a batch [Self::detected_raw_disk] for all new disks, and + /// [Self::detected_raw_disk_removal] for all removed disks. /// /// If errors occur, an arbitrary "one" of them will be returned, but a /// best-effort attempt to add all disks will still be attempted. - pub async fn ensure_using_exactly_these_disks(&self, raw_disks: I) + /// + /// Returns a future which completes once the notification has been + /// processed. Awaiting this future is optional. + pub async fn ensure_using_exactly_these_disks( + &self, + raw_disks: I, + ) -> impl Future> where I: IntoIterator, { + let (tx, rx) = oneshot::channel(); self.tx - .send(StorageRequest::DisksChanged(raw_disks.into_iter().collect())) + .send(StorageRequest::DetectedRawDisksChanged { + raw_disks: raw_disks.into_iter().collect(), + tx: tx.into(), + }) .await .unwrap(); + rx.map(|result| result.unwrap()) + } + + pub async fn omicron_physical_disks_ensure( + &self, + config: OmicronPhysicalDisksConfig, + ) -> Result { + let (tx, rx) = oneshot::channel(); + self.tx + .send(StorageRequest::OmicronPhysicalDisksEnsure { + config, + tx: tx.into(), + }) + .await + .unwrap(); + + rx.await.unwrap() + } + + /// Reads the last value written to storage by + /// [Self::omicron_physical_disks_ensure]. + /// + /// This should be contrasted with both inventory and the result + /// of [Self::get_latest_disks] -- since this function focuses on + /// "Control Plane disks", it may return information about disks + /// that are no longer detected within the hardware of this sled. + pub async fn omicron_physical_disks_list( + &self, + ) -> Result { + let (tx, rx) = oneshot::channel(); + self.tx + .send(StorageRequest::OmicronPhysicalDisksList { tx: tx.into() }) + .await + .unwrap(); + + rx.await.unwrap() } /// Notify the [`StorageManager`] that the [`key_manager::KeyManager`] @@ -139,36 +265,35 @@ impl StorageHandle { /// Wait for a boot disk to be initialized pub async fn wait_for_boot_disk(&mut self) -> (DiskIdentity, ZpoolName) { + // We create a distinct receiver to avoid colliding with + // the receiver used by [Self::wait_for_changes]. 
+ let mut receiver = self.disk_updates.clone(); loop { - let resources = self.resource_updates.borrow_and_update(); + let resources = receiver.borrow_and_update(); if let Some((disk_id, zpool_name)) = resources.boot_disk() { return (disk_id, zpool_name); } drop(resources); // We panic if the sender is dropped, as this means // the StorageManager has gone away, which it should not do. - self.resource_updates.changed().await.unwrap(); + receiver.changed().await.unwrap(); } } /// Wait for any storage resource changes - pub async fn wait_for_changes(&mut self) -> StorageResources { - self.resource_updates.changed().await.unwrap(); - self.resource_updates.borrow_and_update().clone() + pub async fn wait_for_changes(&mut self) -> AllDisks { + self.disk_updates.changed().await.unwrap(); + self.disk_updates.borrow_and_update().clone() } - /// Retrieve the latest value of `StorageResources` from the + /// Retrieve the latest value of `AllDisks` from the /// `StorageManager` task. - pub async fn get_latest_resources(&self) -> StorageResources { - let (tx, rx) = oneshot::channel(); - self.tx.send(StorageRequest::GetLatestResources(tx)).await.unwrap(); - rx.await.unwrap() - } - - /// Return internal data useful for debugging and testing - pub async fn get_manager_state(&self) -> StorageManagerData { + pub async fn get_latest_disks(&self) -> AllDisks { let (tx, rx) = oneshot::channel(); - self.tx.send(StorageRequest::GetManagerState(tx)).await.unwrap(); + self.tx + .send(StorageRequest::GetLatestResources(tx.into())) + .await + .unwrap(); rx.await.unwrap() } @@ -178,112 +303,42 @@ impl StorageHandle { dataset_name: DatasetName, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); - let request = - NewFilesystemRequest { dataset_id, dataset_name, responder: tx }; + let request = NewFilesystemRequest { + dataset_id, + dataset_name, + responder: tx.into(), + }; self.tx.send(StorageRequest::NewFilesystem(request)).await.unwrap(); rx.await.unwrap() } } - -// Some sled-agent tests cannot currently use the real StorageManager -// and want to fake the entire behavior, but still have access to the -// `StorageResources`. We allow this via use of the `FakeStorageManager` -// that will respond to real storage requests from a real `StorageHandle`. 
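`wait_for_boot_disk` and `wait_for_changes` are built on a `tokio::sync::watch` channel: check the current value with `borrow_and_update`, and await `changed` when it is not ready yet. A standalone sketch of that loop, with an `Option<String>` payload standing in for the boot-disk information:

use tokio::sync::watch;

async fn wait_for_some(mut rx: watch::Receiver<Option<String>>) -> String {
    loop {
        if let Some(value) = rx.borrow_and_update().clone() {
            return value;
        }
        // Panics if the sender is dropped, mirroring the manager's behavior.
        rx.changed().await.unwrap();
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = watch::channel(None);
    let waiter = tokio::spawn(wait_for_some(rx));
    tx.send_replace(Some("boot-disk".to_string()));
    assert_eq!(waiter.await.unwrap(), "boot-disk");
}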
-#[cfg(feature = "testing")] -pub struct FakeStorageManager { - rx: mpsc::Receiver, - resources: StorageResources, - resource_updates: watch::Sender, -} - -#[cfg(feature = "testing")] -impl FakeStorageManager { - pub fn new() -> (Self, StorageHandle) { - let (tx, rx) = mpsc::channel(QUEUE_SIZE); - let resources = StorageResources::default(); - let (update_tx, update_rx) = watch::channel(resources.clone()); - ( - Self { rx, resources, resource_updates: update_tx }, - StorageHandle { tx, resource_updates: update_rx }, - ) - } - - /// Run the main receive loop of the `FakeStorageManager` - /// - /// This should be spawned into a tokio task - pub async fn run(mut self) { - loop { - match self.rx.recv().await { - Some(StorageRequest::AddDisk(raw_disk)) => { - if self.add_disk(raw_disk).disk_inserted() { - self.resource_updates - .send_replace(self.resources.clone()); - } - } - Some(StorageRequest::GetLatestResources(tx)) => { - let _ = tx.send(self.resources.clone()); - } - Some(_) => { - unreachable!(); - } - None => break, - } - } - } - - // Add a disk to `StorageResources` if it is new and return true if so - fn add_disk(&mut self, raw_disk: RawDisk) -> AddDiskResult { - let disk = match raw_disk { - RawDisk::Real(_) => { - panic!( - "Only synthetic disks can be used with `FakeStorageManager`" - ); - } - RawDisk::Synthetic(synthetic_disk) => { - Disk::Synthetic(synthetic_disk) - } - }; - self.resources.insert_fake_disk(disk) - } -} - /// The storage manager responsible for the state of the storage /// on a sled. The storage manager runs in its own task and is interacted /// with via the [`StorageHandle`]. pub struct StorageManager { log: Logger, state: StorageManagerState, - // Used to find the capacity of the channel for tracking purposes - tx: mpsc::Sender, rx: mpsc::Receiver, resources: StorageResources, - queued_u2_drives: HashSet, - key_requester: StorageKeyRequester, - resource_updates: watch::Sender, - last_logged_capacity: usize, } impl StorageManager { pub fn new( log: &Logger, + mount_config: MountConfig, key_requester: StorageKeyRequester, ) -> (StorageManager, StorageHandle) { let (tx, rx) = mpsc::channel(QUEUE_SIZE); - let resources = StorageResources::default(); - let (update_tx, update_rx) = watch::channel(resources.clone()); + let resources = StorageResources::new(log, mount_config, key_requester); + let disk_updates = resources.watch_disks(); ( StorageManager { log: log.new(o!("component" => "StorageManager")), state: StorageManagerState::WaitingForKeyManager, - tx: tx.clone(), rx, resources, - queued_u2_drives: HashSet::new(), - key_requester, - resource_updates: update_tx, - last_logged_capacity: QUEUE_SIZE, }, - StorageHandle { tx, resource_updates: update_rx }, + StorageHandle::new(tx, disk_updates), ) } @@ -291,22 +346,29 @@ impl StorageManager { /// /// This should be spawned into a tokio task pub async fn run(mut self) { + let mut interval = interval(SYNCHRONIZE_INTERVAL); + interval.set_missed_tick_behavior(MissedTickBehavior::Delay); + tokio::pin!(interval); + loop { - const QUEUED_DISK_RETRY_TIMEOUT: Duration = Duration::from_secs(10); - let mut interval = interval(QUEUED_DISK_RETRY_TIMEOUT); - interval.set_missed_tick_behavior(MissedTickBehavior::Delay); tokio::select! { - res = self.step() => { - if let Err(e) = res { + Some(req) = self.rx.recv() => { + // It's critical that we don't "step" directly in the select + // branch, as that could cancel an ongoing request if it + // fires while a request is being processed. 
+ // + // Instead, if we receive any request, we stop + // "select!"-ing and fully process the request before + // continuing. + if let Err(e) = self.step(req).await { warn!(self.log, "{e}"); } } _ = interval.tick(), - if self.state == StorageManagerState::QueueingDisks => + if self.state == StorageManagerState::SynchronizationNeeded => { - if self.add_queued_disks().await { - let _ = self.resource_updates.send_replace(self.resources.clone()); - } + info!(self.log, "automatically managing disks"); + self.manage_disks().await; } } } @@ -315,191 +377,387 @@ impl StorageManager { /// Process the next event /// /// This is useful for testing/debugging - pub async fn step(&mut self) -> Result<(), Error> { - const CAPACITY_LOG_THRESHOLD: usize = 10; - // We check the capacity and log it every time it changes by at least 10 - // entries in either direction. - let current = self.tx.capacity(); - if self.last_logged_capacity.saturating_sub(current) - >= CAPACITY_LOG_THRESHOLD - { - info!( - self.log, - "Channel capacity decreased"; - "previous" => ?self.last_logged_capacity, - "current" => ?current - ); - self.last_logged_capacity = current; - } else if current.saturating_sub(self.last_logged_capacity) - >= CAPACITY_LOG_THRESHOLD - { - info!( - self.log, - "Channel capacity increased"; - "previous" => ?self.last_logged_capacity, - "current" => ?current - ); - self.last_logged_capacity = current; - } - // The sending side never disappears because we hold a copy - let req = self.rx.recv().await.unwrap(); + async fn step(&mut self, req: StorageRequest) -> Result<(), Error> { info!(self.log, "Received {:?}", req); - let should_send_updates = match req { - StorageRequest::AddDisk(raw_disk) => { - self.add_disk(raw_disk).await?.disk_inserted() + + match req { + StorageRequest::DetectedRawDisk { raw_disk, tx } => { + let result = self.detected_raw_disk(raw_disk).await; + if let Err(ref err) = &result { + warn!(self.log, "Failed to add raw disk"; "err" => ?err); + } + let _ = tx.0.send(result); + } + StorageRequest::DetectedRawDiskRemoval { raw_disk, tx } => { + self.detected_raw_disk_removal(raw_disk); + let _ = tx.0.send(Ok(())); + } + StorageRequest::DetectedRawDisksChanged { raw_disks, tx } => { + self.ensure_using_exactly_these_disks(raw_disks).await; + let _ = tx.0.send(Ok(())); + } + StorageRequest::OmicronPhysicalDisksEnsure { config, tx } => { + let _ = + tx.0.send(self.omicron_physical_disks_ensure(config).await); } - StorageRequest::RemoveDisk(raw_disk) => self.remove_disk(raw_disk), - StorageRequest::DisksChanged(raw_disks) => { - self.ensure_using_exactly_these_disks(raw_disks).await + StorageRequest::OmicronPhysicalDisksList { tx } => { + let _ = tx.0.send(self.omicron_physical_disks_list().await); } StorageRequest::NewFilesystem(request) => { let result = self.add_dataset(&request).await; - if result.is_err() { - warn!(self.log, "{result:?}"); + if let Err(ref err) = &result { + warn!(self.log, "Failed to add dataset"; "err" => ?err); } - let _ = request.responder.send(result); - false + let _ = request.responder.0.send(result); } StorageRequest::KeyManagerReady => { - self.state = StorageManagerState::Normal; - self.add_queued_disks().await + self.key_manager_ready().await?; } StorageRequest::GetLatestResources(tx) => { - let _ = tx.send(self.resources.clone()); - false - } - StorageRequest::GetManagerState(tx) => { - let _ = tx.send(StorageManagerData { - state: self.state, - queued_u2_drives: self.queued_u2_drives.clone(), - }); - false + let _ = tx.0.send(self.resources.disks().clone()); 
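The run-loop comment above is about cancellation safety: the `select!` arm only receives a request, and the request is then processed outside the macro so a timer tick cannot cancel in-flight work. A minimal sketch of that loop shape, assuming only tokio; the request type and the `needs_sync` flag (standing in for the "synchronization needed" state) are illustrative.

use tokio::sync::mpsc;
use tokio::time::{interval, Duration, MissedTickBehavior};

async fn run(mut rx: mpsc::Receiver<String>) {
    let mut needs_sync = false;
    let mut ticker = interval(Duration::from_secs(10));
    ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);

    loop {
        tokio::select! {
            maybe_req = rx.recv() => {
                let Some(req) = maybe_req else { return };
                // Fully handle the request before selecting again, so the
                // periodic branch cannot interrupt it part-way through.
                println!("handling {req}");
                needs_sync = true;
            }
            _ = ticker.tick(), if needs_sync => {
                println!("periodic synchronization");
                needs_sync = false;
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel(16);
    let task = tokio::spawn(run(rx));
    tx.send("request".to_string()).await.unwrap();
    drop(tx); // closing the channel lets the loop exit
    task.await.unwrap();
}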
} }; - if should_send_updates { - let _ = self.resource_updates.send_replace(self.resources.clone()); - } - Ok(()) } - // Loop through all queued disks inserting them into [`StorageResources`] - // unless we hit a transient error. If we hit a transient error, we return - // and wait for the next retry window to re-call this method. If we hit a - // permanent error we log it, but we continue inserting queued disks. - // - // Return true if updates should be sent to watchers, false otherwise - async fn add_queued_disks(&mut self) -> bool { + async fn manage_disks(&mut self) { + let result = self.resources.synchronize_disk_management().await; + + if result.has_retryable_error() { + // This is logged as "info", not "warn", as it can happen before + // trust quorum has been established. + info!( + self.log, + "Failed to synchronize disks, but will retry"; + "result" => ?result, + ); + return; + } + + self.state = StorageManagerState::Synchronized; + + if result.has_error() { + warn!( + self.log, + "Failed to synchronize disks due to permanant error"; + "result" => #?result, + ); + return; + } + info!( self.log, - "Attempting to add queued disks"; - "num_disks" => %self.queued_u2_drives.len() + "Successfully synchronized disks without error"; + "result" => ?result, ); - self.state = StorageManagerState::Normal; - - let mut send_updates = false; - - // Disks that should be requeued. - let queued = self.queued_u2_drives.clone(); - let mut to_dequeue = HashSet::new(); - for disk in queued.iter() { - if self.state == StorageManagerState::QueueingDisks { - // We hit a transient error in a prior iteration. - break; - } else { - match self.add_u2_disk(disk.clone()).await { - Err(_) => { - // This is an unrecoverable error, so we don't queue the - // disk again. - to_dequeue.insert(disk); - } - Ok(AddDiskResult::DiskInserted) => { - send_updates = true; - to_dequeue.insert(disk); - } - Ok(AddDiskResult::DiskAlreadyInserted) => { - to_dequeue.insert(disk); - } - Ok(AddDiskResult::DiskQueued) => (), + } + + async fn all_omicron_disk_ledgers(&self) -> Vec { + self.resources + .disks() + .all_m2_mountpoints(CONFIG_DATASET) + .into_iter() + .map(|p| p.join(DISKS_LEDGER_FILENAME)) + .collect() + } + + // Manages a newly detected disk that has been attached to this sled. + // + // For U.2s: we update our inventory. + // For M.2s: we do the same, but also begin "managing" the disk so + // it can automatically be in-use. + async fn detected_raw_disk( + &mut self, + raw_disk: RawDisk, + ) -> Result<(), Error> { + // In other words, the decision of "should we use this U.2" requires + // coordination with the control plane at large. + let needs_synchronization = + matches!(raw_disk.variant(), DiskVariant::U2); + self.resources.insert_disk(raw_disk).await?; + + if needs_synchronization { + match self.state { + // We'll synchronize once the key manager comes up. + StorageManagerState::WaitingForKeyManager => (), + // In these cases, we'd benefit from another call + // to "manage_disks" from StorageManager task runner. 
+ StorageManagerState::SynchronizationNeeded + | StorageManagerState::Synchronized => { + self.state = StorageManagerState::SynchronizationNeeded; + + // TODO(https://github.com/oxidecomputer/omicron/issues/5328): + // We can remove this call once we've migrated everyone to a + // world that uses the ledger -- normally we'd only need to + // load the storage config once, when we know that the key + // manager is ready, but without a ledger, we may need to + // retry auto-management when any new U.2 appears. + self.load_storage_config().await?; } } } - // Dequeue any inserted disks - self.queued_u2_drives.retain(|k| !to_dequeue.contains(k)); - send_updates + + Ok(()) } - // Add a disk to `StorageResources` if it is new, - // updated, or its pool has been updated as determined by - // [`$crate::resources::StorageResources::insert_disk`] and we decide not to - // queue the disk for later addition. - async fn add_disk( - &mut self, - raw_disk: RawDisk, - ) -> Result { - match raw_disk.variant() { - DiskVariant::U2 => self.add_u2_disk(raw_disk).await, - DiskVariant::M2 => self.add_m2_disk(raw_disk).await, + async fn load_ledger(&self) -> Option> { + let ledger_paths = self.all_omicron_disk_ledgers().await; + let log = self.log.new(o!("request" => "load_ledger")); + let maybe_ledger = Ledger::::new( + &log, + ledger_paths.clone(), + ) + .await; + + match maybe_ledger { + Some(ledger) => { + info!(self.log, "Ledger of physical disks exists"); + return Some(ledger); + } + None => { + info!(self.log, "No ledger of physical disks exists"); + return None; + } } } - // Add a U.2 disk to [`StorageResources`] or queue it to be added later - async fn add_u2_disk( + async fn key_manager_ready(&mut self) -> Result<(), Error> { + self.load_storage_config().await + } + + async fn load_storage_config(&mut self) -> Result<(), Error> { + info!(self.log, "Loading storage config"); + // Set the state to "synchronization needed", to force us to try to + // asynchronously ensure that disks are ready. + self.state = StorageManagerState::SynchronizationNeeded; + + // Now that we're actually able to unpack U.2s, attempt to load the + // set of disks which we previously stored in the ledger, if one + // existed. + let ledger = self.load_ledger().await; + if let Some(ledger) = ledger { + info!(self.log, "Setting StorageResources state to match ledger"); + + // Identify which disks should be managed by the control + // plane, and adopt all requested disks into the control plane + // in a background task (see: [Self::manage_disks]). + self.resources.set_config(&ledger.data().disks); + } else { + info!(self.log, "KeyManager ready, but no ledger detected"); + let mut synthetic_config = + self.resources.get_config().values().cloned().collect(); + // TODO(https://github.com/oxidecomputer/omicron/issues/5328): Once + // we are confident that we have migrated to a world where this + // ledger is universally used, we should remove the following + // kludge. The sled agent should not need to "self-manage" anything! + let changed = self + .self_manage_disks_with_zpools(&mut synthetic_config) + .await?; + if !changed { + info!(self.log, "No disks to be automatically managed"); + return Ok(()); + } + info!(self.log, "auto-managed disks"; "count" => synthetic_config.len()); + self.resources.set_config(&synthetic_config); + } + + Ok(()) + } + + // NOTE: What follows is an exceptional case: one where we have + // no record of "Control Plane Physical Disks", but we have zpools + // on our U.2s, and we want to use them regardless. 
+ // + // THIS WOULD NORMALLY BE INCORRECT BEHAVIOR. In the future, these + // zpools will not be "automatically imported", and instead, we'll + // let Nexus decide whether or not to reformat the disks. + // + // However, because we are transitioning from "the set of disks / + // zpools is implicit" to a world where that set is explicit, this + // is a necessary transitional tool. + // + // Returns "true" if the synthetic_config has changed. + async fn self_manage_disks_with_zpools( &mut self, - raw_disk: RawDisk, - ) -> Result { - if self.state != StorageManagerState::Normal { - self.queued_u2_drives.insert(raw_disk); - return Ok(AddDiskResult::DiskQueued); + synthetic_config: &mut Vec, + ) -> Result { + let mut changed = false; + for (identity, disk) in self.resources.disks().values.iter() { + match disk { + crate::resources::ManagedDisk::Unmanaged(raw) => { + let zpool_path = match raw.u2_zpool_path() { + Ok(zpool_path) => zpool_path, + Err(err) => { + info!(self.log, "Cannot find zpool path"; "identity" => ?identity, "err" => ?err); + continue; + } + }; + + let zpool_name = + match sled_hardware::disk::check_if_zpool_exists( + &zpool_path, + ) { + Ok(zpool_name) => zpool_name, + Err(err) => { + info!(self.log, "Zpool does not exist"; "identity" => ?identity, "err" => ?err); + continue; + } + }; + + info!(self.log, "Found existing zpool on device without ledger"; + "identity" => ?identity, + "zpool" => ?zpool_name); + + // We found an unmanaged disk with a zpool, even though + // we have no prior record of a ledger of control-plane + // disks. + synthetic_config.push( + // These disks don't have a control-plane UUID -- + // report "nil" until they're overwritten with real + // values. + OmicronPhysicalDiskConfig { + identity: identity.clone(), + id: Uuid::nil(), + pool_id: zpool_name.id(), + }, + ); + changed = true; + } + _ => continue, + } } + Ok(changed) + } - match Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) - .await - { - Ok(disk) => self.resources.insert_disk(disk), - Err(err @ DiskError::Dataset(DatasetError::KeyManager(_))) => { - warn!( - self.log, - "Transient error: {err}: queuing disk"; - "disk_id" => ?raw_disk.identity() + // Makes an U.2 disk managed by the control plane within [`StorageResources`]. + async fn omicron_physical_disks_ensure( + &mut self, + mut config: OmicronPhysicalDisksConfig, + ) -> Result { + let log = + self.log.new(o!("request" => "omicron_physical_disks_ensure")); + + // Ensure that the set of disks arrives in a consistent order. + config + .disks + .sort_by(|a, b| a.identity.partial_cmp(&b.identity).unwrap()); + + // We rely on the schema being stable across reboots -- observe + // "test_omicron_physical_disks_schema" below for that property + // guarantee. 
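The comment above relies on "test_omicron_physical_disks_schema" (not shown in this hunk) to keep the ledger's JSON schema stable across reboots. As a rough illustration of what such a snapshot-style check can look like, here is a hedged sketch using schemars and serde_json on a simplified config type; the struct, test name, and snapshot path are hypothetical, not the repository's actual test.

use schemars::{schema_for, JsonSchema};
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, JsonSchema)]
struct DisksConfig {
    generation: u64,
    disks: Vec<String>,
}

#[test]
fn disks_config_schema_is_stable() {
    // Generate the current schema for the ledgered type...
    let schema = schema_for!(DisksConfig);
    let current = serde_json::to_string_pretty(&schema).unwrap();
    // ...and compare it against a checked-in snapshot (hypothetical path),
    // so accidental schema changes fail the test.
    let expected = std::fs::read_to_string("schema/disks-config.json").unwrap();
    assert_eq!(current.trim(), expected.trim(), "schema changed unexpectedly");
}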
+ let ledger_paths = self.all_omicron_disk_ledgers().await; + let maybe_ledger = Ledger::::new( + &log, + ledger_paths.clone(), + ) + .await; + + let mut ledger = match maybe_ledger { + Some(ledger) => { + info!( + log, + "Comparing 'requested disks' to ledger on internal storage" ); - self.queued_u2_drives.insert(raw_disk); - self.state = StorageManagerState::QueueingDisks; - Ok(AddDiskResult::DiskQueued) + let ledger_data = ledger.data(); + if config.generation < ledger_data.generation { + warn!( + log, + "Request looks out-of-date compared to prior request" + ); + return Err(Error::PhysicalDiskConfigurationOutdated { + requested: config.generation, + current: ledger_data.generation, + }); + } + + // TODO: If the generation is equal, check that the values are + // also equal. + + info!(log, "Request looks newer than prior requests"); + ledger } - Err(err) => { - error!( - self.log, - "Persistent error:not queueing disk"; - "err" => ?err, - "disk_id" => ?raw_disk.identity() - ); - Err(err.into()) + None => { + info!(log, "No previously-stored 'requested disks', creating new ledger"); + Ledger::::new_with( + &log, + ledger_paths.clone(), + OmicronPhysicalDisksConfig::new(), + ) } + }; + + let result = + self.omicron_physical_disks_ensure_internal(&log, &config).await?; + + let ledger_data = ledger.data_mut(); + if *ledger_data == config { + return Ok(result); } + *ledger_data = config; + ledger.commit().await?; + + Ok(result) } - // Add a U.2 disk to [`StorageResources`] if new and return `Ok(true)` if so - // + // Updates [StorageResources] to manage the disks requested by `config`, if + // those disks exist. // - // We never queue M.2 drives, as they don't rely on [`KeyManager`] based - // encryption - async fn add_m2_disk( + // Makes no attempts to manipulate the ledger storage. + async fn omicron_physical_disks_ensure_internal( &mut self, - raw_disk: RawDisk, - ) -> Result { - let disk = - Disk::new(&self.log, raw_disk.clone(), Some(&self.key_requester)) - .await?; - self.resources.insert_disk(disk) + log: &Logger, + config: &OmicronPhysicalDisksConfig, + ) -> Result { + if self.state == StorageManagerState::WaitingForKeyManager { + warn!( + log, + "Not ready to manage storage yet (waiting for the key manager)" + ); + return Err(Error::KeyManagerNotReady); + } + + // Identify which disks should be managed by the control + // plane, and adopt all requested disks into the control plane. + self.resources.set_config(&config.disks); + + // Actually try to "manage" those disks, which may involve formatting + // zpools and conforming partitions to those expected by the control + // plane. + Ok(self.resources.synchronize_disk_management().await) + } + + async fn omicron_physical_disks_list( + &mut self, + ) -> Result { + let log = self.log.new(o!("request" => "omicron_physical_disks_list")); + + // TODO(https://github.com/oxidecomputer/omicron/issues/5328): This + // could just use "resources.get_config", but that'll be more feasible + // once we don't have to cons up a fake "Generation" number. 
+ + let ledger_paths = self.all_omicron_disk_ledgers().await; + let maybe_ledger = Ledger::::new( + &log, + ledger_paths.clone(), + ) + .await; + + match maybe_ledger { + Some(ledger) => { + info!(log, "Found ledger on internal storage"); + return Ok(ledger.data().clone()); + } + None => { + info!(log, "No ledger detected on internal storage"); + return Err(Error::LedgerNotFound); + } + } } // Delete a real disk and return `true` if the disk was actually removed - fn remove_disk(&mut self, raw_disk: RawDisk) -> bool { - // If the disk is a U.2, we want to first delete it from any queued disks - let _ = self.queued_u2_drives.remove(&raw_disk); - self.resources.remove_disk(raw_disk.identity()) + fn detected_raw_disk_removal(&mut self, raw_disk: RawDisk) { + self.resources.remove_disk(raw_disk.identity()); } // Find all disks to remove that are not in raw_disks and remove them. Then @@ -509,13 +767,7 @@ impl StorageManager { async fn ensure_using_exactly_these_disks( &mut self, raw_disks: HashSet, - ) -> bool { - let mut should_update = false; - - // Clear out any queued U.2 disks that are real. - // We keep synthetic disks, as they are only added once. - self.queued_u2_drives.retain(|d| d.is_synthetic()); - + ) { let all_ids: HashSet<_> = raw_disks.iter().map(|d| d.identity()).collect(); @@ -523,8 +775,8 @@ impl StorageManager { let to_remove: Vec = self .resources .disks() - .keys() - .filter_map(|id| { + .iter_all() + .filter_map(|(id, _variant, _slot)| { if !all_ids.contains(id) { Some(id.clone()) } else { @@ -534,27 +786,19 @@ impl StorageManager { .collect(); for id in to_remove { - if self.resources.remove_disk(&id) { - should_update = true; - } + self.resources.remove_disk(&id); } for raw_disk in raw_disks { let disk_id = raw_disk.identity().clone(); - match self.add_disk(raw_disk).await { - Ok(AddDiskResult::DiskInserted) => should_update = true, - Ok(_) => (), - Err(err) => { - warn!( - self.log, - "Failed to add disk to storage resources: {err}"; - "disk_id" => ?disk_id - ); - } + if let Err(err) = self.detected_raw_disk(raw_disk).await { + warn!( + self.log, + "Failed to add disk to storage resources: {err}"; + "disk_id" => ?disk_id + ); } } - - should_update } // Attempts to add a dataset within a zpool, according to `request`. 
@@ -562,15 +806,15 @@ impl StorageManager { &mut self, request: &NewFilesystemRequest, ) -> Result<(), Error> { - info!(self.log, "add_dataset: {:?}", request); + info!(self.log, "add_dataset"; "request" => ?request); if !self .resources .disks() - .values() - .any(|(_, pool)| &pool.name == request.dataset_name.pool()) + .iter_managed() + .any(|(_, disk)| disk.zpool_name() == request.dataset_name.pool()) { return Err(Error::ZpoolNotFound(format!( - "{}, looked up while trying to add dataset", + "{}", request.dataset_name.pool(), ))); } @@ -617,271 +861,313 @@ impl StorageManager { #[cfg(all(test, target_os = "illumos"))] mod tests { use crate::dataset::DatasetKind; - use crate::disk::SyntheticDisk; + use crate::disk::RawSyntheticDisk; + use crate::manager_test_harness::StorageManagerTestHarness; + use crate::resources::DiskManagementError; use super::*; - use async_trait::async_trait; - use camino_tempfile::tempdir; - use illumos_utils::zpool::Zpool; - use key_manager::{ - KeyManager, SecretRetriever, SecretRetrieverError, SecretState, - VersionedIkm, - }; + use camino_tempfile::tempdir_in; + use omicron_common::api::external::Generation; + use omicron_common::ledger; use omicron_test_utils::dev::test_setup_log; - use std::sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }; + use std::sync::atomic::Ordering; use uuid::Uuid; - /// A [`key-manager::SecretRetriever`] that only returns hardcoded IKM for - /// epoch 0 - #[derive(Debug, Default)] - struct HardcodedSecretRetriever { - inject_error: Arc, - } + // A helper struct to advance time. + struct TimeTravel {} - #[async_trait] - impl SecretRetriever for HardcodedSecretRetriever { - async fn get_latest( - &self, - ) -> Result { - if self.inject_error.load(Ordering::SeqCst) { - return Err(SecretRetrieverError::Bootstore( - "Timeout".to_string(), - )); - } - - let epoch = 0; - let salt = [0u8; 32]; - let secret = [0x1d; 32]; - - Ok(VersionedIkm::new(epoch, salt, &secret)) + impl TimeTravel { + pub fn new() -> Self { + tokio::time::pause(); + Self {} } - /// We don't plan to do any key rotation before trust quorum is ready - async fn get( - &self, - epoch: u64, - ) -> Result { - if self.inject_error.load(Ordering::SeqCst) { - return Err(SecretRetrieverError::Bootstore( - "Timeout".to_string(), - )); - } - if epoch != 0 { - return Err(SecretRetrieverError::NoSuchEpoch(epoch)); - } - Ok(SecretState::Current(self.get_latest().await?)) + pub async fn enough_to_start_synchronization(&self) { + tokio::time::advance(SYNCHRONIZE_INTERVAL).await; } } #[tokio::test] - async fn add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued() { + async fn add_control_plane_disks_requires_keymanager() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); - let logctx = test_setup_log( - "add_u2_disk_while_not_in_normal_stage_and_ensure_it_gets_queued", - ); - let (mut _key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); - let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let raw_disk: RawDisk = SyntheticDisk::new(zpool_name, 0).into(); - assert_eq!(StorageManagerState::WaitingForKeyManager, manager.state); - manager.add_u2_disk(raw_disk.clone()).await.unwrap(); - assert!(manager.resources.all_u2_zpools().is_empty()); - assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk.clone()])); - - // Check other non-normal stages and ensure disk gets queued - manager.queued_u2_drives.clear(); - manager.state = 
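The `TimeTravel` helper above pauses tokio's clock and advances it by `SYNCHRONIZE_INTERVAL` so the periodic synchronization can be triggered deterministically in tests. A standalone sketch of the same pause/advance technique, assuming tokio's `test-util` feature and the futures crate; the ten-second duration is arbitrary.

use std::time::Duration;
use tokio::time::{advance, sleep};

#[tokio::test(start_paused = true)]
async fn paused_clock_fires_timers_when_advanced() {
    // With the clock paused, a ten-second sleep stays pending.
    let sleeper = sleep(Duration::from_secs(10));
    tokio::pin!(sleeper);
    assert!(futures::poll!(sleeper.as_mut()).is_pending());

    // Advancing the paused clock past the deadline completes it
    // deterministically, without waiting in real time.
    advance(Duration::from_secs(10)).await;
    assert!(futures::poll!(sleeper.as_mut()).is_ready());
}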
StorageManagerState::QueueingDisks; - manager.add_u2_disk(raw_disk.clone()).await.unwrap(); - assert!(manager.resources.all_u2_zpools().is_empty()); - assert_eq!(manager.queued_u2_drives, HashSet::from([raw_disk])); + let logctx = + test_setup_log("add_control_plane_disks_requires_keymanager"); + + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + let raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + + // These disks should exist, but only the M.2 should have a zpool. + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(2, all_disks.iter_all().collect::>().len()); + assert_eq!(0, all_disks.all_u2_zpools().len()); + assert_eq!(1, all_disks.all_m2_zpools().len()); + + // If we try to "act like nexus" and request a control-plane disk, we'll + // see a failure because the key manager isn't ready. + let config = harness.make_config(1, &raw_disks); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await; + assert!(matches!(result, Err(Error::KeyManagerNotReady))); + + // If we make the key manager ready and try again, it'll work. + harness.handle().key_manager_ready().await; + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring disks should work after key manager is ready"); + assert!(!result.has_error(), "{:?}", result); + + // If we look at the disks again, we'll now see one U.2 zpool. + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(2, all_disks.iter_all().collect::>().len()); + assert_eq!(1, all_disks.all_u2_zpools().len()); + assert_eq!(1, all_disks.all_m2_zpools().len()); + + harness.cleanup().await; logctx.cleanup_successful(); } #[tokio::test] - async fn ensure_u2_gets_added_to_resources() { - illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); - let logctx = test_setup_log("ensure_u2_gets_added_to_resources"); - let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (mut manager, _) = StorageManager::new(&logctx.log, key_requester); - let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let dir = tempdir().unwrap(); - let disk = - SyntheticDisk::create_zpool(dir.path(), &zpool_name, 0).into(); - - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); - - // Set the stage to pretend we've progressed enough to have a key_manager available. - manager.state = StorageManagerState::Normal; - manager.add_u2_disk(disk).await.unwrap(); - assert_eq!(manager.resources.all_u2_zpools().len(), 1); - Zpool::destroy(&zpool_name).unwrap(); + async fn ledger_writes_require_at_least_one_m2() { + let logctx = test_setup_log("ledger_writes_require_at_least_one_m2"); + + // Create a single U.2 under test, with a ready-to-go key manager. + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + let raw_disks = harness.add_vdevs(&["u2_under_test.vdev"]).await; + harness.handle().key_manager_ready().await; + let config = harness.make_config(1, &raw_disks); + + // Attempting to adopt this U.2 fails (we don't have anywhere to put the + // ledger). + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await; + assert!( + matches!( + result, + Err(Error::Ledger(ledger::Error::FailedToWrite { .. })) + ), + "Saw unexpected result: {:?}", + result + ); + + // Add an M.2 which can store the ledger. 
+ let _raw_disks = + harness.add_vdevs(&["m2_finally_showed_up.vdev"]).await; + harness.handle_mut().wait_for_boot_disk().await; + + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("After adding an M.2, the ledger write should have worked"); + assert!(!result.has_error(), "{:?}", result); + + // Wait for the add disk notification + let tt = TimeTravel::new(); + tt.enough_to_start_synchronization().await; + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(all_disks.all_u2_zpools().len(), 1); + assert_eq!(all_disks.all_m2_zpools().len(), 1); + + harness.cleanup().await; logctx.cleanup_successful(); } #[tokio::test] - async fn wait_for_bootdisk() { + async fn add_raw_u2_does_not_create_zpool() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); - let logctx = test_setup_log("wait_for_bootdisk"); - let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (manager, mut handle) = - StorageManager::new(&logctx.log, key_requester); - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); - - // Spawn the storage manager as done by sled-agent - tokio::spawn(async move { - manager.run().await; - }); - - // Create a synthetic internal disk - let zpool_name = ZpoolName::new_internal(Uuid::new_v4()); - let dir = tempdir().unwrap(); - let disk = - SyntheticDisk::create_zpool(dir.path(), &zpool_name, 0).into(); - - handle.upsert_disk(disk).await; - handle.wait_for_boot_disk().await; - Zpool::destroy(&zpool_name).unwrap(); + let logctx = test_setup_log("add_raw_u2_does_not_create_zpool"); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + harness.handle().key_manager_ready().await; + + // Add a representative scenario for a small sled: a U.2 and M.2. + let _raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + + // This disks should exist, but only the M.2 should have a zpool. + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(2, all_disks.iter_all().collect::>().len()); + assert_eq!(0, all_disks.all_u2_zpools().len()); + assert_eq!(1, all_disks.all_m2_zpools().len()); + + harness.cleanup().await; logctx.cleanup_successful(); } #[tokio::test] - async fn queued_disks_get_added_as_resources() { + async fn wait_for_boot_disk() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); - let logctx = test_setup_log("queued_disks_get_added_as_resources"); - let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (manager, handle) = StorageManager::new(&logctx.log, key_requester); + let logctx = test_setup_log("wait_for_boot_disk"); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + let _raw_disks = harness.add_vdevs(&["u2_under_test.vdev"]).await; + + // When we wait for changes, we can see the U.2 being added, but no boot + // disk. + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(1, all_disks.iter_all().collect::>().len()); + assert!(all_disks.boot_disk().is_none()); + + // Waiting for the boot disk should time out. 
+ assert!(tokio::time::timeout( + tokio::time::Duration::from_millis(10), + harness.handle_mut().wait_for_boot_disk(), + ) + .await + .is_err()); - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); + // Now we add a boot disk. + let boot_disk = harness.add_vdevs(&["m2_under_test.vdev"]).await; - // Spawn the storage manager as done by sled-agent - tokio::spawn(async move { - manager.run().await; - }); + // It shows up through the general "wait for changes" API. + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(2, all_disks.iter_all().collect::>().len()); + assert!(all_disks.boot_disk().is_some()); - // Queue up a disks, as we haven't told the `StorageManager` that - // the `KeyManager` is ready yet. - let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let dir = tempdir().unwrap(); - let disk = - SyntheticDisk::create_zpool(dir.path(), &zpool_name, 0).into(); - handle.upsert_disk(disk).await; - let resources = handle.get_latest_resources().await; - assert!(resources.all_u2_zpools().is_empty()); - - // Now inform the storage manager that the key manager is ready - // The queued disk should be successfully added - handle.key_manager_ready().await; - let resources = handle.get_latest_resources().await; - assert_eq!(resources.all_u2_zpools().len(), 1); - Zpool::destroy(&zpool_name).unwrap(); + // We can wait for, and see, the boot disk. + let (id, _) = harness.handle_mut().wait_for_boot_disk().await; + assert_eq!(&id, boot_disk[0].identity()); + + // We can keep calling this function without blocking. + let (id, _) = harness.handle_mut().wait_for_boot_disk().await; + assert_eq!(&id, boot_disk[0].identity()); + + harness.cleanup().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn disks_automatically_managed_after_key_manager_ready() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log( + "disks_automatically_managed_after_key_manager_ready", + ); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + + // Boot normally, add an M.2 and a U.2, and let them + // create pools. + let raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + harness.handle().key_manager_ready().await; + let config = harness.make_config(1, &raw_disks); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .unwrap(); + assert!(!result.has_error(), "{:?}", result); + + // Both pools exist + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(2, all_disks.iter_all().collect::>().len()); + assert_eq!(1, all_disks.all_u2_zpools().len()); + assert_eq!(1, all_disks.all_m2_zpools().len()); + + // "reboot" the storage manager, and let it see the disks before + // the key manager is ready. + let mut harness = harness.reboot(&logctx.log).await; + + // Both disks exist, but the U.2's pool is not yet accessible. + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(2, all_disks.iter_all().collect::>().len()); + assert_eq!(0, all_disks.all_u2_zpools().len()); + assert_eq!(1, all_disks.all_m2_zpools().len()); + + // Mark the key manaager ready. This should eventually lead to the + // U.2 being managed, since it exists in the M.2 ledger. 
+ harness.handle().key_manager_ready().await; + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(1, all_disks.all_u2_zpools().len()); + + harness.cleanup().await; logctx.cleanup_successful(); } - /// For this test, we are going to step through the msg recv loop directly - /// without running the `StorageManager` in a tokio task. - /// This allows us to control timing precisely. #[tokio::test] async fn queued_disks_get_requeued_on_secret_retriever_error() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log( "queued_disks_get_requeued_on_secret_retriever_error", ); - let inject_error = Arc::new(AtomicBool::new(false)); - let (mut key_manager, key_requester) = KeyManager::new( - &logctx.log, - HardcodedSecretRetriever { inject_error: inject_error.clone() }, - ); - let (mut manager, handle) = - StorageManager::new(&logctx.log, key_requester); - - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; // Queue up a disks, as we haven't told the `StorageManager` that // the `KeyManager` is ready yet. - let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let dir = tempdir().unwrap(); - let disk = - SyntheticDisk::create_zpool(dir.path(), &zpool_name, 0).into(); - handle.upsert_disk(disk).await; - manager.step().await.unwrap(); - - // We can't wait for a reply through the handle as the storage manager task - // isn't actually running. We just check the resources directly. - assert!(manager.resources.all_u2_zpools().is_empty()); - - // Let's inject an error to the `SecretRetriever` to simulate a trust - // quorum timeout - inject_error.store(true, Ordering::SeqCst); - - // Now inform the storage manager that the key manager is ready - // The queued disk should not be added due to the error - handle.key_manager_ready().await; - manager.step().await.unwrap(); - assert!(manager.resources.all_u2_zpools().is_empty()); - - // Manually simulating a timer tick to add queued disks should also - // still hit the error - manager.add_queued_disks().await; - assert!(manager.resources.all_u2_zpools().is_empty()); - - // Clearing the injected error will cause the disk to get added - inject_error.store(false, Ordering::SeqCst); - manager.add_queued_disks().await; - assert_eq!(1, manager.resources.all_u2_zpools().len()); - - Zpool::destroy(&zpool_name).unwrap(); + let raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + let config = harness.make_config(1, &raw_disks); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await; + assert!(matches!(result, Err(Error::KeyManagerNotReady))); + + // As usual, the U.2 isn't ready yet. + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(2, all_disks.iter_all().collect::>().len()); + assert_eq!(0, all_disks.all_u2_zpools().len()); + + // Mark the key manager ready, but throwing errors. 
+ harness.key_manager_error_injector().store(true, Ordering::SeqCst); + harness.handle().key_manager_ready().await; + + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .unwrap(); + assert!(result.has_error()); + assert!(matches!( + result.status[0].err.as_ref(), + Some(DiskManagementError::KeyManager(_)) + )); + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(0, all_disks.all_u2_zpools().len()); + + // After toggling KeyManager errors off, the U.2 can be successfully added. + harness.key_manager_error_injector().store(false, Ordering::SeqCst); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring control plane disks should have worked"); + assert!(!result.has_error(), "{:?}", result); + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(1, all_disks.all_u2_zpools().len()); + + harness.cleanup().await; logctx.cleanup_successful(); } #[tokio::test] - async fn delete_disk_triggers_notification() { + async fn detected_raw_disk_removal_triggers_notification() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); - let logctx = test_setup_log("delete_disk_triggers_notification"); - let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (manager, mut handle) = - StorageManager::new(&logctx.log, key_requester); - - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); - - // Spawn the storage manager as done by sled-agent - tokio::spawn(async move { - manager.run().await; - }); - - // Inform the storage manager that the key manager is ready, so disks - // don't get queued - handle.key_manager_ready().await; - - // Create and add a disk - let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let dir = tempdir().unwrap(); - let disk: RawDisk = - SyntheticDisk::create_zpool(dir.path(), &zpool_name, 0).into(); - handle.upsert_disk(disk.clone()).await; + let logctx = + test_setup_log("detected_raw_disk_removal_triggers_notification"); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + harness.handle().key_manager_ready().await; + let mut raw_disks = harness.add_vdevs(&["u2_under_test.vdev"]).await; - // Wait for the add disk notification - let resources = handle.wait_for_changes().await; - assert_eq!(resources.all_u2_zpools().len(), 1); + // Access the add disk notification + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(1, all_disks.iter_all().collect::>().len()); // Delete the disk and wait for a notification - handle.delete_disk(disk).await; - let resources = handle.wait_for_changes().await; - assert!(resources.all_u2_zpools().is_empty()); + harness + .handle() + .detected_raw_disk_removal(raw_disks.remove(0)) + .await + .await + .unwrap(); + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(0, all_disks.iter_all().collect::>().len()); - Zpool::destroy(&zpool_name).unwrap(); + harness.cleanup().await; logctx.cleanup_successful(); } @@ -889,122 +1175,81 @@ mod tests { async fn ensure_using_exactly_these_disks() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("ensure_using_exactly_these_disks"); - let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (manager, mut handle) = - StorageManager::new(&logctx.log, key_requester); - - // 
Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); - - // Spawn the storage manager as done by sled-agent - tokio::spawn(async move { - manager.run().await; - }); - - // Create a bunch of file backed external disks with zpools - let dir = tempdir().unwrap(); - let zpools: Vec = - (0..10).map(|_| ZpoolName::new_external(Uuid::new_v4())).collect(); - let disks: Vec = zpools - .iter() - .enumerate() - .map(|(slot, zpool_name)| { - SyntheticDisk::create_zpool( - dir.path(), - zpool_name, - slot.try_into().unwrap(), - ) - .into() + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + + // Create a bunch of file backed external disks + let vdev_dir = tempdir_in("/var/tmp").unwrap(); + let disks: Vec = (0..10) + .map(|serial| { + let vdev_path = + vdev_dir.path().join(format!("u2_{serial}.vdev")); + RawSyntheticDisk::new_with_length(&vdev_path, 1 << 20, serial) + .unwrap() + .into() }) .collect(); - // Add the first 3 disks, and ensure they get queued, as we haven't - // marked our key manager ready yet - handle + // Observe the first three disks + harness + .handle() .ensure_using_exactly_these_disks(disks.iter().take(3).cloned()) - .await; - let state = handle.get_manager_state().await; - assert_eq!(state.queued_u2_drives.len(), 3); - assert_eq!(state.state, StorageManagerState::WaitingForKeyManager); - assert!(handle.get_latest_resources().await.all_u2_zpools().is_empty()); - - // Mark the key manager ready and wait for the storage update - handle.key_manager_ready().await; - let resources = handle.wait_for_changes().await; - let expected: HashSet<_> = - disks.iter().take(3).map(|d| d.identity()).collect(); - let actual: HashSet<_> = resources.disks().keys().collect(); - assert_eq!(expected, actual); + .await + .await + .unwrap(); - // Add first three disks after the initial one. The returned resources + let all_disks = harness.handle().get_latest_disks().await; + assert_eq!(3, all_disks.iter_all().collect::>().len()); + + // Add first three disks after the initial one. The returned disks // should not contain the first disk. - handle + harness + .handle() .ensure_using_exactly_these_disks( disks.iter().skip(1).take(3).cloned(), ) - .await; - let resources = handle.wait_for_changes().await; + .await + .await + .unwrap(); + + let all_disks = harness.handle_mut().wait_for_changes().await; + assert_eq!(3, all_disks.iter_all().collect::>().len()); + let expected: HashSet<_> = disks.iter().skip(1).take(3).map(|d| d.identity()).collect(); - let actual: HashSet<_> = resources.disks().keys().collect(); + let actual: HashSet<_> = all_disks.values.keys().collect(); assert_eq!(expected, actual); // Ensure the same set of disks and make sure no change occurs - // Note that we directly request the resources this time so we aren't + // Note that we directly request the disks this time so we aren't // waiting forever for a change notification. 
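+ // (`get_latest_disks` returns the current `AllDisks` snapshot
+ // immediately, whereas `wait_for_changes` blocks until the watch
+ // channel is updated.)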
- handle + harness + .handle() .ensure_using_exactly_these_disks( disks.iter().skip(1).take(3).cloned(), ) - .await; - let resources2 = handle.get_latest_resources().await; - assert_eq!(resources, resources2); + .await + .await + .unwrap(); + let all_disks2 = harness.handle().get_latest_disks().await; + assert_eq!(all_disks.values, all_disks2.values); // Add a disjoint set of disks and see that only they come through - handle + harness + .handle() .ensure_using_exactly_these_disks( disks.iter().skip(4).take(5).cloned(), ) - .await; - let resources = handle.wait_for_changes().await; + .await + .await + .unwrap(); + + let all_disks = harness.handle().get_latest_disks().await; let expected: HashSet<_> = disks.iter().skip(4).take(5).map(|d| d.identity()).collect(); - let actual: HashSet<_> = resources.disks().keys().collect(); + let actual: HashSet<_> = all_disks.values.keys().collect(); assert_eq!(expected, actual); - // Finally, change the zpool backing of the 5th disk to be that of the 10th - // and ensure that disk changes. Note that we don't change the identity - // of the 5th disk. - let mut modified_disk = disks[4].clone(); - if let RawDisk::Synthetic(disk) = &mut modified_disk { - disk.zpool_name = disks[9].zpool_name().clone(); - } else { - panic!(); - } - let mut expected: HashSet<_> = - disks.iter().skip(5).take(4).cloned().collect(); - expected.insert(modified_disk); - - handle - .ensure_using_exactly_these_disks(expected.clone().into_iter()) - .await; - let resources = handle.wait_for_changes().await; - - // Ensure the one modified disk changed as we expected - assert_eq!(5, resources.disks().len()); - for raw_disk in expected { - let (disk, pool) = - resources.disks().get(raw_disk.identity()).unwrap(); - assert_eq!(disk.zpool_name(), raw_disk.zpool_name()); - assert_eq!(&pool.name, disk.zpool_name()); - assert_eq!(raw_disk.identity(), &pool.parent); - } - - // Cleanup - for zpool in zpools { - Zpool::destroy(&zpool).unwrap(); - } + harness.cleanup().await; logctx.cleanup_successful(); } @@ -1012,34 +1257,194 @@ mod tests { async fn upsert_filesystem() { illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); let logctx = test_setup_log("upsert_filesystem"); - let (mut key_manager, key_requester) = - KeyManager::new(&logctx.log, HardcodedSecretRetriever::default()); - let (manager, handle) = StorageManager::new(&logctx.log, key_requester); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + + // Test setup: Add a U.2 and M.2, adopt them into the "control plane" + // for usage. 
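+ // ("Adoption" here means sending an `OmicronPhysicalDisksConfig` through
+ // `omicron_physical_disks_ensure`; afterwards the U.2's zpool exists and
+ // can host datasets.)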
+ harness.handle().key_manager_ready().await; + let raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + let config = harness.make_config(1, &raw_disks); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring disks should work after key manager is ready"); + assert!(!result.has_error(), "{:?}", result); - // Spawn the key_manager so that it will respond to requests for encryption keys - tokio::spawn(async move { key_manager.run().await }); + // Create a filesystem on the newly formatted U.2 + let dataset_id = Uuid::new_v4(); + let zpool_name = ZpoolName::new_external(config.disks[0].pool_id); + let dataset_name = + DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); + harness + .handle() + .upsert_filesystem(dataset_id, dataset_name) + .await + .unwrap(); - // Spawn the storage manager as done by sled-agent - tokio::spawn(async move { - manager.run().await; - }); + harness.cleanup().await; + logctx.cleanup_successful(); + } - handle.key_manager_ready().await; + #[tokio::test] + async fn ledgerless_to_ledgered_migration() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("ledgerless_to_ledgered_migration"); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + + // Test setup: Create two U.2s and an M.2 + let raw_disks = harness + .add_vdevs(&[ + "u2_under_test.vdev", + "u2_that_shows_up_late.vdev", + "m2_helping.vdev", + ]) + .await; - // Create and add a disk - let zpool_name = ZpoolName::new_external(Uuid::new_v4()); - let dir = tempdir().unwrap(); - let disk: RawDisk = - SyntheticDisk::create_zpool(dir.path(), &zpool_name, 0).into(); - handle.upsert_disk(disk.clone()).await; + // First, we format the U.2s to have a zpool. This should work, even + // without looping in the StorageManager. + let first_u2 = &raw_disks[0]; + let first_pool_id = Uuid::new_v4(); + let _disk = crate::disk::Disk::new( + &logctx.log, + &harness.mount_config(), + first_u2.clone(), + Some(first_pool_id), + Some(harness.key_requester()), + ) + .await + .expect("Failed to format U.2"); - // Create a filesystem - let dataset_id = Uuid::new_v4(); - let dataset_name = - DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); - handle.upsert_filesystem(dataset_id, dataset_name).await.unwrap(); + let second_u2 = &raw_disks[1]; + let second_pool_id = Uuid::new_v4(); + let _disk = crate::disk::Disk::new( + &logctx.log, + &harness.mount_config(), + second_u2.clone(), + Some(second_pool_id), + Some(harness.key_requester()), + ) + .await + .expect("Failed to format U.2"); + + // Because we did that formatting "behind the back" of the + // StorageManager, we should see no evidence of the U.2 being managed. + // + // This currently matches the format of "existing systems, which were + // initialized before the storage ledger was created". + + // We should still see no ledger. + let result = harness.handle().omicron_physical_disks_list().await; + assert!(matches!(result, Err(Error::LedgerNotFound)), "{:?}", result); + + // We should also not see any managed U.2s. + let disks = harness.handle().get_latest_disks().await; + assert!(disks.all_u2_zpools().is_empty()); + + // Leave one of the U.2s attached, but "remove" the other one. + harness.remove_vdev(second_u2).await; + + // When the system activates, we should see a single Zpool, and + // "auto-manage" it. 
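+ // (No control plane disk ledger exists yet, so this exercises the
+ // compatibility path for sleds whose zpools predate the ledger.)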
+ harness.handle().key_manager_ready().await; + + // It might take a moment for synchronization to be handled by the + // background task, but we'll eventually see the U.2 zpool. + // + // This is the equivalent of us "loading a zpool, even though + // it was not backed by a ledger". + let tt = TimeTravel::new(); + tt.enough_to_start_synchronization().await; + while harness + .handle_mut() + .wait_for_changes() + .await + .all_u2_zpools() + .is_empty() + { + info!(&logctx.log, "Waiting for U.2 to automatically show up"); + } + let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); + assert_eq!(u2s.len(), 1, "{:?}", u2s); + + // If we attach the second U.2 -- the equivalent of it appearing after + // the key manager is ready -- it'll also be included in the set of + // auto-maanged U.2s. + harness.add_vdev_as(second_u2.clone()).await; + tt.enough_to_start_synchronization().await; + while harness + .handle_mut() + .wait_for_changes() + .await + .all_u2_zpools() + .len() + == 1 + { + info!(&logctx.log, "Waiting for U.2 to automatically show up"); + } + let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); + assert_eq!(u2s.len(), 2, "{:?}", u2s); + + // This is the equivalent of the "/omicron-physical-disks GET" API, + // which Nexus might use to contact this sled. + // + // This means that we'll bootstrap the sled successfully, but report a + // 404 if nexus asks us for the latest configuration. + let result = harness.handle().omicron_physical_disks_list().await; + assert!(matches!(result, Err(Error::LedgerNotFound),), "{:?}", result); + + // At this point, Nexus may want to explicitly tell sled agent which + // disks it should use. This is the equivalent of invoking + // "/omicron-physical-disks PUT". + let mut disks = vec![ + OmicronPhysicalDiskConfig { + identity: first_u2.identity().clone(), + id: Uuid::new_v4(), + pool_id: first_pool_id, + }, + OmicronPhysicalDiskConfig { + identity: second_u2.identity().clone(), + id: Uuid::new_v4(), + pool_id: second_pool_id, + }, + ]; + // Sort the disks to ensure the "output" matches the "input" when we + // query later. + disks.sort_by(|a, b| a.identity.partial_cmp(&b.identity).unwrap()); + let config = + OmicronPhysicalDisksConfig { generation: Generation::new(), disks }; + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Failed to ensure disks with 'new' Config"); + assert!(!result.has_error(), "{:?}", result); + + let observed_config = harness + .handle() + .omicron_physical_disks_list() + .await + .expect("Failed to retreive config after ensuring it"); + assert_eq!(observed_config, config); + + let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); + assert_eq!(u2s.len(), 2, "{:?}", u2s); - Zpool::destroy(&zpool_name).unwrap(); + harness.cleanup().await; logctx.cleanup_successful(); } } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn test_omicron_physical_disks_schema() { + let schema = schemars::schema_for!(OmicronPhysicalDisksConfig); + expectorate::assert_contents( + "../schema/omicron-physical-disks.json", + &serde_json::to_string_pretty(&schema).unwrap(), + ); + } +} diff --git a/sled-storage/src/manager_test_harness.rs b/sled-storage/src/manager_test_harness.rs new file mode 100644 index 0000000000..efdbb0b9f6 --- /dev/null +++ b/sled-storage/src/manager_test_harness.rs @@ -0,0 +1,393 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Utilities for creating a StorageManager under test. + +use crate::config::MountConfig; +use crate::disk::{OmicronPhysicalDisksConfig, RawDisk}; +use crate::manager::{StorageHandle, StorageManager}; +use camino::Utf8PathBuf; +use key_manager::StorageKeyRequester; +use slog::{info, Logger}; +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; +use uuid::Uuid; + +/// A [`key-manager::SecretRetriever`] that only returns hardcoded IKM for +/// epoch 0 +#[derive(Debug, Default)] +struct HardcodedSecretRetriever { + inject_error: Arc, +} + +#[async_trait::async_trait] +impl key_manager::SecretRetriever for HardcodedSecretRetriever { + async fn get_latest( + &self, + ) -> Result + { + if self.inject_error.load(Ordering::SeqCst) { + return Err(key_manager::SecretRetrieverError::Bootstore( + "Timeout".to_string(), + )); + } + + let epoch = 0; + let salt = [0u8; 32]; + let secret = [0x1d; 32]; + + Ok(key_manager::VersionedIkm::new(epoch, salt, &secret)) + } + + /// We don't plan to do any key rotation before trust quorum is ready + async fn get( + &self, + epoch: u64, + ) -> Result + { + if self.inject_error.load(Ordering::SeqCst) { + return Err(key_manager::SecretRetrieverError::Bootstore( + "Timeout".to_string(), + )); + } + if epoch != 0 { + return Err(key_manager::SecretRetrieverError::NoSuchEpoch(epoch)); + } + Ok(key_manager::SecretState::Current(self.get_latest().await?)) + } +} + +/// Helper utility for tests that want to use a StorageManager. +/// +/// Attempts to make it easy to create a set of vdev-based M.2 and U.2 +/// devices, which can be formatted with arbitrary zpools. +pub struct StorageManagerTestHarness { + handle: StorageHandle, + vdev_dir: Option, + vdevs: std::collections::BTreeSet, + next_slot: i64, + #[allow(unused)] + key_requester: StorageKeyRequester, + key_manager_error_injector: Arc, + key_manager_task: tokio::task::JoinHandle<()>, + storage_manager_task: tokio::task::JoinHandle<()>, +} + +impl Drop for StorageManagerTestHarness { + fn drop(&mut self) { + if let Some(vdev_dir) = self.vdev_dir.take() { + eprintln!( + "WARNING: StorageManagerTestHarness called without 'cleanup()'.\n\ + We may have leaked zpools, and not correctly deleted {}", + vdev_dir.path() + ); + + let pools = [ + ( + illumos_utils::zpool::ZPOOL_INTERNAL_PREFIX, + vdev_dir.path().join("pool/int"), + ), + ( + illumos_utils::zpool::ZPOOL_EXTERNAL_PREFIX, + vdev_dir.path().join("pool/ext"), + ), + ]; + + eprintln!( + "The following commands may need to be run to clean up state:" + ); + eprintln!("---"); + for (prefix, pool) in pools { + let Ok(entries) = pool.read_dir_utf8() else { + continue; + }; + for entry in entries.flatten() { + eprintln!( + " pfexec zpool destroy {prefix}{} ", + entry.file_name() + ); + } + } + eprintln!(" pfexec rm -rf {}", vdev_dir.path()); + eprintln!("---"); + + panic!("Dropped without cleanup. See stderr for cleanup advice"); + } + } +} + +impl StorageManagerTestHarness { + /// Creates a new StorageManagerTestHarness with no associated disks. 
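+ ///
+ /// A rough sketch of a typical test flow (not compiled as a doctest),
+ /// using the helpers defined in this file:
+ ///
+ /// ```ignore
+ /// let mut harness = StorageManagerTestHarness::new(&logctx.log).await;
+ /// let raw_disks =
+ ///     harness.add_vdevs(&["u2_0.vdev", "m2_0.vdev"]).await;
+ /// harness.handle().key_manager_ready().await;
+ /// let config = harness.make_config(1, &raw_disks);
+ /// let result = harness
+ ///     .handle()
+ ///     .omicron_physical_disks_ensure(config.clone())
+ ///     .await
+ ///     .unwrap();
+ /// assert!(!result.has_error());
+ /// harness.cleanup().await;
+ /// ```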
+ pub async fn new(log: &Logger) -> Self { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let tmp = camino_tempfile::tempdir_in("/var/tmp") + .expect("Failed to make temporary directory"); + info!(log, "Using tmp: {}", tmp.path()); + Self::new_with_tmp_dir(log, tmp).await + } + + async fn new_with_tmp_dir( + log: &Logger, + tmp: camino_tempfile::Utf8TempDir, + ) -> Self { + let mount_config = + MountConfig { root: tmp.path().into(), ..Default::default() }; + + let key_manager_error_injector = Arc::new(AtomicBool::new(false)); + let (mut key_manager, key_requester) = key_manager::KeyManager::new( + &log, + HardcodedSecretRetriever { + inject_error: key_manager_error_injector.clone(), + }, + ); + let (manager, handle) = + StorageManager::new(&log, mount_config, key_requester.clone()); + + // Spawn the key_manager so that it will respond to requests for encryption keys + let key_manager_task = + tokio::spawn(async move { key_manager.run().await }); + + // Spawn the storage manager as done by sled-agent + let storage_manager_task = tokio::spawn(async move { + manager.run().await; + }); + + Self { + handle, + vdev_dir: Some(tmp), + vdevs: std::collections::BTreeSet::new(), + next_slot: 0, + key_requester, + key_manager_error_injector, + key_manager_task, + storage_manager_task, + } + } + + /// Emulate a system rebooting. + /// + /// - Stops the currently running tasks and restarts them + /// - Re-inserts all vdevs previously created by [Self::add_vdevs]. + pub async fn reboot(mut self, log: &Logger) -> Self { + // Abort ongoing tasks, in lieu of a cleaner shutdown mechanism. + self.key_manager_task.abort(); + self.storage_manager_task.abort(); + + // Deconstruct the test harness + let vdev_dir = + std::mem::take(&mut self.vdev_dir).expect("Already terminated"); + let vdevs = std::mem::take(&mut self.vdevs); + + // Re-create all the state we created during the constructor, but + // leave the temporary directory as it was "before reboot". + let mut slef = Self::new_with_tmp_dir(log, vdev_dir).await; + slef.next_slot = self.next_slot; + + // Notify ourselves of the new disks, just as the hardware would. + // + // NOTE: Technically, if these disks have pools, they're still imported. + // However, the SledManager doesn't know about them, and wouldn't + // assume they're being managed right now. + for raw_disk in vdevs { + slef.handle + .detected_raw_disk(raw_disk.clone()) + .await // Notify StorageManager + .await // Wait for it to finish processing + .unwrap(); + slef.vdevs.insert(raw_disk.clone()); + } + + slef + } + + #[allow(unused)] + pub(crate) fn mount_config(&self) -> MountConfig { + MountConfig { + root: self + .vdev_dir + .as_ref() + .expect("Harness destroyed?") + .path() + .into(), + ..Default::default() + } + } + + #[allow(unused)] + pub(crate) fn key_requester(&self) -> &StorageKeyRequester { + &self.key_requester + } + + pub const DEFAULT_VDEV_SIZE: u64 = 64 * (1 << 20); + + /// Adds raw devices to the [crate::manager::StorageManager], as if they were detected via + /// hardware. Can be called several times. + /// + /// Each device is [Self::DEFAULT_VDEV_SIZE] in size. + /// Use [Self::add_vdevs_with_size] if you need more control + /// over device sizes. 
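+ ///
+ /// Vdev names are relative paths, created under the harness's temporary
+ /// directory. Tests in this crate use names like "u2_under_test.vdev" and
+ /// "m2_helping.vdev", matching the "u2_"/"m2_" prefix convention used by
+ /// the non-gimlet sled-agent config.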
+ pub async fn add_vdevs + ?Sized>( + &mut self, + vdevs: &[&P], + ) -> Vec { + self.add_vdevs_with_size( + &vdevs + .iter() + .map(|vdev| (vdev, Self::DEFAULT_VDEV_SIZE)) + .collect::>(), + ) + .await + } + + pub async fn add_vdevs_with_size + ?Sized>( + &mut self, + vdevs: &[(&P, u64)], + ) -> Vec { + let vdev_dir = self + .vdev_dir + .as_ref() + .expect("Cannot add vdevs, test harness terminated"); + let mut added = vec![]; + for (vdev, size) in vdevs + .iter() + .map(|(vdev, size)| (Utf8PathBuf::from(vdev.as_ref()), size)) + { + assert!(vdev.is_relative()); + let vdev_path = vdev_dir.path().join(&vdev); + let raw_disk: RawDisk = + crate::disk::RawSyntheticDisk::new_with_length( + &vdev_path, + *size, + self.next_slot, + ) + .unwrap_or_else(|err| { + panic!( + "Failed to create synthetic disk for {vdev}: {err:?}" + ) + }) + .into(); + self.next_slot += 1; + self.handle + .detected_raw_disk(raw_disk.clone()) + .await // Notify StorageManager + .await // Wait for it to finish processing + .unwrap(); + + self.vdevs.insert(raw_disk.clone()); + added.push(raw_disk); + } + added + } + + // Removes a vdev from the set of "tracked" devices. + // + // This is equivalent to having the hardware monitor unplug a device. + // + // If this device has an associated zpool, it must be either re-attached + // to the harness or manually destroyed before the test completes. + // Otherwise, removing the temporary directory containing that zpool + // will likely fail with a "device busy" error. + pub async fn remove_vdev(&mut self, raw: &RawDisk) { + assert!(self.vdevs.remove(&raw), "Vdev does not exist"); + self.handle + .detected_raw_disk_removal(raw.clone()) + .await + .await + .expect("Failed to remove vdev"); + } + + // Adds a vdev to the set of "tracked" devices. + pub async fn add_vdev_as(&mut self, raw_disk: RawDisk) { + self.handle + .detected_raw_disk(raw_disk.clone()) + .await // Notify StorageManager + .await // Wait for it to finish processing + .unwrap(); + self.vdevs.insert(raw_disk.clone()); + } + + pub fn make_config( + &self, + generation: u32, + disks: &[RawDisk], + ) -> OmicronPhysicalDisksConfig { + let disks = disks + .into_iter() + .map(|raw| { + let identity = raw.identity(); + + crate::disk::OmicronPhysicalDiskConfig { + identity: identity.clone(), + id: Uuid::new_v4(), + pool_id: Uuid::new_v4(), + } + }) + .collect(); + + OmicronPhysicalDisksConfig { + generation: omicron_common::api::external::Generation::from( + generation, + ), + disks, + } + } + + /// Returns the underlying [crate::manager::StorageHandle]. + pub fn handle_mut(&mut self) -> &mut StorageHandle { + &mut self.handle + } + + /// Returns the underlying [crate::manager::StorageHandle]. + pub fn handle(&self) -> &StorageHandle { + &self.handle + } + + /// Set to "true" to throw errors, "false" to not inject errors. 
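+ ///
+ /// For example, to simulate a trust quorum timeout in a test:
+ ///
+ /// ```ignore
+ /// harness.key_manager_error_injector().store(true, Ordering::SeqCst);
+ /// ```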
+ pub fn key_manager_error_injector(&self) -> &Arc { + &self.key_manager_error_injector + } + + /// Cleanly terminates the test harness + pub async fn cleanup(&mut self) { + let Some(vdev_dir) = self.vdev_dir.take() else { + // Already terminated + return; + }; + + eprintln!("Terminating StorageManagerTestHarness"); + let disks = self.handle().get_latest_disks().await; + let pools = disks.get_all_zpools(); + for (pool, _) in pools { + eprintln!("Destroying pool: {pool:?}"); + if let Err(e) = illumos_utils::zpool::Zpool::destroy(&pool) { + eprintln!("Failed to destroy {pool:?}: {e:?}"); + } + } + + self.key_manager_task.abort(); + self.storage_manager_task.abort(); + + // Make sure that we're actually able to delete everything within the + // temporary directory. + // + // This is necessary because the act of mounting datasets within this + // directory may have created directories owned by root, and the test + // process may not have been started as root. + // + // Since we're about to delete all these files anyway, make them + // accessible to everyone before destroying them. + let mut command = std::process::Command::new("/usr/bin/pfexec"); + let mount = vdev_dir.path(); + let cmd = command.args(["chmod", "-R", "a+rw", mount.as_str()]); + cmd.output().expect( + "Failed to change ownership of the temporary directory we're trying to delete" + ); + + // Actually delete everything, and check the result to fail loud if + // something goes wrong. + vdev_dir.close().expect("Failed to clean up temporary directory"); + } +} diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index c1f460dc92..34b30f1bfd 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -4,17 +4,23 @@ //! Discovered and usable disks and zpools -use crate::dataset::M2_DEBUG_DATASET; -use crate::disk::Disk; +use crate::config::MountConfig; +use crate::dataset::{DatasetError, M2_DEBUG_DATASET}; +use crate::disk::{Disk, DiskError, OmicronPhysicalDiskConfig, RawDisk}; use crate::error::Error; -use crate::pool::Pool; use camino::Utf8PathBuf; use cfg_if::cfg_if; use illumos_utils::zpool::ZpoolName; +use key_manager::StorageKeyRequester; use omicron_common::disk::DiskIdentity; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; use sled_hardware::DiskVariant; +use slog::{info, o, warn, Logger}; use std::collections::BTreeMap; use std::sync::Arc; +use tokio::sync::watch; +use uuid::Uuid; // The directory within the debug dataset in which bundles are created. const BUNDLE_DIRECTORY: &str = "bundle"; @@ -22,129 +28,131 @@ const BUNDLE_DIRECTORY: &str = "bundle"; // The directory for zone bundles. const ZONE_BUNDLE_DIRECTORY: &str = "zone"; -pub enum AddDiskResult { - DiskInserted, - DiskAlreadyInserted, - DiskQueued, +#[derive(Debug, thiserror::Error, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case", tag = "type", content = "value")] +pub enum DiskManagementError { + #[error("Disk requested by control plane, but not found on device")] + NotFound, + + #[error("Expected zpool UUID of {expected}, but saw {observed}")] + ZpoolUuidMismatch { expected: Uuid, observed: Uuid }, + + #[error("Failed to access keys necessary to unlock storage. 
This error may be transient.")] + KeyManager(String), + + #[error("Other error starting disk management: {0}")] + Other(String), } -impl AddDiskResult { - pub fn disk_inserted(&self) -> bool { +impl DiskManagementError { + fn retryable(&self) -> bool { match self { - AddDiskResult::DiskInserted => true, + DiskManagementError::KeyManager(_) => true, _ => false, } } } -/// Storage related resources: disks and zpools -/// -/// This state is internal to the [`crate::manager::StorageManager`] task. Clones -/// of this state can be retrieved by requests to the `StorageManager` task -/// from the [`crate::manager::StorageHandle`]. This state is not `Sync`, and -/// as such does not require any mutexes. However, we do expect to share it -/// relatively frequently, and we want copies of it to be as cheaply made -/// as possible. So any large state is stored inside `Arc`s. On the other -/// hand, we expect infrequent updates to this state, and as such, we use -/// [`std::sync::Arc::make_mut`] to implement clone on write functionality -/// inside the `StorageManager` task if there are any outstanding copies. -/// Therefore, we only pay the cost to update infrequently, and no locks are -/// required by callers when operating on cloned data. The only contention here -/// is for the reference counters of the internal Arcs when `StorageResources` -/// gets cloned or dropped. -#[derive(Debug, Clone, Default, PartialEq, Eq)] -pub struct StorageResources { - // All disks, real and synthetic, being managed by this sled - disks: Arc>, +/// Identifies how a single disk management operation may have succeeded or +/// failed. +#[derive(Debug, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct DiskManagementStatus { + pub identity: DiskIdentity, + pub err: Option, } -impl StorageResources { - /// Return a reference to the current snapshot of disks - pub fn disks(&self) -> &BTreeMap { - &self.disks - } +/// The result from attempting to manage underlying disks. +/// +/// This is more complex than a simple "Error" type because it's possible +/// for some disks to be initialized correctly, while others can fail. +/// +/// This structure provides a mechanism for callers to learn about partial +/// failures, and handle them appropriately on a per-disk basis. +#[derive(Default, Debug, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[must_use = "this `DiskManagementResult` may contain errors, which should be handled"] +pub struct DisksManagementResult { + pub status: Vec, +} - /// Insert a disk and its zpool - /// - /// If the disk passed in is new or modified, or its pool size or pool - /// name changed, then insert the changed values and return `DiskInserted`. - /// Otherwise, do not insert anything and return `DiskAlreadyInserted`. - /// For instance, if only the pool health changes, because it is not one - /// of the checked values, we will not insert the update and will return - /// `DiskAlreadyInserted`. 
- pub(crate) fn insert_disk( - &mut self, - disk: Disk, - ) -> Result { - let disk_id = disk.identity().clone(); - let zpool_name = disk.zpool_name().clone(); - let zpool = Pool::new(zpool_name, disk_id.clone())?; - if let Some((stored_disk, stored_pool)) = self.disks.get(&disk_id) { - if stored_disk == &disk - && stored_pool.info.size() == zpool.info.size() - && stored_pool.name == zpool.name - { - return Ok(AddDiskResult::DiskAlreadyInserted); +impl DisksManagementResult { + pub fn has_error(&self) -> bool { + for status in &self.status { + if status.err.is_some() { + return true; } } - // Either the disk or zpool changed - Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); - Ok(AddDiskResult::DiskInserted) - } - - /// Insert a disk while creating a fake pool - /// This is a workaround for current mock based testing strategies - /// in the sled-agent. - #[cfg(feature = "testing")] - pub fn insert_fake_disk(&mut self, disk: Disk) -> AddDiskResult { - let disk_id = disk.identity().clone(); - let zpool_name = disk.zpool_name().clone(); - let zpool = Pool::new_with_fake_info(zpool_name, disk_id.clone()); - if self.disks.contains_key(&disk_id) { - return AddDiskResult::DiskAlreadyInserted; - } - // Either the disk or zpool changed - Arc::make_mut(&mut self.disks).insert(disk_id, (disk, zpool)); - AddDiskResult::DiskInserted + false } - /// Delete a disk and its zpool - /// - /// Return true, if data was changed, false otherwise - /// - /// Note: We never allow removal of synthetic disks in production as they - /// are only added once. - pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) -> bool { - let Some((disk, _)) = self.disks.get(id) else { - return false; - }; - - cfg_if! { - if #[cfg(test)] { - // For testing purposes, we allow synthetic disks to be deleted. - // Silence an unused variable warning. - _ = disk; - } else { - // In production, we disallow removal of synthetic disks as they - // are only added once. - if disk.is_synthetic() { - return false; + pub fn has_retryable_error(&self) -> bool { + for status in &self.status { + if let Some(err) = &status.err { + if err.retryable() { + return true; } } } - - // Safe to unwrap as we just checked the key existed above - Arc::make_mut(&mut self.disks).remove(id).unwrap(); - true + false } +} + +// The Sled Agent is responsible for both observing disks and managing them at +// the request of the broader control plane. This enum encompasses that duality, +// by representing all disks that can exist, managed or not. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ManagedDisk { + // A disk explicitly managed by the control plane. + // + // This includes U.2s which Nexus has told us to format and use. + ExplicitlyManaged(Disk), + + // A disk implicitly managed by the control plane. + // + // This includes M.2s which the sled agent auto-detects and uses. + ImplicitlyManaged(Disk), + + // A disk which has been observed by the sled, but which is not yet being + // managed by the control plane. + // + // This disk should be treated as "read-only" until we're explicitly told to + // use it. + Unmanaged(RawDisk), +} + +/// The disks, keyed by their identity, managed by the sled agent. +/// +/// This state is owned by [`crate::manager::StorageManager`], through +/// [`crate::resources::StorageResources`]. Clones of this state can be +/// retrieved by requests to the `StorageManager` task from the +/// [`crate::manager::StorageHandle`]. This state is not `Sync`, and as such +/// does not require any mutexes. 
However, we do expect to share it relatively +/// frequently, and we want copies of it to be as cheaply made as possible. So +/// any large state is stored inside `Arc`s. On the other hand, we expect +/// infrequent updates to this state, and as such, we use +/// [`std::sync::Arc::make_mut`] to implement clone on write functionality +/// inside the `StorageManager` task if there are any outstanding copies. +/// Therefore, we only pay the cost to update infrequently, and no locks are +/// required by callers when operating on cloned data. The only contention here +/// is for the reference counters of the internal Arcs when `AllDisks` +/// gets cloned or dropped. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct AllDisks { + pub values: Arc>, + pub mount_config: MountConfig, +} +impl AllDisks { /// Returns the identity of the boot disk. /// /// If this returns `None`, we have not processed the boot disk yet. pub fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { - for (id, (disk, _)) in self.disks.iter() { - if disk.is_boot_disk() { - return Some((id.clone(), disk.zpool_name().clone())); + for (id, disk) in self.values.iter() { + if let ManagedDisk::ImplicitlyManaged(disk) = disk { + if disk.is_boot_disk() { + return Some((id.clone(), disk.zpool_name().clone())); + } } } None @@ -164,7 +172,9 @@ impl StorageResources { pub fn all_m2_mountpoints(&self, dataset: &str) -> Vec { self.all_m2_zpools() .iter() - .map(|zpool| zpool.dataset_mountpoint(dataset)) + .map(|zpool| { + zpool.dataset_mountpoint(&self.mount_config.root, dataset) + }) .collect() } @@ -172,26 +182,41 @@ impl StorageResources { pub fn all_u2_mountpoints(&self, dataset: &str) -> Vec { self.all_u2_zpools() .iter() - .map(|zpool| zpool.dataset_mountpoint(dataset)) + .map(|zpool| { + zpool.dataset_mountpoint(&self.mount_config.root, dataset) + }) .collect() } + /// Returns all zpools managed by the control plane pub fn get_all_zpools(&self) -> Vec<(ZpoolName, DiskVariant)> { - self.disks + self.values .values() - .map(|(disk, _)| (disk.zpool_name().clone(), disk.variant())) + .filter_map(|disk| match disk { + ManagedDisk::ExplicitlyManaged(disk) + | ManagedDisk::ImplicitlyManaged(disk) => { + Some((disk.zpool_name().clone(), disk.variant())) + } + ManagedDisk::Unmanaged(_) => None, + }) .collect() } - // Returns all zpools of a particular variant + // Returns all zpools of a particular variant. + // + // Only returns zpools from disks actively being managed. fn all_zpools(&self, variant: DiskVariant) -> Vec { - self.disks + self.values .values() - .filter_map(|(disk, _)| { - if disk.variant() == variant { - return Some(disk.zpool_name().clone()); + .filter_map(|disk| match disk { + ManagedDisk::ExplicitlyManaged(disk) + | ManagedDisk::ImplicitlyManaged(disk) => { + if disk.variant() == variant { + return Some(disk.zpool_name().clone()); + } + None } - None + ManagedDisk::Unmanaged(_) => None, }) .collect() } @@ -203,4 +228,333 @@ impl StorageResources { .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY)) .collect() } + + /// Returns an iterator over all managed disks. + pub fn iter_managed(&self) -> impl Iterator { + self.values.iter().filter_map(|(identity, disk)| match disk { + ManagedDisk::ExplicitlyManaged(disk) => Some((identity, disk)), + ManagedDisk::ImplicitlyManaged(disk) => Some((identity, disk)), + _ => None, + }) + } + + /// Returns an iterator over all disks, managed or not. 
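+ ///
+ /// Yields `(identity, variant, slot)` for every disk, including
+ /// `Unmanaged` ones, which makes it useful for counting the raw disks a
+ /// sled has observed.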
+ pub fn iter_all( + &self, + ) -> impl Iterator { + self.values.iter().map(|(identity, disk)| match disk { + ManagedDisk::ExplicitlyManaged(disk) => { + (identity, disk.variant(), disk.slot()) + } + ManagedDisk::ImplicitlyManaged(disk) => { + (identity, disk.variant(), disk.slot()) + } + ManagedDisk::Unmanaged(raw) => { + (identity, raw.variant(), raw.slot()) + } + }) + } +} + +/// The intersection of "physical disks noticed by hardware" and "physical +/// disks requested by the control plane". +#[derive(Debug)] +pub struct StorageResources { + log: Logger, + + key_requester: StorageKeyRequester, + + // All disks, real and synthetic, that exist within this sled + disks: AllDisks, + + // The last set of disks the control plane explicitly told us to manage. + // + // Only includes external storage (U.2s). + control_plane_disks: BTreeMap, + + // Many clients are interested when changes in the set of [AllDisks] + // might occur. This watch channel is updated once these disks get updated. + disk_updates: watch::Sender, +} + +impl StorageResources { + pub fn new( + log: &Logger, + mount_config: MountConfig, + key_requester: StorageKeyRequester, + ) -> Self { + let disks = + AllDisks { values: Arc::new(BTreeMap::new()), mount_config }; + Self { + log: log.new(o!("component" => "StorageResources")), + key_requester, + disks: disks.clone(), + control_plane_disks: BTreeMap::new(), + disk_updates: watch::Sender::new(disks), + } + } + + /// Monitors the set of disks for any updates + pub fn watch_disks(&self) -> watch::Receiver { + self.disk_updates.subscribe() + } + + /// Gets the set of all disks + pub fn disks(&self) -> &AllDisks { + &self.disks + } + + /// Sets the "control plane disk" state, as last requested by Nexus. + /// + /// Does not attempt to manage any of the physical disks previously + /// observed. To synchronize the "set of requested disks" with the "set of + /// observed disks", call [Self::synchronize_disk_management]. + pub fn set_config(&mut self, config: &Vec) { + self.control_plane_disks = config + .iter() + .map(|disk| (disk.identity.clone(), disk.clone())) + .collect(); + } + + pub fn get_config( + &self, + ) -> &BTreeMap { + &self.control_plane_disks + } + + /// Attempts to "manage" all the U.2 disks requested by the control plane. + /// + /// If any requested physical disks have not been observed by the hardware + /// monitor, they are ignored. + /// If the hardware monitor has observed disks that are not requested, they + /// are ignored. + /// + /// Attempts to manage all disks possible, and returns an error on partial + /// failure, indicating "which disks have failed to be synchronized". + pub async fn synchronize_disk_management( + &mut self, + ) -> DisksManagementResult { + let mut updated = false; + let disks = Arc::make_mut(&mut self.disks.values); + info!(self.log, "Synchronizing disk managment"); + + // "Unmanage" all disks no longer requested by the control plane. + // + // This updates the reported sets of "managed" disks, and performs no + // other modifications to the underlying storage. + for (identity, managed_disk) in &mut *disks { + match managed_disk { + // This leaves the presence of the disk still in "Self", but + // downgrades the disk to an unmanaged status. + ManagedDisk::ExplicitlyManaged(disk) => { + if self.control_plane_disks.get(identity).is_none() { + *managed_disk = + ManagedDisk::Unmanaged(RawDisk::from(disk.clone())); + updated = true; + } + } + _ => (), + } + } + + // "Manage" all disks that the control plane wants. 
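+ //
+ // Failures are recorded per disk in the returned `DisksManagementResult`
+ // rather than aborting the loop, so one bad disk does not block the rest.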
+ // + // If the disk can be successfully managed, and it's new, it will be + // formatted with a zpool identified by the Nexus-specified + // configuration. + let mut result = DisksManagementResult::default(); + for (identity, config) in &self.control_plane_disks { + let Some(managed_disk) = disks.get_mut(identity) else { + warn!( + self.log, + "Control plane disk requested, but not detected within sled"; + "disk_identity" => ?identity + ); + result.status.push(DiskManagementStatus { + identity: identity.clone(), + err: Some(DiskManagementError::NotFound), + }); + continue; + }; + info!(self.log, "Managing disk"; "disk_identity" => ?identity); + match managed_disk { + // Disk is currently unmanaged. Try to adopt the disk, which may + // involve formatting it, and emplacing the zpool. + ManagedDisk::Unmanaged(raw_disk) => { + match Self::begin_disk_management( + &self.log, + &self.disks.mount_config, + raw_disk, + config, + Some(&self.key_requester), + ) + .await + { + Ok(disk) => { + info!(self.log, "Disk management started successfully"; "disk_identity" => ?identity); + *managed_disk = disk; + updated = true; + } + Err(err) => { + warn!(self.log, "Cannot parse disk"; "err" => ?err); + result.status.push(DiskManagementStatus { + identity: identity.clone(), + err: Some(err), + }); + continue; + } + } + } + // Disk is already managed. Check that the configuration + // matches what we expect. + ManagedDisk::ExplicitlyManaged(disk) => { + let expected = config.pool_id; + let observed = disk.zpool_name().id(); + if expected != observed { + warn!( + self.log, + "Observed an unexpected zpool uuid"; + "expected" => ?expected, "observed" => ?observed + ); + result.status.push(DiskManagementStatus { + identity: identity.clone(), + err: Some(DiskManagementError::ZpoolUuidMismatch { + expected, + observed, + }), + }); + continue; + } + info!(self.log, "Disk already managed successfully"; "disk_identity" => ?identity); + } + // Skip disks that are managed implicitly + ManagedDisk::ImplicitlyManaged(_) => continue, + } + + result.status.push(DiskManagementStatus { + identity: identity.clone(), + err: None, + }); + } + + if updated { + self.disk_updates.send_replace(self.disks.clone()); + } + + return result; + } + + // Helper function to help transition an "unmanaged" disk to a "managed" + // disk. + async fn begin_disk_management( + log: &Logger, + mount_config: &MountConfig, + raw_disk: &RawDisk, + config: &OmicronPhysicalDiskConfig, + key_requester: Option<&StorageKeyRequester>, + ) -> Result { + info!(log, "Invoking Disk::new on an unmanaged disk"); + let disk = Disk::new( + &log, + mount_config, + raw_disk.clone(), + Some(config.pool_id), + key_requester, + ) + .await + .map_err(|err| { + warn!(log, "Disk::new failed"; "err" => ?err); + match err { + // We pick this error out and identify it separately because + // it may be transient, and should sometimes be handled with + // a retry. + DiskError::Dataset(DatasetError::KeyManager(_)) => { + DiskManagementError::KeyManager(err.to_string()) + } + err => DiskManagementError::Other(err.to_string()), + } + })?; + info!(log, "Disk::new completed successfully"; "disk_identity" => ?raw_disk.identity()); + Ok(ManagedDisk::ExplicitlyManaged(disk)) + } + + /// Tracks a new disk. + /// + /// For U.2s: Does not automatically attempt to manage disks -- for this, + /// the caller will need to also invoke + /// [`Self::synchronize_disk_management`]. 
+ /// + /// For M.2s: As no additional control plane guidance is necessary to adopt + /// M.2s, these are automatically managed. + pub(crate) async fn insert_disk( + &mut self, + disk: RawDisk, + ) -> Result<(), Error> { + let disk_identity = disk.identity().clone(); + info!(self.log, "Inserting disk"; "identity" => ?disk_identity); + if self.disks.values.contains_key(&disk_identity) { + info!(self.log, "Disk already exists"; "identity" => ?disk_identity); + return Ok(()); + } + + let disks = Arc::make_mut(&mut self.disks.values); + match disk.variant() { + DiskVariant::U2 => { + disks.insert(disk_identity, ManagedDisk::Unmanaged(disk)); + } + DiskVariant::M2 => { + let managed_disk = Disk::new( + &self.log, + &self.disks.mount_config, + disk, + None, + Some(&self.key_requester), + ) + .await?; + disks.insert( + disk_identity, + ManagedDisk::ImplicitlyManaged(managed_disk), + ); + } + } + self.disk_updates.send_replace(self.disks.clone()); + + Ok(()) + } + + /// Delete a disk and its zpool + /// + /// Return true, if data was changed, false otherwise + /// + /// Note: We never allow removal of synthetic disks in production as they + /// are only added once. + pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) { + info!(self.log, "Removing disk"; "identity" => ?id); + let Some(entry) = self.disks.values.get(id) else { + return; + }; + let synthetic = match entry { + ManagedDisk::ExplicitlyManaged(disk) + | ManagedDisk::ImplicitlyManaged(disk) => disk.is_synthetic(), + ManagedDisk::Unmanaged(raw) => raw.is_synthetic(), + }; + + cfg_if! { + if #[cfg(test)] { + // For testing purposes, we allow synthetic disks to be deleted. + // Silence an unused variable warning. + _ = synthetic; + } else { + // In production, we disallow removal of synthetic disks as they + // are only added once. + if synthetic { + return; + } + } + } + + // Safe to unwrap as we just checked the key existed above + Arc::make_mut(&mut self.disks.values).remove(id).unwrap(); + self.disk_updates.send_replace(self.disks.clone()); + } } diff --git a/smf/sled-agent/non-gimlet/config.toml b/smf/sled-agent/non-gimlet/config.toml index 432652c50b..9efdcfbb93 100644 --- a/smf/sled-agent/non-gimlet/config.toml +++ b/smf/sled-agent/non-gimlet/config.toml @@ -18,27 +18,29 @@ sidecar_revision.soft_zone = { front_port_count = 1, rear_port_count = 1 } # in-sync, rather than querying its NTP zone. skip_timesync = false -# For testing purposes, A file-backed zpool can be manually created with the -# following: +# For testing purposes, a file-backed virtual disk can be manually created with +# the following: # -# # truncate -s 10GB testpool.vdev -# # zpool create oxp_d462a7f7-b628-40fe-80ff-4e4189e2d62b "$PWD/testpool.vdev" +# # truncate -s 10GB .vdev # -# Note that you'll need to create one such zpool for each below, with a -# different vdev for each. The `create_virtual_hardware.sh` script does this -# for you. -zpools = [ - "oxi_a462a7f7-b628-40fe-80ff-4e4189e2d62b", - "oxi_b462a7f7-b628-40fe-80ff-4e4189e2d62b", - "oxp_d462a7f7-b628-40fe-80ff-4e4189e2d62b", - "oxp_e4b4dc87-ab46-49fb-a4b4-d361ae214c03", - "oxp_f4b4dc87-ab46-49fb-a4b4-d361ae214c03", - "oxp_14b4dc87-ab46-49fb-a4b4-d361ae214c03", - "oxp_24b4dc87-ab46-49fb-a4b4-d361ae214c03", - "oxp_cd70d7f6-2354-4bf2-8012-55bf9eaf7930", - "oxp_ceb4461c-cf56-4719-ad3c-14430bfdfb60", - "oxp_31bd71cd-4736-4a12-a387-9b74b050396f", - "oxp_616b26df-e62a-4c68-b506-f4a923d8aaf7", +# Note that you'll need to create one such file for each disk below. 
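+# (For example: `truncate -s 10GB /var/tmp/u2_0.vdev`, using the default
+# VDEV_DIR of the tools scripts.)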
+# The `create_virtual_hardware.sh` script does this for you. +# +# These paths have the prefix of either "u2" or "m2", followed by an underscore, +# followed by a string that is embedded into their fake serial values. +vdevs = [ + "m2_0.vdev", + "m2_1.vdev", + + "u2_0.vdev", + "u2_1.vdev", + "u2_2.vdev", + "u2_3.vdev", + "u2_4.vdev", + "u2_5.vdev", + "u2_6.vdev", + "u2_7.vdev", + "u2_8.vdev", ] # Percentage of usable physical DRAM to use for the VMM reservoir, which diff --git a/tools/create_gimlet_virtual_hardware.sh b/tools/create_gimlet_virtual_hardware.sh index ad22cc26e7..da26bef3cd 100755 --- a/tools/create_gimlet_virtual_hardware.sh +++ b/tools/create_gimlet_virtual_hardware.sh @@ -29,4 +29,4 @@ if [[ -f "$MARKER" ]]; then fi ensure_run_as_root -ensure_zpools +ensure_vdevs diff --git a/tools/create_scrimlet_virtual_hardware.sh b/tools/create_scrimlet_virtual_hardware.sh index be7785a90d..5ae4e52258 100755 --- a/tools/create_scrimlet_virtual_hardware.sh +++ b/tools/create_scrimlet_virtual_hardware.sh @@ -60,6 +60,6 @@ function ensure_softnpu_zone { } ensure_run_as_root -ensure_zpools +ensure_vdevs ensure_uplink_vnic "$PHYSICAL_LINK" ensure_softnpu_zone diff --git a/tools/create_virtual_hardware.sh b/tools/create_virtual_hardware.sh index ef01af92bb..116032dc22 100755 --- a/tools/create_virtual_hardware.sh +++ b/tools/create_virtual_hardware.sh @@ -84,7 +84,7 @@ in the SoftNPU zone later to add those entries." } ensure_run_as_root -ensure_zpools +ensure_vdevs if [[ "$SOFTNPU_MODE" == "zone" ]]; then ensure_simulated_links "$PHYSICAL_LINK" diff --git a/tools/virtual_hardware.sh b/tools/virtual_hardware.sh index ade7ac58b3..883b98a04e 100755 --- a/tools/virtual_hardware.sh +++ b/tools/virtual_hardware.sh @@ -23,27 +23,23 @@ function fail { exit 1 } -# Create the ZFS zpools required for the sled agent, backed by file-based vdevs. -function ensure_zpools { - # Find the list of zpools the sled agent expects, from its configuration +# Create the virtual devices required by the sled agent. +function ensure_vdevs { + # Find the list of virtual devices the sled agent expects, from its configuration # file. - ZPOOL_TYPES=('oxp_' 'oxi_') - for ZPOOL_TYPE in "${ZPOOL_TYPES[@]}"; do - readarray -t ZPOOLS < <( \ - grep "\"$ZPOOL_TYPE" "$OMICRON_TOP/smf/sled-agent/non-gimlet/config.toml" | \ + VDEV_TYPES=('m2_' 'u2_') + for VDEV_TYPE in "${VDEV_TYPES[@]}"; do + readarray -t VDEVS < <( \ + grep "\"$VDEV_TYPE" "$OMICRON_TOP/smf/sled-agent/non-gimlet/config.toml" | \ sed 's/[ ",]//g' \ ) - for ZPOOL in "${ZPOOLS[@]}"; do - echo "Zpool: [$ZPOOL]" - VDEV_PATH="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev" + for VDEV in "${VDEVS[@]}"; do + echo "Device: [$VDEV]" + VDEV_PATH="${VDEV_DIR:-/var/tmp}/$VDEV" if ! [[ -f "$VDEV_PATH" ]]; then dd if=/dev/zero of="$VDEV_PATH" bs=1 count=0 seek=20G fi - success "ZFS vdev $VDEV_PATH exists" - if [[ -z "$(zpool list -o name | grep $ZPOOL)" ]]; then - zpool create -o ashift=12 -f "$ZPOOL" "$VDEV_PATH" - fi - success "ZFS zpool $ZPOOL exists" + success "vdev $VDEV_PATH exists" done done } @@ -53,7 +49,7 @@ function try_destroy_zpools { for ZPOOL_TYPE in "${ZPOOL_TYPES[@]}"; do readarray -t ZPOOLS < <(zfs list -d 0 -o name | grep "^$ZPOOL_TYPE") for ZPOOL in "${ZPOOLS[@]}"; do - VDEV_FILE="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev" + VDEV_FILE="${VDEV_DIR:-/var/tmp}/$VDEV" zfs destroy -r "$ZPOOL" && \ (zfs unmount "$ZPOOL" || true) && \ zpool destroy "$ZPOOL" && \