From 031b5ecb6bebae76100d1f9240d258c0dcbbc91e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Jul 2024 13:34:50 -0700 Subject: [PATCH] [sled-agent] Stop self-managing physical disks (#5987) Fixes https://github.com/oxidecomputer/omicron/issues/5328 This was an old kludge for backwards compatibility, we should be able to rely on ledgers explicitly for this purpose. --- sled-storage/src/manager.rs | 234 +----------------------------------- 1 file changed, 1 insertion(+), 233 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 9e31568e00..d374ab8e23 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -8,9 +8,7 @@ use std::collections::HashSet; use crate::config::MountConfig; use crate::dataset::{DatasetName, CONFIG_DATASET}; -use crate::disk::{ - OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, RawDisk, -}; +use crate::disk::{OmicronPhysicalDisksConfig, RawDisk}; use crate::error::Error; use crate::resources::{AllDisks, DisksManagementResult, StorageResources}; use camino::Utf8PathBuf; @@ -589,91 +587,11 @@ impl StorageManager { self.resources.set_config(&ledger.data().disks); } else { info!(self.log, "KeyManager ready, but no ledger detected"); - let mut synthetic_config = - self.resources.get_config().values().cloned().collect(); - // TODO(https://github.com/oxidecomputer/omicron/issues/5328): Once - // we are confident that we have migrated to a world where this - // ledger is universally used, we should remove the following - // kludge. The sled agent should not need to "self-manage" anything! - let changed = self - .self_manage_disks_with_zpools(&mut synthetic_config) - .await?; - if !changed { - info!(self.log, "No disks to be automatically managed"); - return Ok(()); - } - info!(self.log, "auto-managed disks"; "count" => synthetic_config.len()); - self.resources.set_config(&synthetic_config); } Ok(()) } - // NOTE: What follows is an exceptional case: one where we have - // no record of "Control Plane Physical Disks", but we have zpools - // on our U.2s, and we want to use them regardless. - // - // THIS WOULD NORMALLY BE INCORRECT BEHAVIOR. In the future, these - // zpools will not be "automatically imported", and instead, we'll - // let Nexus decide whether or not to reformat the disks. - // - // However, because we are transitioning from "the set of disks / - // zpools is implicit" to a world where that set is explicit, this - // is a necessary transitional tool. - // - // Returns "true" if the synthetic_config has changed. - async fn self_manage_disks_with_zpools( - &mut self, - synthetic_config: &mut Vec, - ) -> Result { - let mut changed = false; - for (identity, disk) in self.resources.disks().values.iter() { - match disk { - crate::resources::ManagedDisk::Unmanaged(raw) => { - let zpool_path = match raw.u2_zpool_path() { - Ok(zpool_path) => zpool_path, - Err(err) => { - info!(self.log, "Cannot find zpool path"; "identity" => ?identity, "err" => ?err); - continue; - } - }; - - let zpool_name = - match sled_hardware::disk::check_if_zpool_exists( - &zpool_path, - ) { - Ok(zpool_name) => zpool_name, - Err(err) => { - info!(self.log, "Zpool does not exist"; "identity" => ?identity, "err" => ?err); - continue; - } - }; - - info!(self.log, "Found existing zpool on device without ledger"; - "identity" => ?identity, - "zpool" => ?zpool_name); - - // We found an unmanaged disk with a zpool, even though - // we have no prior record of a ledger of control-plane - // disks. - synthetic_config.push( - // These disks don't have a control-plane UUID -- - // report "nil" until they're overwritten with real - // values. - OmicronPhysicalDiskConfig { - identity: identity.clone(), - id: Uuid::nil(), - pool_id: zpool_name.id(), - }, - ); - changed = true; - } - _ => continue, - } - } - Ok(changed) - } - // Makes an U.2 disk managed by the control plane within [`StorageResources`]. async fn omicron_physical_disks_ensure( &mut self, @@ -911,10 +829,8 @@ mod tests { use super::*; use camino_tempfile::tempdir_in; - use omicron_common::api::external::Generation; use omicron_common::ledger; use omicron_test_utils::dev::test_setup_log; - use omicron_uuid_kinds::ZpoolUuid; use sled_hardware::DiskFirmware; use std::sync::atomic::Ordering; use uuid::Uuid; @@ -1390,154 +1306,6 @@ mod tests { harness.cleanup().await; logctx.cleanup_successful(); } - - #[tokio::test] - async fn ledgerless_to_ledgered_migration() { - illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); - let logctx = test_setup_log("ledgerless_to_ledgered_migration"); - let mut harness = StorageManagerTestHarness::new(&logctx.log).await; - - // Test setup: Create two U.2s and an M.2 - let raw_disks = harness - .add_vdevs(&[ - "u2_under_test.vdev", - "u2_that_shows_up_late.vdev", - "m2_helping.vdev", - ]) - .await; - - // First, we format the U.2s to have a zpool. This should work, even - // without looping in the StorageManager. - let first_u2 = &raw_disks[0]; - let first_pool_id = ZpoolUuid::new_v4(); - let _disk = crate::disk::Disk::new( - &logctx.log, - &harness.mount_config(), - first_u2.clone(), - Some(first_pool_id), - Some(harness.key_requester()), - ) - .await - .expect("Failed to format U.2"); - - let second_u2 = &raw_disks[1]; - let second_pool_id = ZpoolUuid::new_v4(); - let _disk = crate::disk::Disk::new( - &logctx.log, - &harness.mount_config(), - second_u2.clone(), - Some(second_pool_id), - Some(harness.key_requester()), - ) - .await - .expect("Failed to format U.2"); - - // Because we did that formatting "behind the back" of the - // StorageManager, we should see no evidence of the U.2 being managed. - // - // This currently matches the format of "existing systems, which were - // initialized before the storage ledger was created". - - // We should still see no ledger. - let result = harness.handle().omicron_physical_disks_list().await; - assert!(matches!(result, Err(Error::LedgerNotFound)), "{:?}", result); - - // We should also not see any managed U.2s. - let disks = harness.handle().get_latest_disks().await; - assert!(disks.all_u2_zpools().is_empty()); - - // Leave one of the U.2s attached, but "remove" the other one. - harness.remove_vdev(second_u2).await; - - // When the system activates, we should see a single Zpool, and - // "auto-manage" it. - harness.handle().key_manager_ready().await; - - // It might take a moment for synchronization to be handled by the - // background task, but we'll eventually see the U.2 zpool. - // - // This is the equivalent of us "loading a zpool, even though - // it was not backed by a ledger". - let tt = TimeTravel::new(); - tt.enough_to_start_synchronization().await; - while harness - .handle_mut() - .wait_for_changes() - .await - .all_u2_zpools() - .is_empty() - { - info!(&logctx.log, "Waiting for U.2 to automatically show up"); - } - let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); - assert_eq!(u2s.len(), 1, "{:?}", u2s); - - // If we attach the second U.2 -- the equivalent of it appearing after - // the key manager is ready -- it'll also be included in the set of - // auto-maanged U.2s. - harness.add_vdev_as(second_u2.clone()).await; - tt.enough_to_start_synchronization().await; - while harness - .handle_mut() - .wait_for_changes() - .await - .all_u2_zpools() - .len() - == 1 - { - info!(&logctx.log, "Waiting for U.2 to automatically show up"); - } - let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); - assert_eq!(u2s.len(), 2, "{:?}", u2s); - - // This is the equivalent of the "/omicron-physical-disks GET" API, - // which Nexus might use to contact this sled. - // - // This means that we'll bootstrap the sled successfully, but report a - // 404 if nexus asks us for the latest configuration. - let result = harness.handle().omicron_physical_disks_list().await; - assert!(matches!(result, Err(Error::LedgerNotFound),), "{:?}", result); - - // At this point, Nexus may want to explicitly tell sled agent which - // disks it should use. This is the equivalent of invoking - // "/omicron-physical-disks PUT". - let mut disks = vec![ - OmicronPhysicalDiskConfig { - identity: first_u2.identity().clone(), - id: Uuid::new_v4(), - pool_id: first_pool_id, - }, - OmicronPhysicalDiskConfig { - identity: second_u2.identity().clone(), - id: Uuid::new_v4(), - pool_id: second_pool_id, - }, - ]; - // Sort the disks to ensure the "output" matches the "input" when we - // query later. - disks.sort_by(|a, b| a.identity.partial_cmp(&b.identity).unwrap()); - let config = - OmicronPhysicalDisksConfig { generation: Generation::new(), disks }; - let result = harness - .handle() - .omicron_physical_disks_ensure(config.clone()) - .await - .expect("Failed to ensure disks with 'new' Config"); - assert!(!result.has_error(), "{:?}", result); - - let observed_config = harness - .handle() - .omicron_physical_disks_list() - .await - .expect("Failed to retreive config after ensuring it"); - assert_eq!(observed_config, config); - - let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); - assert_eq!(u2s.len(), 2, "{:?}", u2s); - - harness.cleanup().await; - logctx.cleanup_successful(); - } } #[cfg(test)]