[sled-agent] Stop self-managing physical disks (#5987)
Fixes #5328

This was an old kludge for backwards compatibility; we should be able to
rely on ledgers explicitly for this purpose.
smklein authored Jul 15, 2024
1 parent 8c6afd1 commit 031b5ec
Showing 1 changed file with 1 addition and 233 deletions.
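For context, the ledger mentioned in the commit message stores an `OmicronPhysicalDisksConfig`: a generation number plus the explicit list of control-plane disks. Below is a rough sketch of that shape, reconstructed from the types used in the diff; the `DiskIdentity` stand-in and the exact field types are assumptions, not the real definitions.

```rust
use omicron_common::api::external::Generation;
use omicron_uuid_kinds::ZpoolUuid;
use uuid::Uuid;

// Stand-in for the real DiskIdentity type (vendor/serial/model of the
// physical device); its definition is not shown in this diff.
struct DiskIdentity {
    vendor: String,
    serial: String,
    model: String,
}

// Sketch: the per-disk record Nexus hands to sled-agent.
struct OmicronPhysicalDiskConfig {
    identity: DiskIdentity, // which physical device this record refers to
    id: Uuid,               // control-plane UUID for the disk
    pool_id: ZpoolUuid,     // zpool expected to live on this disk
}

// Sketch: the document stored in the ledger as a whole.
struct OmicronPhysicalDisksConfig {
    generation: Generation, // bumped by Nexus on every change
    disks: Vec<OmicronPhysicalDiskConfig>,
}
```

After this change, sled-agent treats that ledger as the sole source of truth for which disks it should manage.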
234 changes: 1 addition & 233 deletions sled-storage/src/manager.rs
@@ -8,9 +8,7 @@ use std::collections::HashSet;

use crate::config::MountConfig;
use crate::dataset::{DatasetName, CONFIG_DATASET};
use crate::disk::{
OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, RawDisk,
};
use crate::disk::{OmicronPhysicalDisksConfig, RawDisk};
use crate::error::Error;
use crate::resources::{AllDisks, DisksManagementResult, StorageResources};
use camino::Utf8PathBuf;
@@ -589,91 +587,11 @@ impl StorageManager {
self.resources.set_config(&ledger.data().disks);
} else {
info!(self.log, "KeyManager ready, but no ledger detected");
let mut synthetic_config =
self.resources.get_config().values().cloned().collect();
// TODO(https://github.com/oxidecomputer/omicron/issues/5328): Once
// we are confident that we have migrated to a world where this
// ledger is universally used, we should remove the following
// kludge. The sled agent should not need to "self-manage" anything!
let changed = self
.self_manage_disks_with_zpools(&mut synthetic_config)
.await?;
if !changed {
info!(self.log, "No disks to be automatically managed");
return Ok(());
}
info!(self.log, "auto-managed disks"; "count" => synthetic_config.len());
self.resources.set_config(&synthetic_config);
}

Ok(())
}

// NOTE: What follows is an exceptional case: one where we have
// no record of "Control Plane Physical Disks", but we have zpools
// on our U.2s, and we want to use them regardless.
//
// THIS WOULD NORMALLY BE INCORRECT BEHAVIOR. In the future, these
// zpools will not be "automatically imported", and instead, we'll
// let Nexus decide whether or not to reformat the disks.
//
// However, because we are transitioning from "the set of disks /
// zpools is implicit" to a world where that set is explicit, this
// is a necessary transitional tool.
//
// Returns "true" if the synthetic_config has changed.
async fn self_manage_disks_with_zpools(
&mut self,
synthetic_config: &mut Vec<OmicronPhysicalDiskConfig>,
) -> Result<bool, Error> {
let mut changed = false;
for (identity, disk) in self.resources.disks().values.iter() {
match disk {
crate::resources::ManagedDisk::Unmanaged(raw) => {
let zpool_path = match raw.u2_zpool_path() {
Ok(zpool_path) => zpool_path,
Err(err) => {
info!(self.log, "Cannot find zpool path"; "identity" => ?identity, "err" => ?err);
continue;
}
};

let zpool_name =
match sled_hardware::disk::check_if_zpool_exists(
&zpool_path,
) {
Ok(zpool_name) => zpool_name,
Err(err) => {
info!(self.log, "Zpool does not exist"; "identity" => ?identity, "err" => ?err);
continue;
}
};

info!(self.log, "Found existing zpool on device without ledger";
"identity" => ?identity,
"zpool" => ?zpool_name);

// We found an unmanaged disk with a zpool, even though
// we have no prior record of a ledger of control-plane
// disks.
synthetic_config.push(
// These disks don't have a control-plane UUID --
// report "nil" until they're overwritten with real
// values.
OmicronPhysicalDiskConfig {
identity: identity.clone(),
id: Uuid::nil(),
pool_id: zpool_name.id(),
},
);
changed = true;
}
_ => continue,
}
}
Ok(changed)
}

// Makes a U.2 disk managed by the control plane within [`StorageResources`].
async fn omicron_physical_disks_ensure(
&mut self,
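With the kludge deleted, the `else` branch above reduces to a log line: if no ledger exists, the sled adopts nothing and waits for an explicit config. The following is a minimal sketch of the resulting flow, pieced together from the surviving context lines in this hunk; the method name and the `load_ledger` helper are assumptions, not the actual implementation.

```rust
impl StorageManager {
    async fn key_manager_ready(&mut self) -> Result<(), Error> {
        // Hypothetical helper standing in for however the ledger is read
        // from the config datasets.
        let maybe_ledger = self.load_ledger().await;
        if let Some(ledger) = maybe_ledger {
            // A ledger exists: manage exactly the disks it records.
            self.resources.set_config(&ledger.data().disks);
        } else {
            // No ledger: adopt nothing. Disks only become managed once
            // Nexus pushes a config via omicron_physical_disks_ensure.
            info!(self.log, "KeyManager ready, but no ledger detected");
        }
        Ok(())
    }
}
```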
@@ -911,10 +829,8 @@ mod tests {

use super::*;
use camino_tempfile::tempdir_in;
use omicron_common::api::external::Generation;
use omicron_common::ledger;
use omicron_test_utils::dev::test_setup_log;
use omicron_uuid_kinds::ZpoolUuid;
use sled_hardware::DiskFirmware;
use std::sync::atomic::Ordering;
use uuid::Uuid;
@@ -1390,154 +1306,6 @@ mod tests {
harness.cleanup().await;
logctx.cleanup_successful();
}

#[tokio::test]
async fn ledgerless_to_ledgered_migration() {
illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst);
let logctx = test_setup_log("ledgerless_to_ledgered_migration");
let mut harness = StorageManagerTestHarness::new(&logctx.log).await;

// Test setup: Create two U.2s and an M.2
let raw_disks = harness
.add_vdevs(&[
"u2_under_test.vdev",
"u2_that_shows_up_late.vdev",
"m2_helping.vdev",
])
.await;

// First, we format the U.2s to have a zpool. This should work, even
// without looping in the StorageManager.
let first_u2 = &raw_disks[0];
let first_pool_id = ZpoolUuid::new_v4();
let _disk = crate::disk::Disk::new(
&logctx.log,
&harness.mount_config(),
first_u2.clone(),
Some(first_pool_id),
Some(harness.key_requester()),
)
.await
.expect("Failed to format U.2");

let second_u2 = &raw_disks[1];
let second_pool_id = ZpoolUuid::new_v4();
let _disk = crate::disk::Disk::new(
&logctx.log,
&harness.mount_config(),
second_u2.clone(),
Some(second_pool_id),
Some(harness.key_requester()),
)
.await
.expect("Failed to format U.2");

// Because we did that formatting "behind the back" of the
// StorageManager, we should see no evidence of the U.2 being managed.
//
// This currently matches the format of "existing systems, which were
// initialized before the storage ledger was created".

// We should still see no ledger.
let result = harness.handle().omicron_physical_disks_list().await;
assert!(matches!(result, Err(Error::LedgerNotFound)), "{:?}", result);

// We should also not see any managed U.2s.
let disks = harness.handle().get_latest_disks().await;
assert!(disks.all_u2_zpools().is_empty());

// Leave one of the U.2s attached, but "remove" the other one.
harness.remove_vdev(second_u2).await;

// When the system activates, we should see a single Zpool, and
// "auto-manage" it.
harness.handle().key_manager_ready().await;

// It might take a moment for synchronization to be handled by the
// background task, but we'll eventually see the U.2 zpool.
//
// This is the equivalent of us "loading a zpool, even though
// it was not backed by a ledger".
let tt = TimeTravel::new();
tt.enough_to_start_synchronization().await;
while harness
.handle_mut()
.wait_for_changes()
.await
.all_u2_zpools()
.is_empty()
{
info!(&logctx.log, "Waiting for U.2 to automatically show up");
}
let u2s = harness.handle().get_latest_disks().await.all_u2_zpools();
assert_eq!(u2s.len(), 1, "{:?}", u2s);

// If we attach the second U.2 -- the equivalent of it appearing after
// the key manager is ready -- it'll also be included in the set of
// auto-managed U.2s.
harness.add_vdev_as(second_u2.clone()).await;
tt.enough_to_start_synchronization().await;
while harness
.handle_mut()
.wait_for_changes()
.await
.all_u2_zpools()
.len()
== 1
{
info!(&logctx.log, "Waiting for U.2 to automatically show up");
}
let u2s = harness.handle().get_latest_disks().await.all_u2_zpools();
assert_eq!(u2s.len(), 2, "{:?}", u2s);

// This is the equivalent of the "/omicron-physical-disks GET" API,
// which Nexus might use to contact this sled.
//
// This means that we'll bootstrap the sled successfully, but report a
// 404 if nexus asks us for the latest configuration.
let result = harness.handle().omicron_physical_disks_list().await;
assert!(matches!(result, Err(Error::LedgerNotFound),), "{:?}", result);

// At this point, Nexus may want to explicitly tell sled agent which
// disks it should use. This is the equivalent of invoking
// "/omicron-physical-disks PUT".
let mut disks = vec![
OmicronPhysicalDiskConfig {
identity: first_u2.identity().clone(),
id: Uuid::new_v4(),
pool_id: first_pool_id,
},
OmicronPhysicalDiskConfig {
identity: second_u2.identity().clone(),
id: Uuid::new_v4(),
pool_id: second_pool_id,
},
];
// Sort the disks to ensure the "output" matches the "input" when we
// query later.
disks.sort_by(|a, b| a.identity.partial_cmp(&b.identity).unwrap());
let config =
OmicronPhysicalDisksConfig { generation: Generation::new(), disks };
let result = harness
.handle()
.omicron_physical_disks_ensure(config.clone())
.await
.expect("Failed to ensure disks with 'new' Config");
assert!(!result.has_error(), "{:?}", result);

let observed_config = harness
.handle()
.omicron_physical_disks_list()
.await
.expect("Failed to retreive config after ensuring it");
assert_eq!(observed_config, config);

let u2s = harness.handle().get_latest_disks().await.all_u2_zpools();
assert_eq!(u2s.len(), 2, "{:?}", u2s);

harness.cleanup().await;
logctx.cleanup_successful();
}
}

#[cfg(test)]
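With the ledgerless-to-ledgered migration test removed, the only supported way for U.2s to become managed is the explicit flow that the end of that test exercised: build an `OmicronPhysicalDisksConfig` and push it to the sled, the equivalent of "/omicron-physical-disks PUT". The sketch below is condensed from the removed test; the harness, the raw disks, and the pool IDs are assumed to be set up as in that test.

```rust
// Build the explicit config Nexus would send.
let mut disks = vec![
    OmicronPhysicalDiskConfig {
        identity: first_u2.identity().clone(),
        id: Uuid::new_v4(),
        pool_id: first_pool_id,
    },
    OmicronPhysicalDiskConfig {
        identity: second_u2.identity().clone(),
        id: Uuid::new_v4(),
        pool_id: second_pool_id,
    },
];
// Sort so the config read back later compares equal to the one ensured.
disks.sort_by(|a, b| a.identity.partial_cmp(&b.identity).unwrap());
let config =
    OmicronPhysicalDisksConfig { generation: Generation::new(), disks };

// Equivalent of "/omicron-physical-disks PUT": this writes the ledger.
let result = harness
    .handle()
    .omicron_physical_disks_ensure(config.clone())
    .await
    .expect("Failed to ensure disks");
assert!(!result.has_error(), "{:?}", result);

// Equivalent of "/omicron-physical-disks GET": the ledger now exists,
// so listing returns exactly the config that was ensured.
let observed = harness
    .handle()
    .omicron_physical_disks_list()
    .await
    .expect("Failed to list disks");
assert_eq!(observed, config);
```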
