[sled-agent] Stop self-managing physical disks #5987

Merged: 9 commits, Jul 15, 2024
sled-storage/src/manager.rs (234 changes: 1 addition & 233 deletions)
@@ -8,9 +8,7 @@ use std::collections::HashSet;

use crate::config::MountConfig;
use crate::dataset::{DatasetName, CONFIG_DATASET};
use crate::disk::{
OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, RawDisk,
};
use crate::disk::{OmicronPhysicalDisksConfig, RawDisk};
use crate::error::Error;
use crate::resources::{AllDisks, DisksManagementResult, StorageResources};
use camino::Utf8PathBuf;
@@ -589,91 +587,11 @@ impl StorageManager {
self.resources.set_config(&ledger.data().disks);
} else {
info!(self.log, "KeyManager ready, but no ledger detected");
let mut synthetic_config =
self.resources.get_config().values().cloned().collect();
// TODO(https://github.com/oxidecomputer/omicron/issues/5328): Once
// we are confident that we have migrated to a world where this
// ledger is universally used, we should remove the following
// kludge. The sled agent should not need to "self-manage" anything!
let changed = self
.self_manage_disks_with_zpools(&mut synthetic_config)
.await?;
if !changed {
info!(self.log, "No disks to be automatically managed");
return Ok(());
}
info!(self.log, "auto-managed disks"; "count" => synthetic_config.len());
self.resources.set_config(&synthetic_config);
}

Ok(())
}

// NOTE: What follows is an exceptional case: one where we have
// no record of "Control Plane Physical Disks", but we have zpools
// on our U.2s, and we want to use them regardless.
//
// THIS WOULD NORMALLY BE INCORRECT BEHAVIOR. In the future, these
// zpools will not be "automatically imported", and instead, we'll
// let Nexus decide whether or not to reformat the disks.
//
// However, because we are transitioning from "the set of disks /
// zpools is implicit" to a world where that set is explicit, this
// is a necessary transitional tool.
//
// Returns "true" if the synthetic_config has changed.
async fn self_manage_disks_with_zpools(
&mut self,
synthetic_config: &mut Vec<OmicronPhysicalDiskConfig>,
) -> Result<bool, Error> {
let mut changed = false;
for (identity, disk) in self.resources.disks().values.iter() {
match disk {
crate::resources::ManagedDisk::Unmanaged(raw) => {
let zpool_path = match raw.u2_zpool_path() {
Ok(zpool_path) => zpool_path,
Err(err) => {
info!(self.log, "Cannot find zpool path"; "identity" => ?identity, "err" => ?err);
continue;
}
};

let zpool_name =
match sled_hardware::disk::check_if_zpool_exists(
&zpool_path,
) {
Ok(zpool_name) => zpool_name,
Err(err) => {
info!(self.log, "Zpool does not exist"; "identity" => ?identity, "err" => ?err);
continue;
}
};

info!(self.log, "Found existing zpool on device without ledger";
"identity" => ?identity,
"zpool" => ?zpool_name);

// We found an unmanaged disk with a zpool, even though
// we have no prior record of a ledger of control-plane
// disks.
synthetic_config.push(
// These disks don't have a control-plane UUID --
// report "nil" until they're overwritten with real
// values.
OmicronPhysicalDiskConfig {
identity: identity.clone(),
id: Uuid::nil(),
pool_id: zpool_name.id(),
},
);
changed = true;
}
_ => continue,
}
}
Ok(changed)
}

// Makes a U.2 disk managed by the control plane within [`StorageResources`].
async fn omicron_physical_disks_ensure(
&mut self,
@@ -911,10 +829,8 @@ mod tests {

use super::*;
use camino_tempfile::tempdir_in;
use omicron_common::api::external::Generation;
use omicron_common::ledger;
use omicron_test_utils::dev::test_setup_log;
use omicron_uuid_kinds::ZpoolUuid;
use sled_hardware::DiskFirmware;
use std::sync::atomic::Ordering;
use uuid::Uuid;
@@ -1390,154 +1306,6 @@ mod tests {
harness.cleanup().await;
logctx.cleanup_successful();
}

#[tokio::test]
async fn ledgerless_to_ledgered_migration() {
illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst);
let logctx = test_setup_log("ledgerless_to_ledgered_migration");
let mut harness = StorageManagerTestHarness::new(&logctx.log).await;

// Test setup: Create two U.2s and an M.2
let raw_disks = harness
.add_vdevs(&[
"u2_under_test.vdev",
"u2_that_shows_up_late.vdev",
"m2_helping.vdev",
])
.await;

// First, we format the U.2s to have a zpool. This should work, even
// without looping in the StorageManager.
let first_u2 = &raw_disks[0];
let first_pool_id = ZpoolUuid::new_v4();
let _disk = crate::disk::Disk::new(
&logctx.log,
&harness.mount_config(),
first_u2.clone(),
Some(first_pool_id),
Some(harness.key_requester()),
)
.await
.expect("Failed to format U.2");

let second_u2 = &raw_disks[1];
let second_pool_id = ZpoolUuid::new_v4();
let _disk = crate::disk::Disk::new(
&logctx.log,
&harness.mount_config(),
second_u2.clone(),
Some(second_pool_id),
Some(harness.key_requester()),
)
.await
.expect("Failed to format U.2");

// Because we did that formatting "behind the back" of the
// StorageManager, we should see no evidence of the U.2 being managed.
//
// This currently matches the format of "existing systems, which were
// initialized before the storage ledger was created".

// We should still see no ledger.
let result = harness.handle().omicron_physical_disks_list().await;
assert!(matches!(result, Err(Error::LedgerNotFound)), "{:?}", result);

// We should also not see any managed U.2s.
let disks = harness.handle().get_latest_disks().await;
assert!(disks.all_u2_zpools().is_empty());

// Leave one of the U.2s attached, but "remove" the other one.
harness.remove_vdev(second_u2).await;

// When the system activates, we should see a single Zpool, and
// "auto-manage" it.
harness.handle().key_manager_ready().await;

// It might take a moment for synchronization to be handled by the
// background task, but we'll eventually see the U.2 zpool.
//
// This is the equivalent of us "loading a zpool, even though
// it was not backed by a ledger".
let tt = TimeTravel::new();
tt.enough_to_start_synchronization().await;
while harness
.handle_mut()
.wait_for_changes()
.await
.all_u2_zpools()
.is_empty()
{
info!(&logctx.log, "Waiting for U.2 to automatically show up");
}
let u2s = harness.handle().get_latest_disks().await.all_u2_zpools();
assert_eq!(u2s.len(), 1, "{:?}", u2s);

// If we attach the second U.2 -- the equivalent of it appearing after
// the key manager is ready -- it'll also be included in the set of
// auto-managed U.2s.
harness.add_vdev_as(second_u2.clone()).await;
tt.enough_to_start_synchronization().await;
while harness
.handle_mut()
.wait_for_changes()
.await
.all_u2_zpools()
.len()
== 1
{
info!(&logctx.log, "Waiting for U.2 to automatically show up");
}
let u2s = harness.handle().get_latest_disks().await.all_u2_zpools();
assert_eq!(u2s.len(), 2, "{:?}", u2s);

// This is the equivalent of the "/omicron-physical-disks GET" API,
// which Nexus might use to contact this sled.
//
// This means that we'll bootstrap the sled successfully, but report a
// 404 if Nexus asks us for the latest configuration.
let result = harness.handle().omicron_physical_disks_list().await;
assert!(matches!(result, Err(Error::LedgerNotFound)), "{:?}", result);

// At this point, Nexus may want to explicitly tell sled agent which
// disks it should use. This is the equivalent of invoking
// "/omicron-physical-disks PUT".
let mut disks = vec![
OmicronPhysicalDiskConfig {
identity: first_u2.identity().clone(),
id: Uuid::new_v4(),
pool_id: first_pool_id,
},
OmicronPhysicalDiskConfig {
identity: second_u2.identity().clone(),
id: Uuid::new_v4(),
pool_id: second_pool_id,
},
];
// Sort the disks to ensure the "output" matches the "input" when we
// query later.
disks.sort_by(|a, b| a.identity.partial_cmp(&b.identity).unwrap());
let config =
OmicronPhysicalDisksConfig { generation: Generation::new(), disks };
let result = harness
.handle()
.omicron_physical_disks_ensure(config.clone())
.await
.expect("Failed to ensure disks with 'new' Config");
assert!(!result.has_error(), "{:?}", result);

let observed_config = harness
.handle()
.omicron_physical_disks_list()
.await
.expect("Failed to retreive config after ensuring it");
assert_eq!(observed_config, config);

let u2s = harness.handle().get_latest_disks().await.all_u2_zpools();
assert_eq!(u2s.len(), 2, "{:?}", u2s);

harness.cleanup().await;
logctx.cleanup_successful();
}
}
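
For context on what replaces the deleted auto-import path: after this change the sled agent only adopts U.2s that the control plane names explicitly via "/omicron-physical-disks PUT". The following is a minimal sketch of that flow, mirroring the removed ledgerless_to_ledgered_migration test above; the harness, handle, and type names come from that test, and anything beyond them is an assumption for illustration, not part of this diff.

    // Sketch only: build an explicit disk config and hand it to the sled agent,
    // instead of relying on the deleted self_manage_disks_with_zpools() kludge.
    use omicron_common::api::external::Generation;
    use uuid::Uuid;

    let disks = vec![OmicronPhysicalDiskConfig {
        identity: first_u2.identity().clone(), // identity of a physical U.2
        id: Uuid::new_v4(),                    // control-plane UUID, now supplied explicitly
        pool_id: first_pool_id,                // zpool expected on that disk
    }];
    let config =
        OmicronPhysicalDisksConfig { generation: Generation::new(), disks };

    // The sled agent manages exactly the disks named in `config`; unmanaged
    // zpools are no longer imported automatically when the key manager is ready.
    let result = harness
        .handle()
        .omicron_physical_disks_ensure(config.clone())
        .await
        .expect("Failed to ensure disks with explicit config");
    assert!(!result.has_error(), "{:?}", result);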
