From 6869d92926f3782507940ba64ec91c9b3257d233 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 27 Jun 2024 17:51:15 -0700 Subject: [PATCH] Zone bundler --- sled-agent/src/long_running_tasks.rs | 2 +- sled-agent/src/sled_agent.rs | 5 ++++- sled-agent/src/zone_bundle.rs | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index e42c26591f..e920ffc3fc 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -46,7 +46,7 @@ pub struct LongRunningTaskHandles { /// for establishing zpools on disks and managing their datasets. pub storage_manager: StorageHandle, - /// A mehcanism for talking to the [`StorageMonitor`], which reacts to disk + /// A mechanism for talking to the [`StorageMonitor`], which reacts to disk /// changes and updates the dump devices. pub storage_monitor_handle: StorageMonitorHandle, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 4e2efa5657..e596dbe049 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -836,7 +836,10 @@ impl SledAgent { // to start using new disks and stop using old ones. self.inner.storage_monitor.await_generation(*our_gen).await?; - // - TODO: Update Zone bundles? + // Ensure that the ZoneBundler, if it was creating a bundle referencing + // the old U.2s, has stopped using them. + self.inner.zone_bundler.await_completion_of_prior_bundles().await; + // - TODO: Mark probes failed? // - TODO: Mark instances failed? diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 16147e5957..cd9f36fe3a 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -256,6 +256,9 @@ impl Inner { // exist; and returns those. 
async fn bundle_directories(&self) -> Vec { let resources = self.storage_handle.get_latest_disks().await; + // NOTE: These bundle directories are always stored on M.2s, so we don't + // need to worry about synchronizing with U.2 disk expungement at the + // callsite. let expected = resources.all_zone_bundle_directories(); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { @@ -426,6 +429,10 @@ impl ZoneBundler { zone: &RunningZone, cause: ZoneBundleCause, ) -> Result { + // NOTE: [Self::await_completion_of_prior_bundles] relies on this lock + // being held across this whole function. If we want more concurrency, + // we'll need to add a barrier-like mechanism to let callers know when + // prior bundles have completed. let inner = self.inner.lock().await; let storage_dirs = inner.bundle_directories().await; let resources = inner.storage_handle.get_latest_disks().await; @@ -443,6 +450,14 @@ impl ZoneBundler { create(&self.log, zone, &context).await } + /// Awaits the completion of all prior calls to [ZoneBundler::create]. + /// + /// This is critical for disk expungement, which wants to ensure that the + /// Sled Agent is no longer using devices after they have been expunged. + pub async fn await_completion_of_prior_bundles(&self) { + drop(self.inner.lock().await); + } + /// Return the paths for all bundles of the provided zone and ID. pub async fn bundle_paths( &self,