From 5229effef30ba61a15f84cb637069242ce470538 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Mon, 22 Apr 2024 16:47:40 -0700
Subject: [PATCH] [nexus] Expunge zones on expunged disks, only provision
 crucible to in-service disks

---
 .../planning/src/blueprint_builder/builder.rs | 25 ++++++++++++
 nexus/reconfigurator/planning/src/planner.rs  | 38 +++++++++++++++----
 nexus/types/src/deployment/zone_type.rs       | 30 +++++++++++++++
 3 files changed, 85 insertions(+), 8 deletions(-)

diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
index 8335815b4a..fa3d3d656b 100644
--- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
+++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
@@ -404,6 +404,19 @@ impl<'a> BlueprintBuilder<'a> {
         &mut self,
         sled_id: SledUuid,
         reason: ZoneExpungeReason,
+    ) -> Result<BTreeSet<OmicronZoneUuid>, Error> {
+        self.expunge_all_zones_for_sled_where(sled_id, reason, |_| true)
+    }
+
+    /// Expunges all zones from a sled where `filter_fn` returns `true`.
+    ///
+    /// Returns a list of zone IDs expunged (excluding zones that were already
+    /// expunged). If the list is empty, then the operation was a no-op.
+    pub(crate) fn expunge_all_zones_for_sled_where(
+        &mut self,
+        sled_id: SledUuid,
+        reason: ZoneExpungeReason,
+        filter_fn: impl Fn(&BlueprintZoneConfig) -> bool,
     ) -> Result<BTreeSet<OmicronZoneUuid>, Error> {
         let log = self.log.new(o!(
             "sled_id" => sled_id.to_string(),
@@ -414,6 +427,11 @@ impl<'a> BlueprintBuilder<'a> {
 
         let sled_zones = self.zones.current_sled_zones(sled_id);
         for (z, state) in sled_zones {
+            // Only consider zones for which the filter returns true.
+            if !filter_fn(z) {
+                continue;
+            }
+
             let is_expunged =
                 is_already_expunged(z, state).map_err(|error| {
                     Error::Planner(anyhow!(error).context(format!(
@@ -435,6 +453,12 @@ impl<'a> BlueprintBuilder<'a> {
         }
 
         match reason {
+            ZoneExpungeReason::DiskExpunged => {
+                info!(
+                    &log,
+                    "expunged disk with non-expunged zone(s) was found"
+                );
+            }
             ZoneExpungeReason::SledDecommissioned { policy } => {
                 // A sled marked as decommissioned should have no resources
                 // allocated to it. If it does, it's an illegal state, possibly
@@ -468,6 +492,7 @@ impl<'a> BlueprintBuilder<'a> {
 
         // Finally, add a comment describing what happened.
         let reason = match reason {
+            ZoneExpungeReason::DiskExpunged => "zone was using expunged disk",
            ZoneExpungeReason::SledDecommissioned { .. } => {
                 "sled state is decommissioned"
             }
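
Note: the builder change above follows a common pattern: the unfiltered entry point (`expunge_all_zones_for_sled`) delegates to a filtered variant with an always-true predicate, and the filtered variant reports only the zones it newly expunged. Below is a minimal, self-contained sketch of that pattern; `Zone`, `ZoneDisposition`, and `Builder` are simplified stand-ins for illustration, not the omicron types.

```rust
// Sketch of the delegate-to-filtered-variant pattern used above.
// All types here are simplified stand-ins, not the omicron definitions.

#[derive(Debug, Clone, Copy, PartialEq)]
enum ZoneDisposition {
    InService,
    Expunged,
}

#[derive(Debug)]
struct Zone {
    id: u32,
    disposition: ZoneDisposition,
}

struct Builder {
    zones: Vec<Zone>,
}

impl Builder {
    /// Expunges every zone; equivalent to calling the filtered variant
    /// with a predicate that always matches.
    fn expunge_all(&mut self) -> Vec<u32> {
        self.expunge_all_where(|_| true)
    }

    /// Expunges zones matching `filter_fn`, returning the IDs of zones
    /// that were newly expunged (already-expunged zones are skipped, so
    /// an empty result means the call was a no-op).
    fn expunge_all_where(
        &mut self,
        filter_fn: impl Fn(&Zone) -> bool,
    ) -> Vec<u32> {
        let mut expunged = Vec::new();
        for zone in &mut self.zones {
            // Only consider zones for which the filter returns true.
            if !filter_fn(zone) {
                continue;
            }
            // Skip zones that are already expunged, so the returned list
            // reflects only this call's work.
            if zone.disposition == ZoneDisposition::Expunged {
                continue;
            }
            zone.disposition = ZoneDisposition::Expunged;
            expunged.push(zone.id);
        }
        expunged
    }
}

fn main() {
    let mut builder = Builder {
        zones: vec![
            Zone { id: 1, disposition: ZoneDisposition::InService },
            Zone { id: 2, disposition: ZoneDisposition::Expunged },
        ],
    };
    // Only zone 1 is newly expunged; zone 2 was already expunged.
    assert_eq!(builder.expunge_all_where(|z| z.id != 3), vec![1]);
    // Nothing in service remains, so the unfiltered variant is a no-op.
    assert_eq!(builder.expunge_all(), Vec::<u32>::new());
}
```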
diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs
index 7fb7ee784a..0352ad33bf 100644
--- a/nexus/reconfigurator/planning/src/planner.rs
+++ b/nexus/reconfigurator/planning/src/planner.rs
@@ -85,18 +85,39 @@ impl<'a> Planner<'a> {
 
     fn do_plan_expunge(&mut self) -> Result<(), Error> {
         // Remove services from sleds marked expunged. We use `SledFilter::All`
-        // and have a custom `needs_zone_expungement` function that allows us
+        // and have a custom `needs_all_zones_expungement` function that allows us
         // to produce better errors.
         for (sled_id, sled_details) in self.input.all_sleds(SledFilter::All) {
             // Does this sled need zone expungement based on the details?
-            let Some(reason) =
-                needs_zone_expungement(sled_details.state, sled_details.policy)
-            else {
+            if let Some(reason) = needs_all_zones_expungement(
+                sled_details.state,
+                sled_details.policy,
+            ) {
+                // Perform the expungement.
+                self.blueprint.expunge_all_zones_for_sled(sled_id, reason)?;
+
+                // In this case, all zones have been expunged. We can jump to
+                // the next sled.
                 continue;
-            };
+            }
 
-            // Perform the expungement.
-            self.blueprint.expunge_all_zones_for_sled(sled_id, reason)?;
+            // The sled isn't being decommissioned, but there still might be
+            // some zones that need expungement.
+            //
+            // Remove any zones which rely on expunged disks.
+            self.blueprint.expunge_all_zones_for_sled_where(
+                sled_id,
+                ZoneExpungeReason::DiskExpunged,
+                |zone_config| {
+                    let durable_storage_zpool =
+                        match zone_config.zone_type.zpool() {
+                            Some(pool) => pool,
+                            None => return false,
+                        };
+                    let zpool_id = durable_storage_zpool.id();
+                    !sled_details.resources.zpool_is_provisionable(&zpool_id)
+                },
+            )?;
         }
 
         Ok(())
@@ -360,7 +381,7 @@
 
 /// Returns `Some(reason)` if the sled needs its zones to be expunged,
 /// based on the policy and state.
-fn needs_zone_expungement(
+fn needs_all_zones_expungement(
     state: SledState,
     policy: SledPolicy,
 ) -> Option<ZoneExpungeReason> {
@@ -387,6 +408,7 @@
 /// logical flow.
 #[derive(Copy, Clone, Debug)]
 pub(crate) enum ZoneExpungeReason {
+    DiskExpunged,
     SledDecommissioned { policy: SledPolicy },
     SledExpunged,
 }
diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs
index a258ab53a1..76989b7738 100644
--- a/nexus/types/src/deployment/zone_type.rs
+++ b/nexus/types/src/deployment/zone_type.rs
@@ -33,6 +33,36 @@ pub enum BlueprintZoneType {
 }
 
 impl BlueprintZoneType {
+    /// Returns the zpool being used by this zone, if any.
+    pub fn zpool(&self) -> Option<&omicron_common::zpool_name::ZpoolName> {
+        match self {
+            BlueprintZoneType::ExternalDns(
+                blueprint_zone_type::ExternalDns { dataset, .. },
+            )
+            | BlueprintZoneType::Clickhouse(
+                blueprint_zone_type::Clickhouse { dataset, .. },
+            )
+            | BlueprintZoneType::ClickhouseKeeper(
+                blueprint_zone_type::ClickhouseKeeper { dataset, .. },
+            )
+            | BlueprintZoneType::CockroachDb(
+                blueprint_zone_type::CockroachDb { dataset, .. },
+            )
+            | BlueprintZoneType::Crucible(blueprint_zone_type::Crucible {
+                dataset,
+                ..
+            })
+            | BlueprintZoneType::InternalDns(
+                blueprint_zone_type::InternalDns { dataset, .. },
+            ) => Some(&dataset.pool_name),
+            BlueprintZoneType::BoundaryNtp(_)
+            | BlueprintZoneType::InternalNtp(_)
+            | BlueprintZoneType::Nexus(_)
+            | BlueprintZoneType::Oximeter(_)
+            | BlueprintZoneType::CruciblePantry(_) => None,
+        }
+    }
+
     pub fn external_ip(&self) -> Option<OmicronZoneExternalIp> {
         match self {
             BlueprintZoneType::Nexus(nexus) => Some(nexus.external_ip),
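
Note: taken together, the planner change selects zones whose durable dataset lives on a zpool that is no longer provisionable (via the new `zpool()` accessor) and expunges them with `ZoneExpungeReason::DiskExpunged`. Below is a minimal sketch of that selection logic under simplified assumptions; `ZoneType`, `ZpoolId`, and `needs_disk_expungement` are illustrative stand-ins, not the omicron definitions.

```rust
use std::collections::BTreeSet;

// Simplified stand-in for a pool identifier; omicron uses typed UUIDs.
type ZpoolId = u32;

// Stand-in zone type: some zones carry a durable dataset on a specific
// pool, others (like Nexus) have no durable storage at all.
enum ZoneType {
    Crucible { pool: ZpoolId },
    CockroachDb { pool: ZpoolId },
    Nexus,
}

impl ZoneType {
    /// Returns the zpool backing this zone's durable dataset, if any,
    /// mirroring the shape of the `zpool()` accessor added above.
    fn zpool(&self) -> Option<ZpoolId> {
        match self {
            ZoneType::Crucible { pool } | ZoneType::CockroachDb { pool } => {
                Some(*pool)
            }
            ZoneType::Nexus => None,
        }
    }
}

/// Predicate in the spirit of the planner's closure: a zone needs
/// expungement when its durable pool is no longer provisionable.
/// Zones with no durable dataset never match.
fn needs_disk_expungement(
    zone: &ZoneType,
    provisionable_pools: &BTreeSet<ZpoolId>,
) -> bool {
    match zone.zpool() {
        Some(pool) => !provisionable_pools.contains(&pool),
        None => false,
    }
}

fn main() {
    let provisionable: BTreeSet<ZpoolId> = [1, 2].into_iter().collect();

    // Pool 9 was expunged, so a Crucible zone on it should go too.
    let on_bad_pool = ZoneType::Crucible { pool: 9 };
    assert!(needs_disk_expungement(&on_bad_pool, &provisionable));

    // A zone on an in-service pool is left alone.
    let on_good_pool = ZoneType::CockroachDb { pool: 1 };
    assert!(!needs_disk_expungement(&on_good_pool, &provisionable));

    // A zone with no durable dataset is never selected by this filter.
    assert!(!needs_disk_expungement(&ZoneType::Nexus, &provisionable));
}
```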