From b941323f6cae79eaaf3509173f3616f089a2b939 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 8 Feb 2024 15:40:50 -0500 Subject: [PATCH] blueprint planner: skip non-provisionable sleds (#5003) --- .../db-queries/src/db/datastore/deployment.rs | 7 +- nexus/deployment/src/blueprint_builder.rs | 10 +- nexus/deployment/src/planner.rs | 105 ++++++++++++++++++ nexus/src/app/deployment.rs | 6 +- nexus/types/src/deployment.rs | 4 + 5 files changed, 129 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 7d0df4f532..cde6e7e8c6 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -1053,6 +1053,7 @@ mod tests { use nexus_test_utils::db::test_setup_database; use nexus_types::deployment::Policy; use nexus_types::deployment::SledResources; + use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::Collection; use omicron_common::address::Ipv6Subnet; use omicron_test_utils::dev; @@ -1115,7 +1116,11 @@ mod tests { }) .collect(); let ip = ip.unwrap_or_else(|| thread_rng().gen::().into()); - SledResources { zpools, subnet: Ipv6Subnet::new(ip) } + SledResources { + provision_state: SledProvisionState::Provisionable, + zpools, + subnet: Ipv6Subnet::new(ip), + } } // Create a `Policy` that contains all the sleds found in `collection` diff --git a/nexus/deployment/src/blueprint_builder.rs b/nexus/deployment/src/blueprint_builder.rs index 248615286d..1bf46d34b2 100644 --- a/nexus/deployment/src/blueprint_builder.rs +++ b/nexus/deployment/src/blueprint_builder.rs @@ -718,6 +718,7 @@ impl<'a> BlueprintZones<'a> { #[cfg(test)] pub mod test { use super::*; + use nexus_types::external_api::views::SledProvisionState; use omicron_common::address::IpRange; use omicron_common::address::Ipv4Range; use omicron_common::address::Ipv6Subnet; @@ -890,7 +891,14 @@ pub mod test { .collect(); let subnet = Ipv6Subnet::::new(sled_ip); - policy.sleds.insert(sled_id, SledResources { zpools, subnet }); + policy.sleds.insert( + sled_id, + SledResources { + provision_state: SledProvisionState::Provisionable, + zpools, + subnet, + }, + ); sled_ip } diff --git a/nexus/deployment/src/planner.rs b/nexus/deployment/src/planner.rs index 2c054b8cb5..cbdcfd80c0 100644 --- a/nexus/deployment/src/planner.rs +++ b/nexus/deployment/src/planner.rs @@ -12,6 +12,7 @@ use crate::blueprint_builder::EnsureMultiple; use crate::blueprint_builder::Error; use nexus_types::deployment::Blueprint; use nexus_types::deployment::Policy; +use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::Collection; use slog::warn; use slog::{info, Logger}; @@ -162,6 +163,20 @@ impl<'a> Planner<'a> { } } + // We've now placed all the services that should always exist on all + // sleds. Before moving on to make decisions about placing services that + // are _not_ present on all sleds, check the provision state of all our + // sleds so we can avoid any non-provisionable sleds under the + // assumption that there is something amiss with them. + sleds_ineligible_for_services.extend( + self.policy.sleds.iter().filter_map(|(sled_id, sled_info)| { + match sled_info.provision_state { + SledProvisionState::Provisionable => None, + SledProvisionState::NonProvisionable => Some(*sled_id), + } + }), + ); + self.ensure_correct_number_of_nexus_zones( &sleds_ineligible_for_services, )?; @@ -293,6 +308,7 @@ mod test { use crate::blueprint_builder::test::verify_blueprint; use crate::blueprint_builder::BlueprintBuilder; use nexus_inventory::now_db_precision; + use nexus_types::external_api::views::SledProvisionState; use nexus_types::inventory::OmicronZoneType; use nexus_types::inventory::OmicronZonesFound; use omicron_common::api::external::Generation; @@ -627,4 +643,93 @@ mod test { logctx.cleanup_successful(); } + + /// Check that the planner will skip non-provisionable sleds when allocating + /// extra Nexus zones + #[test] + fn test_nexus_allocation_skips_nonprovisionable_sleds() { + let logctx = test_setup_log( + "planner_nexus_allocation_skips_nonprovisionable_sleds", + ); + + // Use our example inventory collection as a starting point. + let (collection, mut policy) = example(); + + // Build the initial blueprint. + let blueprint1 = BlueprintBuilder::build_initial_from_collection( + &collection, + &policy, + "the_test", + ) + .expect("failed to create initial blueprint"); + + // This blueprint should only have 3 Nexus zones: one on each sled. + assert_eq!(blueprint1.omicron_zones.len(), 3); + for sled_config in blueprint1.omicron_zones.values() { + assert_eq!( + sled_config + .zones + .iter() + .filter(|z| z.zone_type.is_nexus()) + .count(), + 1 + ); + } + + // Arbitrarily choose one of the sleds and mark it non-provisionable. + let nonprovisionable_sled_id = { + let (sled_id, resources) = + policy.sleds.iter_mut().next().expect("no sleds"); + resources.provision_state = SledProvisionState::NonProvisionable; + *sled_id + }; + + // Now run the planner with a high number of target Nexus zones. + policy.target_nexus_zone_count = 14; + let blueprint2 = Planner::new_based_on( + logctx.log.clone(), + &blueprint1, + &policy, + "add more Nexus", + &collection, + ) + .expect("failed to create planner") + .plan() + .expect("failed to plan"); + + let diff = blueprint1.diff(&blueprint2); + println!("1 -> 2 (added additional Nexus zones):\n{}", diff); + assert_eq!(diff.sleds_added().count(), 0); + assert_eq!(diff.sleds_removed().count(), 0); + let sleds = diff.sleds_changed().collect::>(); + + // Only 2 of the 3 sleds should get additional Nexus zones. We expect a + // total of 11 new Nexus zones, which should be spread evenly across the + // two sleds (one gets 6 and the other gets 5), while the + // non-provisionable sled should be unchanged. + assert_eq!(sleds.len(), 2); + let mut total_new_nexus_zones = 0; + for (sled_id, sled_changes) in sleds { + assert!(sled_id != nonprovisionable_sled_id); + assert_eq!(sled_changes.zones_removed().count(), 0); + assert_eq!(sled_changes.zones_changed().count(), 0); + let zones = sled_changes.zones_added().collect::>(); + match zones.len() { + n @ (5 | 6) => { + total_new_nexus_zones += n; + } + n => { + panic!("unexpected number of zones added to {sled_id}: {n}") + } + } + for zone in &zones { + let OmicronZoneType::Nexus { .. } = zone.zone_type else { + panic!("unexpectedly added a non-Crucible zone: {zone:?}"); + }; + } + } + assert_eq!(total_new_nexus_zones, 11); + + logctx.cleanup_successful(); + } } diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 5818d1aa12..b8cb6deabf 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -171,7 +171,11 @@ impl super::Nexus { let zpools = zpools_by_sled_id .remove(&sled_id) .unwrap_or_else(BTreeSet::new); - let sled_info = SledResources { subnet, zpools }; + let sled_info = SledResources { + provision_state: sled_row.provision_state().into(), + subnet, + zpools, + }; (sled_id, sled_info) }) .collect(); diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 7438e2199a..06427507d5 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -11,6 +11,7 @@ //! nexus/deployment does not currently know about nexus/db-model and it's //! convenient to separate these concerns.) +use crate::external_api::views::SledProvisionState; use crate::inventory::Collection; pub use crate::inventory::NetworkInterface; pub use crate::inventory::NetworkInterfaceKind; @@ -63,6 +64,9 @@ pub struct Policy { /// Describes the resources available on each sled for the planner #[derive(Debug, Clone)] pub struct SledResources { + /// provision state of this sled + pub provision_state: SledProvisionState, + /// zpools on this sled /// /// (used to allocate storage for control plane zones with persistent