diff --git a/Cargo.lock b/Cargo.lock index 356b5b0fc7..8a932372bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2678,9 +2678,9 @@ checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" [[package]] name = "filetime" -version = "0.2.24" +version = "0.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf401df4a4e3872c4fe8151134cf483738e74b67fc934d6532c882b3d24a4550" +checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586" dependencies = [ "cfg-if", "libc", @@ -11518,6 +11518,7 @@ dependencies = [ "camino", "camino-tempfile", "cancel-safe-futures", + "chrono", "clap", "debug-ignore", "derive-where", diff --git a/Cargo.toml b/Cargo.toml index 8acf5c9a35..10839dc1c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -348,7 +348,7 @@ dyn-clone = "1.0.17" either = "1.13.0" expectorate = "1.1.0" fatfs = "0.3.6" -filetime = "0.2.24" +filetime = "0.2.25" flate2 = "1.0.31" float-ord = "0.3.2" flume = "0.11.0" diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 58b32cb1f3..7f5d098f94 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -34,6 +34,7 @@ use nexus_saga_recovery::LastPass; use nexus_types::deployment::Blueprint; use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; +use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; @@ -1661,6 +1662,30 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!("{}", table); } } + } else if name == "region_snapshot_replacement_finish" { + match serde_json::from_value::( + details.clone(), + ) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + + Ok(status) => { + println!( + " total records transitioned to done: {}", + status.records_set_to_done.len(), + ); + for line in &status.records_set_to_done { + println!(" > {line}"); + } + + println!(" errors: {}", status.errors.len()); + for line in &status.errors { + println!(" > {line}"); + } + } + } } else { println!( "warning: unknown background task: {:?} \ @@ -1690,16 +1715,17 @@ fn bgtask_apply_kv_style(table: &mut tabled::Table) { ); } -// Extract and remove the event report. (If we don't do this, the `Debug` -// output can be quite large.) +/// Extract and remove the event report, returning None if it wasn't found and +/// an error if something else went wrong. (If we don't do this, the `Debug` +/// output can be quite large.) fn extract_event_buffer( value: &mut serde_json::Value, -) -> anyhow::Result> { +) -> anyhow::Result>> { let Some(obj) = value.as_object_mut() else { bail!("expected value to be an object") }; let Some(event_report) = obj.remove("event_report") else { - bail!("expected 'event_report' field in value") + return Ok(None); }; // Try deserializing the event report generically. We could deserialize to @@ -1714,19 +1740,22 @@ fn extract_event_buffer( let mut event_buffer = EventBuffer::default(); event_buffer.add_event_report(event_report); - Ok(event_buffer) + Ok(Some(event_buffer)) } // Make a short summary of the current state of an execution based on an event // buffer, and add it to the table. fn push_event_buffer_summary( - event_buffer: anyhow::Result>, + event_buffer: anyhow::Result>>, builder: &mut tabled::builder::Builder, ) { match event_buffer { - Ok(buffer) => { + Ok(Some(buffer)) => { event_buffer_summary_impl(buffer, builder); } + Ok(None) => { + builder.push_record(["status:", "(no event report found)"]); + } Err(error) => { builder.push_record([ "event report error:".to_string(), diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 2774a5d734..c57a9c9dce 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -127,6 +127,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_finish" + complete a region snapshot replacement if all the steps are done + + task: "region_snapshot_replacement_garbage_collection" clean up all region snapshot replacement step volumes @@ -289,6 +293,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_finish" + complete a region snapshot replacement if all the steps are done + + task: "region_snapshot_replacement_garbage_collection" clean up all region snapshot replacement step volumes @@ -438,6 +446,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_finish" + complete a region snapshot replacement if all the steps are done + + task: "region_snapshot_replacement_garbage_collection" clean up all region snapshot replacement step volumes diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 757b4e8888..e7720dd12c 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -343,6 +343,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "region_snapshot_replacement_finish" + complete a region snapshot replacement if all the steps are done + + task: "region_snapshot_replacement_garbage_collection" clean up all region snapshot replacement step volumes @@ -594,6 +598,14 @@ task: "region_replacement_driver" number of region replacement finish sagas started ok: 0 number of errors: 0 +task: "region_snapshot_replacement_finish" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total records transitioned to done: 0 + errors: 0 + task: "region_snapshot_replacement_garbage_collection" configured period: every s currently executing: no @@ -1012,6 +1024,14 @@ task: "region_replacement_driver" number of region replacement finish sagas started ok: 0 number of errors: 0 +task: "region_snapshot_replacement_finish" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total records transitioned to done: 0 + errors: 0 + task: "region_snapshot_replacement_garbage_collection" configured period: every s currently executing: no diff --git a/end-to-end-tests/src/bin/bootstrap.rs b/end-to-end-tests/src/bin/bootstrap.rs index b02bed4265..4c41d30bdc 100644 --- a/end-to-end-tests/src/bin/bootstrap.rs +++ b/end-to-end-tests/src/bin/bootstrap.rs @@ -8,8 +8,8 @@ use oxide_client::types::{ NameOrId, SiloQuotasUpdate, }; use oxide_client::{ - ClientDisksExt, ClientHiddenExt, ClientProjectsExt, - ClientSystemNetworkingExt, ClientSystemSilosExt, + ClientDisksExt, ClientHiddenExt, ClientProjectsExt, ClientSystemIpPoolsExt, + ClientSystemSilosExt, }; use serde::{de::DeserializeOwned, Deserialize}; use std::time::Duration; diff --git a/end-to-end-tests/src/bin/commtest.rs b/end-to-end-tests/src/bin/commtest.rs index 05e15faafc..5711a52179 100644 --- a/end-to-end-tests/src/bin/commtest.rs +++ b/end-to-end-tests/src/bin/commtest.rs @@ -9,7 +9,7 @@ use oxide_client::{ UsernamePasswordCredentials, }, ClientHiddenExt, ClientLoginExt, ClientProjectsExt, - ClientSystemHardwareExt, ClientSystemNetworkingExt, ClientSystemStatusExt, + ClientSystemHardwareExt, ClientSystemIpPoolsExt, ClientSystemStatusExt, ClientVpcsExt, }; use std::{ diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index b3d189691c..7f2726cc59 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -398,6 +398,9 @@ pub struct BackgroundTaskConfig { RegionSnapshotReplacementGarbageCollectionConfig, /// configuration for region snapshot replacement step task pub region_snapshot_replacement_step: RegionSnapshotReplacementStepConfig, + /// configuration for region snapshot replacement finisher task + pub region_snapshot_replacement_finish: + RegionSnapshotReplacementFinishConfig, } #[serde_as] @@ -658,6 +661,14 @@ pub struct RegionSnapshotReplacementStepConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct RegionSnapshotReplacementFinishConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -908,6 +919,7 @@ mod test { region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 + region_snapshot_replacement_finish.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1082,6 +1094,10 @@ mod test { RegionSnapshotReplacementStepConfig { period_secs: Duration::from_secs(30), }, + region_snapshot_replacement_finish: + RegionSnapshotReplacementFinishConfig { + period_secs: Duration::from_secs(30), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -1161,6 +1177,7 @@ mod test { region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 + region_snapshot_replacement_finish.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index e63b155fc6..4c181ef3a2 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -142,6 +142,7 @@ lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 +region_snapshot_replacement_finish.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index bca3f7f2c4..d25408e6e3 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -128,6 +128,7 @@ lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 +region_snapshot_replacement_finish.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/external-api/output/nexus_tags.txt b/nexus/external-api/output/nexus_tags.txt index bde11e2de3..3024eb8ac4 100644 --- a/nexus/external-api/output/nexus_tags.txt +++ b/nexus/external-api/output/nexus_tags.txt @@ -146,11 +146,7 @@ sled_view GET /v1/system/hardware/sleds/{sle switch_list GET /v1/system/hardware/switches switch_view GET /v1/system/hardware/switches/{switch_id} -API operations found with tag "system/metrics" -OPERATION ID METHOD URL PATH -system_metric GET /v1/system/metrics/{metric_name} - -API operations found with tag "system/networking" +API operations found with tag "system/ip-pools" OPERATION ID METHOD URL PATH ip_pool_create POST /v1/system/ip-pools ip_pool_delete DELETE /v1/system/ip-pools/{pool} @@ -169,6 +165,13 @@ ip_pool_silo_update PUT /v1/system/ip-pools/{pool}/sil ip_pool_update PUT /v1/system/ip-pools/{pool} ip_pool_utilization_view GET /v1/system/ip-pools/{pool}/utilization ip_pool_view GET /v1/system/ip-pools/{pool} + +API operations found with tag "system/metrics" +OPERATION ID METHOD URL PATH +system_metric GET /v1/system/metrics/{metric_name} + +API operations found with tag "system/networking" +OPERATION ID METHOD URL PATH networking_address_lot_block_list GET /v1/system/networking/address-lot/{address_lot}/blocks networking_address_lot_create POST /v1/system/networking/address-lot networking_address_lot_delete DELETE /v1/system/networking/address-lot/{address_lot} diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index 669b25145f..2c8ec38c7e 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -175,6 +175,12 @@ pub const API_VERSION: &str = "20240821.0"; url = "http://docs.oxide.computer/api/system-metrics" } }, + "system/ip-pools" = { + description = "IP pools are collections of external IPs that can be assigned to silos. When a pool is linked to a silo, users in that silo can allocate IPs from the pool for their instances.", + external_docs = { + url = "http://docs.oxide.computer/api/system-ip-pools" + } + }, "system/networking" = { description = "This provides rack-level network configuration.", external_docs = { @@ -630,7 +636,7 @@ pub trait NexusExternalApi { #[endpoint { method = GET, path = "/v1/system/ip-pools", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_list( rqctx: RequestContext, @@ -641,7 +647,7 @@ pub trait NexusExternalApi { #[endpoint { method = POST, path = "/v1/system/ip-pools", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_create( rqctx: RequestContext, @@ -652,7 +658,7 @@ pub trait NexusExternalApi { #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_view( rqctx: RequestContext, @@ -663,7 +669,7 @@ pub trait NexusExternalApi { #[endpoint { method = DELETE, path = "/v1/system/ip-pools/{pool}", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_delete( rqctx: RequestContext, @@ -674,7 +680,7 @@ pub trait NexusExternalApi { #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_update( rqctx: RequestContext, @@ -686,7 +692,7 @@ pub trait NexusExternalApi { #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/utilization", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_utilization_view( rqctx: RequestContext, @@ -697,7 +703,7 @@ pub trait NexusExternalApi { #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/silos", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_silo_list( rqctx: RequestContext, @@ -723,7 +729,7 @@ pub trait NexusExternalApi { #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/silos", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_silo_link( rqctx: RequestContext, @@ -737,7 +743,7 @@ pub trait NexusExternalApi { #[endpoint { method = DELETE, path = "/v1/system/ip-pools/{pool}/silos/{silo}", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_silo_unlink( rqctx: RequestContext, @@ -754,7 +760,7 @@ pub trait NexusExternalApi { #[endpoint { method = PUT, path = "/v1/system/ip-pools/{pool}/silos/{silo}", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_silo_update( rqctx: RequestContext, @@ -766,7 +772,7 @@ pub trait NexusExternalApi { #[endpoint { method = GET, path = "/v1/system/ip-pools-service", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_service_view( rqctx: RequestContext, @@ -778,7 +784,7 @@ pub trait NexusExternalApi { #[endpoint { method = GET, path = "/v1/system/ip-pools/{pool}/ranges", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_range_list( rqctx: RequestContext, @@ -792,7 +798,7 @@ pub trait NexusExternalApi { #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/add", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_range_add( rqctx: RequestContext, @@ -804,7 +810,7 @@ pub trait NexusExternalApi { #[endpoint { method = POST, path = "/v1/system/ip-pools/{pool}/ranges/remove", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_range_remove( rqctx: RequestContext, @@ -818,7 +824,7 @@ pub trait NexusExternalApi { #[endpoint { method = GET, path = "/v1/system/ip-pools-service/ranges", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_service_range_list( rqctx: RequestContext, @@ -831,7 +837,7 @@ pub trait NexusExternalApi { #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/add", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_service_range_add( rqctx: RequestContext, @@ -842,7 +848,7 @@ pub trait NexusExternalApi { #[endpoint { method = POST, path = "/v1/system/ip-pools-service/ranges/remove", - tags = ["system/networking"], + tags = ["system/ip-pools"], }] async fn ip_pool_service_range_remove( rqctx: RequestContext, diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs index 93c845add5..4594ff9fed 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs @@ -103,8 +103,8 @@ impl<'a> BuilderExternalNetworking<'a> { ExternalIpAllocator::new(input.service_ip_pool_ranges()); let mut used_macs: HashSet = HashSet::new(); - for (_, z) in - parent_blueprint.all_omicron_zones(BlueprintZoneFilter::All) + for (_, z) in parent_blueprint + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) { let zone_type = &z.zone_type; match zone_type { diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 86aa00fb52..e0da9428ee 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -9,13 +9,10 @@ use crate::system::SledBuilder; use crate::system::SystemDescription; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneFilter; -use nexus_types::deployment::OmicronZoneNic; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::SledFilter; use nexus_types::inventory::Collection; -use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledKind; -use omicron_uuid_kinds::VnicUuid; use typed_rng::TypedUuidRng; pub struct ExampleSystem { @@ -95,37 +92,15 @@ impl ExampleSystem { system.to_collection_builder().expect("failed to build collection"); builder.set_rng_seed((test_name, "ExampleSystem collection")); - for sled_id in blueprint.sleds() { - let Some(zones) = blueprint.blueprint_zones.get(&sled_id) else { - continue; - }; - for zone in zones.zones.iter() { - let service_id = zone.id; - if let Some((external_ip, nic)) = - zone.zone_type.external_networking() - { - input_builder - .add_omicron_zone_external_ip(service_id, external_ip) - .expect("failed to add Omicron zone external IP"); - input_builder - .add_omicron_zone_nic( - service_id, - OmicronZoneNic { - // TODO-cleanup use `TypedUuid` everywhere - id: VnicUuid::from_untyped_uuid(nic.id), - mac: nic.mac, - ip: nic.ip, - slot: nic.slot, - primary: nic.primary, - }, - ) - .expect("failed to add Omicron zone NIC"); - } - } + input_builder + .update_network_resources_from_blueprint(&blueprint) + .expect("failed to add network resources from blueprint"); + + for (sled_id, zones) in &blueprint.blueprint_zones { builder .found_sled_omicron_zones( "fake sled agent", - sled_id, + *sled_id, zones.to_omicron_zones_config( BlueprintZoneFilter::ShouldBeRunning, ), diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 7149aecb85..8dcad21df1 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -1249,6 +1249,99 @@ mod test { logctx.cleanup_successful(); } + /// Check that the planner will reuse external IPs that were previously + /// assigned to expunged zones + #[test] + fn test_reuse_external_ips_from_expunged_zones() { + static TEST_NAME: &str = + "planner_reuse_external_ips_from_expunged_zones"; + let logctx = test_setup_log(TEST_NAME); + + // Use our example system as a starting point. + let (collection, input, blueprint1) = + example(&logctx.log, TEST_NAME, DEFAULT_N_SLEDS); + + // Expunge the first sled we see, which will result in a Nexus external + // IP no longer being associated with a running zone, and a new Nexus + // zone being added to one of the two remaining sleds. + let mut builder = input.into_builder(); + let (sled_id, details) = + builder.sleds_mut().iter_mut().next().expect("no sleds"); + let sled_id = *sled_id; + details.policy = SledPolicy::Expunged; + let input = builder.build(); + let blueprint2 = Planner::new_based_on( + logctx.log.clone(), + &blueprint1, + &input, + "test_blueprint2", + &collection, + ) + .expect("failed to create planner") + .with_rng_seed((TEST_NAME, "bp2")) + .plan() + .expect("failed to plan"); + + let diff = blueprint2.diff_since_blueprint(&blueprint1); + println!("1 -> 2 (expunged sled):\n{}", diff.display()); + + // The expunged sled should have an expunged Nexus zone. + let zone = blueprint2.blueprint_zones[&sled_id] + .zones + .iter() + .find(|zone| matches!(zone.zone_type, BlueprintZoneType::Nexus(_))) + .expect("no nexus zone found"); + assert_eq!(zone.disposition, BlueprintZoneDisposition::Expunged); + + // Set the target Nexus zone count to one that will completely exhaust + // the service IP pool. This will force reuse of the IP that was + // allocated to the expunged Nexus zone. + let mut builder = input.into_builder(); + builder.update_network_resources_from_blueprint(&blueprint2).unwrap(); + assert_eq!(builder.policy_mut().service_ip_pool_ranges.len(), 1); + builder.policy_mut().target_nexus_zone_count = + builder.policy_mut().service_ip_pool_ranges[0] + .len() + .try_into() + .unwrap(); + let input = builder.build(); + let blueprint3 = Planner::new_based_on( + logctx.log.clone(), + &blueprint2, + &input, + "test_blueprint3", + &collection, + ) + .expect("failed to create planner") + .with_rng_seed((TEST_NAME, "bp3")) + .plan() + .expect("failed to plan"); + + let diff = blueprint3.diff_since_blueprint(&blueprint2); + println!("2 -> 3 (maximum Nexus):\n{}", diff.display()); + + // Planning succeeded, but let's prove that we reused the IP address! + let expunged_ip = zone.zone_type.external_networking().unwrap().0.ip(); + let new_zone = blueprint3 + .blueprint_zones + .values() + .flat_map(|c| &c.zones) + .find(|zone| { + zone.disposition == BlueprintZoneDisposition::InService + && zone + .zone_type + .external_networking() + .map_or(false, |(ip, _)| expunged_ip == ip.ip()) + }) + .expect("couldn't find that the external IP was reused"); + println!( + "zone {} reused external IP {} from expunged zone {}", + new_zone.id, expunged_ip, zone.id + ); + + logctx.cleanup_successful(); + } + #[test] fn test_crucible_allocation_skips_nonprovisionable_disks() { static TEST_NAME: &str = @@ -1604,6 +1697,10 @@ mod test { }; println!("1 -> 2: decommissioned {decommissioned_sled_id}"); + // Because we marked zones as expunged, we need to update the networking + // config in the planning input. + builder.update_network_resources_from_blueprint(&blueprint1).unwrap(); + // Now run the planner with a high number of target Nexus zones. The // number (9) is chosen such that: // @@ -1877,6 +1974,7 @@ mod test { // Remove the now-decommissioned sled from the planning input. let mut builder = input.into_builder(); + builder.update_network_resources_from_blueprint(&blueprint2).unwrap(); builder.sleds_mut().remove(&expunged_sled_id); let input = builder.build(); diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index ae4309d8f9..fedb74b81b 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -108,6 +108,7 @@ use super::tasks::phantom_disks; use super::tasks::physical_disk_adoption; use super::tasks::region_replacement; use super::tasks::region_replacement_driver; +use super::tasks::region_snapshot_replacement_finish::*; use super::tasks::region_snapshot_replacement_garbage_collect::*; use super::tasks::region_snapshot_replacement_start::*; use super::tasks::region_snapshot_replacement_step::*; @@ -167,6 +168,7 @@ pub struct BackgroundTasks { pub task_region_snapshot_replacement_start: Activator, pub task_region_snapshot_replacement_garbage_collection: Activator, pub task_region_snapshot_replacement_step: Activator, + pub task_region_snapshot_replacement_finish: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. These background tasks are implementation details as far as @@ -252,6 +254,7 @@ impl BackgroundTasksInitializer { task_region_snapshot_replacement_garbage_collection: Activator::new( ), task_region_snapshot_replacement_step: Activator::new(), + task_region_snapshot_replacement_finish: Activator::new(), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -316,6 +319,7 @@ impl BackgroundTasksInitializer { task_region_snapshot_replacement_start, task_region_snapshot_replacement_garbage_collection, task_region_snapshot_replacement_step, + task_region_snapshot_replacement_finish, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. @@ -780,7 +784,7 @@ impl BackgroundTasksInitializer { replacement, and run the step saga for them", period: config.region_snapshot_replacement_step.period_secs, task_impl: Box::new(RegionSnapshotReplacementFindAffected::new( - datastore, + datastore.clone(), sagas.clone(), )), opctx: opctx.child(BTreeMap::new()), @@ -788,6 +792,20 @@ impl BackgroundTasksInitializer { activator: task_region_snapshot_replacement_step, }); + driver.register(TaskDefinition { + name: "region_snapshot_replacement_finish", + description: + "complete a region snapshot replacement if all the steps are \ + done", + period: config.region_snapshot_replacement_finish.period_secs, + task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( + datastore, + )), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_region_snapshot_replacement_finish, + }); + driver } } diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 6089ba8d65..6cbba0a07b 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -25,6 +25,7 @@ pub mod phantom_disks; pub mod physical_disk_adoption; pub mod region_replacement; pub mod region_replacement_driver; +pub mod region_snapshot_replacement_finish; pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs new file mode 100644 index 0000000000..134995d848 --- /dev/null +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -0,0 +1,332 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting when a region snapshot replacement has all its +//! steps done, and finishing it. +//! +//! Once all related region snapshot replacement steps are done, the region +//! snapshot replacement can be completed. + +use crate::app::background::BackgroundTask; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus; +use serde_json::json; +use std::sync::Arc; + +pub struct RegionSnapshotReplacementFinishDetector { + datastore: Arc, +} + +impl RegionSnapshotReplacementFinishDetector { + pub fn new(datastore: Arc) -> Self { + RegionSnapshotReplacementFinishDetector { datastore } + } + + async fn transition_requests_to_done( + &self, + opctx: &OpContext, + status: &mut RegionSnapshotReplacementFinishStatus, + ) { + let log = &opctx.log; + + // Find all region snapshot replacement requests in state "Running" + let requests = match self + .datastore + .get_running_region_snapshot_replacements(opctx) + .await + { + Ok(requests) => requests, + + Err(e) => { + let s = format!( + "query for region snapshot replacement requests \ + failed: {e}", + ); + error!(&log, "{s}"); + status.errors.push(s); + + return; + } + }; + + for request in requests { + // Count associated region snapshot replacement steps that are not + // completed. + let count = match self + .datastore + .in_progress_region_snapshot_replacement_steps( + opctx, request.id, + ) + .await + { + Ok(count) => count, + + Err(e) => { + let s = format!( + "counting incomplete region snapshot replacement \ + steps failed: {e}", + ); + error!(&log, "{s}"); + status.errors.push(s); + + continue; + } + }; + + if count == 0 { + // If the region snapshot has been deleted, then the snapshot + // replacement is done: the reference number went to zero and it + // was deleted, therefore there aren't any volumes left that + // reference it! + match self + .datastore + .region_snapshot_get( + request.old_dataset_id, + request.old_region_id, + request.old_snapshot_id, + ) + .await + { + Ok(Some(_)) => { + info!( + &log, + "region snapshot still exists"; + "request.old_dataset_id" => %request.old_dataset_id, + "request.old_region_id" => %request.old_region_id, + "request.old_snapshot_id" => %request.old_snapshot_id, + ); + continue; + } + + Ok(None) => { + // gone! + } + + Err(e) => { + let s = format!( + "error querying for region snapshot {} {} {}: {e}", + request.old_dataset_id, + request.old_region_id, + request.old_snapshot_id, + ); + error!(&log, "{s}"); + status.errors.push(s); + + continue; + } + }; + + // Transition region snapshot replacement to Complete + match self + .datastore + .set_region_snapshot_replacement_complete(opctx, request.id) + .await + { + Ok(()) => { + let s = format!("set request {} to done", request.id); + info!(&log, "{s}"); + status.records_set_to_done.push(s); + } + + Err(e) => { + let s = format!( + "marking snapshot replacement as done failed: {e}" + ); + error!(&log, "{s}"); + status.errors.push(s); + } + } + } + } + } +} + +impl BackgroundTask for RegionSnapshotReplacementFinishDetector { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async move { + let mut status = RegionSnapshotReplacementFinishStatus::default(); + + self.transition_requests_to_done(opctx, &mut status).await; + + json!(status) + } + .boxed() + } +} + +#[cfg(test)] +mod test { + use super::*; + use nexus_db_model::RegionSnapshotReplacement; + use nexus_db_model::RegionSnapshotReplacementStep; + use nexus_db_model::RegionSnapshotReplacementStepState; + use nexus_test_utils_macros::nexus_test; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_done_region_snapshot_replacement_causes_finish( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let mut task = + RegionSnapshotReplacementFinishDetector::new(datastore.clone()); + + // Noop test + let result: RegionSnapshotReplacementFinishStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementFinishStatus::default()); + + // Add a region snapshot replacement request for a fake region snapshot. + + let dataset_id = Uuid::new_v4(); + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + // Do not add the fake region snapshot to the database, as it should + // have been deleted by the time the request transitions to "Running" + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, + request, + Uuid::new_v4(), + ) + .await + .unwrap(); + + // Transition that to Allocating -> ReplacementDone -> DeletingOldVolume + // -> Running + + let operating_saga_id = Uuid::new_v4(); + + datastore + .set_region_snapshot_replacement_allocating( + &opctx, + request_id, + operating_saga_id, + ) + .await + .unwrap(); + + let new_region_id = Uuid::new_v4(); + let old_snapshot_volume_id = Uuid::new_v4(); + + datastore + .set_region_snapshot_replacement_replacement_done( + &opctx, + request_id, + operating_saga_id, + new_region_id, + old_snapshot_volume_id, + ) + .await + .unwrap(); + + datastore + .set_region_snapshot_replacement_deleting_old_volume( + &opctx, + request_id, + operating_saga_id, + ) + .await + .unwrap(); + + datastore + .set_region_snapshot_replacement_running( + &opctx, + request_id, + operating_saga_id, + ) + .await + .unwrap(); + + // Insert a few steps, not all finished yet + + let operating_saga_id = Uuid::new_v4(); + + let mut step_1 = + RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + step_1.replacement_state = RegionSnapshotReplacementStepState::Complete; + step_1.operating_saga_id = Some(operating_saga_id); + let step_1_id = step_1.id; + + let mut step_2 = + RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + step_2.replacement_state = RegionSnapshotReplacementStepState::Complete; + step_2.operating_saga_id = Some(operating_saga_id); + let step_2_id = step_2.id; + + datastore + .insert_region_snapshot_replacement_step(&opctx, step_1) + .await + .unwrap(); + datastore + .insert_region_snapshot_replacement_step(&opctx, step_2) + .await + .unwrap(); + + // Activate the task, it should do nothing yet + + let result: RegionSnapshotReplacementFinishStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementFinishStatus::default()); + + // Transition one record to Complete, the task should still do nothing + + datastore + .set_region_snapshot_replacement_step_volume_deleted( + &opctx, step_1_id, + ) + .await + .unwrap(); + + let result: RegionSnapshotReplacementFinishStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementFinishStatus::default()); + + // Transition the other record to Complete + + datastore + .set_region_snapshot_replacement_step_volume_deleted( + &opctx, step_2_id, + ) + .await + .unwrap(); + + // Activate the task - it should pick the request up, change the state, + // and try to run the region snapshot replacement finish saga + let result: RegionSnapshotReplacementFinishStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + assert_eq!( + result, + RegionSnapshotReplacementFinishStatus { + records_set_to_done: vec![format!( + "set request {request_id} to done" + )], + errors: vec![], + }, + ); + } +} diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index d78e304b75..cd13a56642 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -413,12 +413,6 @@ impl BackgroundTask for RegionSnapshotReplacementFindAffected { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async move { - let log = &opctx.log; - info!( - &log, - "region snapshot replacement find affected volumes task started" - ); - let mut status = RegionSnapshotReplacementStepStatus::default(); // Importantly, clean old steps up before finding affected volumes! @@ -436,11 +430,6 @@ impl BackgroundTask for RegionSnapshotReplacementFindAffected { self.invoke_step_saga_for_affected_volumes(opctx, &mut status) .await; - info!( - &log, - "region snapshot replacement find affected volumes task done" - ); - json!(status) } .boxed() diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 6859e992ca..bd338469e0 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -140,6 +140,7 @@ instance_updater.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 +region_snapshot_replacement_finish.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 04a85bc179..a2eec5ca8a 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -6,6 +6,8 @@ //! blueprints. use super::AddNetworkResourceError; +use super::Blueprint; +use super::BlueprintZoneFilter; use super::OmicronZoneExternalIp; use super::OmicronZoneNetworkResources; use super::OmicronZoneNic; @@ -16,6 +18,7 @@ use crate::external_api::views::SledProvisionPolicy; use crate::external_api::views::SledState; use clap::ValueEnum; use ipnetwork::IpNetwork; +use newtype_uuid::GenericUuid; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; @@ -25,6 +28,7 @@ use omicron_common::disk::DiskIdentity; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::VnicUuid; use omicron_uuid_kinds::ZpoolUuid; use schemars::JsonSchema; use serde::Deserialize; @@ -859,6 +863,35 @@ impl PlanningInputBuilder { &mut self.network_resources } + pub fn update_network_resources_from_blueprint( + &mut self, + blueprint: &Blueprint, + ) -> Result<(), PlanningInputBuildError> { + self.network_resources = OmicronZoneNetworkResources::new(); + for (_, zone) in + blueprint.all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + { + let service_id = zone.id; + if let Some((external_ip, nic)) = + zone.zone_type.external_networking() + { + self.add_omicron_zone_external_ip(service_id, external_ip)?; + self.add_omicron_zone_nic( + service_id, + OmicronZoneNic { + // TODO-cleanup use `TypedUuid` everywhere + id: VnicUuid::from_untyped_uuid(nic.id), + mac: nic.mac, + ip: nic.ip, + slot: nic.slot, + primary: nic.primary, + }, + )?; + } + } + Ok(()) + } + pub fn policy_mut(&mut self) -> &mut Policy { &mut self.policy } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index e5fd35d1e3..6f6e80cb60 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -46,3 +46,10 @@ pub struct RegionSnapshotReplacementStepStatus { pub step_invoked_ok: Vec, pub errors: Vec, } + +/// The status of a `region_snapshot_replacement_finish` background task activation +#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] +pub struct RegionSnapshotReplacementFinishStatus { + pub records_set_to_done: Vec, + pub errors: Vec, +} diff --git a/openapi/nexus.json b/openapi/nexus.json index a855378cd4..ac437b7c97 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -5210,7 +5210,7 @@ "/v1/system/ip-pools": { "get": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "List IP pools", "operationId": "ip_pool_list", @@ -5267,7 +5267,7 @@ }, "post": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Create IP pool", "operationId": "ip_pool_create", @@ -5304,7 +5304,7 @@ "/v1/system/ip-pools/{pool}": { "get": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Fetch IP pool", "operationId": "ip_pool_view", @@ -5340,7 +5340,7 @@ }, "put": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Update IP pool", "operationId": "ip_pool_update", @@ -5386,7 +5386,7 @@ }, "delete": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Delete IP pool", "operationId": "ip_pool_delete", @@ -5417,7 +5417,7 @@ "/v1/system/ip-pools/{pool}/ranges": { "get": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "List ranges for IP pool", "description": "Ranges are ordered by their first address.", @@ -5479,7 +5479,7 @@ "/v1/system/ip-pools/{pool}/ranges/add": { "post": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Add range to IP pool", "description": "IPv6 ranges are not allowed yet.", @@ -5528,7 +5528,7 @@ "/v1/system/ip-pools/{pool}/ranges/remove": { "post": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Remove range from IP pool", "operationId": "ip_pool_range_remove", @@ -5569,7 +5569,7 @@ "/v1/system/ip-pools/{pool}/silos": { "get": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "List IP pool's linked silos", "operationId": "ip_pool_silo_list", @@ -5635,7 +5635,7 @@ }, "post": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Link IP pool to silo", "description": "Users in linked silos can allocate external IPs from this pool for their instances. A silo can have at most one default pool. IPs are allocated from the default pool when users ask for one without specifying a pool.", @@ -5684,7 +5684,7 @@ "/v1/system/ip-pools/{pool}/silos/{silo}": { "put": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Make IP pool default for silo", "description": "When a user asks for an IP (e.g., at instance create time) without specifying a pool, the IP comes from the default pool if a default is configured. When a pool is made the default for a silo, any existing default will remain linked to the silo, but will no longer be the default.", @@ -5738,7 +5738,7 @@ }, "delete": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Unlink IP pool from silo", "description": "Will fail if there are any outstanding IPs allocated in the silo.", @@ -5777,7 +5777,7 @@ "/v1/system/ip-pools/{pool}/utilization": { "get": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Fetch IP pool utilization", "operationId": "ip_pool_utilization_view", @@ -5815,7 +5815,7 @@ "/v1/system/ip-pools-service": { "get": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Fetch Oxide service IP pool", "operationId": "ip_pool_service_view", @@ -5842,7 +5842,7 @@ "/v1/system/ip-pools-service/ranges": { "get": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "List IP ranges for the Oxide service pool", "description": "Ranges are ordered by their first address.", @@ -5895,7 +5895,7 @@ "/v1/system/ip-pools-service/ranges/add": { "post": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Add IP range to Oxide service pool", "description": "IPv6 ranges are not allowed yet.", @@ -5933,7 +5933,7 @@ "/v1/system/ip-pools-service/ranges/remove": { "post": { "tags": [ - "system/networking" + "system/ip-pools" ], "summary": "Remove IP range from Oxide service pool", "operationId": "ip_pool_service_range_remove", @@ -21464,6 +21464,13 @@ "url": "http://docs.oxide.computer/api/system-hardware" } }, + { + "name": "system/ip-pools", + "description": "IP pools are collections of external IPs that can be assigned to silos. When a pool is linked to a silo, users in that silo can allocate IPs from the pool for their instances.", + "externalDocs": { + "url": "http://docs.oxide.computer/api/system-ip-pools" + } + }, { "name": "system/metrics", "description": "Metrics provide insight into the operation of the Oxide deployment. These include telemetry on hardware and software components that can be used to understand the current state as well as to diagnose issues.", diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index f0f40d282e..30b8676785 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -68,6 +68,7 @@ instance_updater.period_secs = 30 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 +region_snapshot_replacement_finish.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 23340b3c36..1761d41698 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -68,6 +68,7 @@ instance_updater.period_secs = 30 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30 +region_snapshot_replacement_finish.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. diff --git a/smf/switch_zone_setup/README.md b/smf/switch_zone_setup/README.md new file mode 100644 index 0000000000..48c0403fef --- /dev/null +++ b/smf/switch_zone_setup/README.md @@ -0,0 +1,4 @@ +`authorized_keys` is from [`oxidecomputer/support-piv-tokens@36abeb82`](https://github.com/oxidecomputer/support-piv-tokens/blob/36abeb82aacb3640be2f6c988e49844356cf65a8/authorized_keys). + +If the set of authorized keys changes, it should also be updated in Hubris +(`support/support_tokens/authorized_keys`) diff --git a/tools/permslip_staging b/tools/permslip_staging index d2ddc45f20..41bfefb8b0 100644 --- a/tools/permslip_staging +++ b/tools/permslip_staging @@ -1,5 +1,5 @@ 6ea87b554882860f1a9b1cf97b2f4a9c61fadf3d69e6ea1bdcd781d306d6ca9c manifest-gimlet-v1.0.24.toml 85553dd164933a9b9e4f22409abd1190b1d632d192b5f7527129acaa778a671a manifest-oxide-rot-1-v1.0.13.toml 11bc0684155119f494a6e21810e4dc97b9efadb8154d570f67143dae98a45060 manifest-psc-v1.0.23.toml -60205852109f1584d29e2b086eae5a72d7f61b2e1f64d958e6326312ed2b0d66 manifest-sidecar-v1.0.24.toml +097f2ba82fa9a00de56ac9f004cb911f2c68fd5b62fa578ea41fd8d408f18ba3 manifest-sidecar-v1.0.25.toml c0fecaefac7674138337f3bd4ce4ce5b884053dead5ec27b575701471631ea2f manifest-bootleby-v1.3.0.toml diff --git a/update-engine/Cargo.toml b/update-engine/Cargo.toml index 6d94e9e269..5a22ac66f5 100644 --- a/update-engine/Cargo.toml +++ b/update-engine/Cargo.toml @@ -10,6 +10,7 @@ workspace = true [dependencies] anyhow.workspace = true cancel-safe-futures.workspace = true +chrono.workspace = true debug-ignore.workspace = true derive-where.workspace = true either.workspace = true diff --git a/update-engine/src/display/group_display.rs b/update-engine/src/display/group_display.rs index 9e75b64757..7d99150a9f 100644 --- a/update-engine/src/display/group_display.rs +++ b/update-engine/src/display/group_display.rs @@ -184,8 +184,13 @@ impl GroupDisplay { pub fn write_stats(&mut self, header: &str) -> std::io::Result<()> { // Add a blank prefix which is equal to the maximum width of known prefixes. let prefix = " ".repeat(self.max_width); - let mut line = - self.formatter.start_line(&prefix, Some(self.start_sw.elapsed())); + let mut line = self.formatter.start_line( + &prefix, + // TODO: we don't currently support setting a start time for group + // displays. We should do that at some point. + None, + Some(self.start_sw.elapsed()), + ); self.stats.format_line(&mut line, header, &self.formatter); writeln!(self.writer, "{line}") } diff --git a/update-engine/src/display/line_display.rs b/update-engine/src/display/line_display.rs index 5321ec017c..f6005a9f9e 100644 --- a/update-engine/src/display/line_display.rs +++ b/update-engine/src/display/line_display.rs @@ -4,6 +4,7 @@ // Copyright 2023 Oxide Computer Company +use chrono::{DateTime, Utc}; use debug_ignore::DebugIgnore; use derive_where::derive_where; use owo_colors::Style; @@ -50,6 +51,16 @@ impl LineDisplay { self.formatter.set_styles(styles); } + /// Sets the start time for all future lines. + /// + /// If the start time is set, then the progress display will be relative to + /// that time. Otherwise, only the offset from the start of the job will be + /// displayed. + #[inline] + pub fn set_start_time(&mut self, start_time: DateTime) { + self.shared.set_start_time(start_time); + } + /// Sets the amount of time before the next progress event is shown. #[inline] pub fn set_progress_interval(&mut self, interval: Duration) { diff --git a/update-engine/src/display/line_display_shared.rs b/update-engine/src/display/line_display_shared.rs index e31d36dcd7..73a0e44e19 100644 --- a/update-engine/src/display/line_display_shared.rs +++ b/update-engine/src/display/line_display_shared.rs @@ -9,9 +9,11 @@ use std::{ collections::HashMap, fmt::{self, Write as _}, + sync::LazyLock, time::Duration, }; +use chrono::{DateTime, Utc}; use owo_colors::OwoColorize; use swrite::{swrite, SWrite as _}; @@ -33,6 +35,8 @@ pub(super) const HEADER_WIDTH: usize = 9; #[derive(Debug, Default)] pub(super) struct LineDisplayShared { + // The start time, if provided. + start_time: Option>, // This is a map from root execution ID to data about it. execution_data: HashMap, } @@ -45,6 +49,10 @@ impl LineDisplayShared { ) -> LineDisplaySharedContext<'a> { LineDisplaySharedContext { shared: self, prefix, formatter } } + + pub(super) fn set_start_time(&mut self, start_time: DateTime) { + self.start_time = Some(start_time); + } } #[derive(Debug)] @@ -60,7 +68,11 @@ impl<'a> LineDisplaySharedContext<'a> { /// This line does not have a trailing newline; adding one is the caller's /// responsibility. pub(super) fn format_generic(&self, message: &str) -> String { - let mut line = self.formatter.start_line(self.prefix, None); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + None, + ); line.push_str(message); line } @@ -134,9 +146,11 @@ impl<'a> LineDisplaySharedContext<'a> { ) { match &step_event.kind { StepEventKind::NoStepsDefined => { - let mut line = self - .formatter - .start_line(self.prefix, Some(step_event.total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(step_event.total_elapsed), + ); swrite!( line, "{}", @@ -152,9 +166,11 @@ impl<'a> LineDisplaySharedContext<'a> { &first_step.info, &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); swrite!( line, @@ -178,9 +194,11 @@ impl<'a> LineDisplaySharedContext<'a> { &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); swrite!( line, @@ -224,9 +242,11 @@ impl<'a> LineDisplaySharedContext<'a> { &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); swrite!( line, @@ -270,9 +290,11 @@ impl<'a> LineDisplaySharedContext<'a> { &step.info, &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); self.formatter.add_completion_and_step_info( &mut line, @@ -293,9 +315,11 @@ impl<'a> LineDisplaySharedContext<'a> { &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); self.format_step_running(&mut line, ld_step_info); @@ -315,9 +339,11 @@ impl<'a> LineDisplaySharedContext<'a> { &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); self.formatter.add_completion_and_step_info( &mut line, @@ -344,9 +370,11 @@ impl<'a> LineDisplaySharedContext<'a> { &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); // The prefix is used for "Caused by" lines below. Add // the requisite amount of spacing here. let mut caused_by_prefix = line.clone(); @@ -387,9 +415,11 @@ impl<'a> LineDisplaySharedContext<'a> { &nest_data, ); - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); swrite!( line, @@ -463,8 +493,11 @@ impl<'a> LineDisplaySharedContext<'a> { &self, info: &ExecutionTerminalInfo, ) -> String { - let mut line = - self.formatter.start_line(self.prefix, info.leaf_total_elapsed); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + info.leaf_total_elapsed, + ); match info.kind { TerminalKind::Completed => { swrite!( @@ -540,9 +573,11 @@ impl<'a> LineDisplaySharedContext<'a> { nest_data: &nest_data, }; - let mut line = self - .formatter - .start_line(self.prefix, Some(root_total_elapsed)); + let mut line = self.formatter.start_line( + self.prefix, + self.shared.start_time, + Some(root_total_elapsed), + ); let (before, after) = match progress { Some(counter) => { @@ -685,6 +720,7 @@ impl LineDisplayFormatter { pub(super) fn start_line( &self, prefix: &str, + start_time: Option>, total_elapsed: Option, ) -> String { let mut line = format!("[{}", prefix.style(self.styles.prefix_style)); @@ -694,17 +730,31 @@ impl LineDisplayFormatter { } // Show total elapsed time in an hh:mm:ss format. - if let Some(total_elapsed) = total_elapsed { - let total_elapsed_secs = total_elapsed.as_secs(); - let hours = total_elapsed_secs / 3600; - let minutes = (total_elapsed_secs % 3600) / 60; - let seconds = total_elapsed_secs % 60; - swrite!(line, "{:02}:{:02}:{:02}", hours, minutes, seconds); - // To show total_elapsed more accurately, use: - // swrite!(line, "{:.2?}", total_elapsed); - } else { - // Add 8 spaces to align with hh:mm:ss. - line.push_str(" "); + match (start_time, total_elapsed) { + (Some(start_time), Some(total_elapsed)) => { + // Add the offset from the start time. + let current_time = start_time + total_elapsed; + swrite!( + line, + "{}", + current_time.format_with_items(DATETIME_FORMAT.iter()) + ); + } + (None, Some(total_elapsed)) => { + let total_elapsed_secs = total_elapsed.as_secs(); + let hours = total_elapsed_secs / 3600; + let minutes = (total_elapsed_secs % 3600) / 60; + let seconds = total_elapsed_secs % 60; + swrite!(line, "{:02}:{:02}:{:02}", hours, minutes, seconds); + // To show total_elapsed more accurately, use: + // swrite!(line, "{:.2?}", total_elapsed); + } + (Some(_), None) => { + line.push_str(DATETIME_FORMAT_INDENT); + } + (None, None) => { + line.push_str(ELAPSED_FORMAT_INDENT); + } } line.push_str("] "); @@ -874,6 +924,23 @@ impl LineDisplayFormatter { } } +static DATETIME_FORMAT: LazyLock>> = + LazyLock::new(|| { + // The format is "Jan 01 00:00:00". + // + // We can add customization in the future, but we want to restrict + // formats to fixed-width so we know how to align them. + chrono::format::StrftimeItems::new("%b %d %H:%M:%S") + .parse() + .expect("datetime format is valid") + }); + +// "Jan 01 00:00:00" is 15 characters wide. +const DATETIME_FORMAT_INDENT: &str = " "; + +// "00:00:00" is 8 characters wide. +const ELAPSED_FORMAT_INDENT: &str = " "; + #[derive(Clone, Debug)] pub(super) struct LineDisplayOutput { lines: Vec, @@ -989,6 +1056,8 @@ impl fmt::Display for AsLetters { #[cfg(test)] mod tests { + use chrono::TimeZone; + use super::*; #[test] @@ -1010,4 +1079,32 @@ mod tests { ); } } + + #[test] + fn test_start_line() { + let formatter = LineDisplayFormatter::new(); + let prefix = "prefix"; + let start_time = Utc.with_ymd_and_hms(2023, 2, 8, 3, 40, 56).unwrap(); + + assert_eq!( + formatter.start_line(prefix, None, None), + "[prefix ] ", + ); + assert_eq!( + formatter.start_line(prefix, None, Some(Duration::from_secs(5))), + "[prefix 00:00:05] ", + ); + assert_eq!( + formatter.start_line(prefix, Some(start_time), None), + "[prefix ] ", + ); + assert_eq!( + formatter.start_line( + prefix, + Some(start_time), + Some(Duration::from_secs(3600)), + ), + "[prefix Feb 08 04:40:56] ", + ); + } } diff --git a/update-engine/src/spec.rs b/update-engine/src/spec.rs index 0ec346a423..bb017be3f6 100644 --- a/update-engine/src/spec.rs +++ b/update-engine/src/spec.rs @@ -335,6 +335,14 @@ mod tests { #[test] fn test_merge_anyhow_list() { + // If the process's environment has `RUST_BACKTRACE=1`, then backtraces + // get captured and the output doesn't match. As long as we set + // `RUST_BACKTRACE=0` before the first time a backtrace is captured, we + // should be fine. Do so at the beginning of this test. + unsafe { + std::env::set_var("RUST_BACKTRACE", "0"); + } + // A single error stays as-is. let error = anyhow!("base").context("parent").context("root"); diff --git a/wicket/src/ui/panes/rack_setup.rs b/wicket/src/ui/panes/rack_setup.rs index f23bc3c816..cc6a2c5621 100644 --- a/wicket/src/ui/panes/rack_setup.rs +++ b/wicket/src/ui/panes/rack_setup.rs @@ -285,10 +285,29 @@ fn draw_rack_reset_popup( "Rack Reset (DESTRUCTIVE!)", style::header(true), )]); - let body = Text::from(vec![Line::from(vec![Span::styled( + let mut body = Text::from(vec![Line::from(vec![Span::styled( "Would you like to reset the rack to an uninitialized state?", style::plain_text(), )])]); + // One might see this warning and ask "why is this feature even + // here, then?" We do eventually want "rack reset" to work as a + // sort of factory reset, and the current implementation is a good + // starting point, so there's no sense in removing it (this is + // certainly not the only feature currently in this state). + // + // The warning is intended to remove the speed bump where someone + // has to find out the hard way that this doesn't work, without + // removing the speed bump where we're reminded of the feature that + // doesn't work yet. + body.lines.push(Line::from("")); + body.lines.push(Line::from(vec![ + Span::styled("WARNING: ", style::warning()), + Span::styled( + "This does not work yet and will leave the rack \ + in an unknown state (see omicron#3820)", + style::plain_text(), + ), + ])); let buttons = vec![ButtonText::new("Yes", "Y"), ButtonText::new("No", "N")];