diff --git a/Cargo.lock b/Cargo.lock index f10181f752..1eea7f9154 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9761,6 +9761,7 @@ dependencies = [ "sha3", "sled-hardware-types", "slog", + "strum", "thiserror", "toml 0.8.19", "uuid", @@ -12117,6 +12118,7 @@ dependencies = [ "schemars", "serde", "serde_json", + "sled-agent-types", "sled-hardware-types", "slog", "update-engine", diff --git a/clients/wicketd-client/Cargo.toml b/clients/wicketd-client/Cargo.toml index 0e55acd8bb..5e52eedb49 100644 --- a/clients/wicketd-client/Cargo.toml +++ b/clients/wicketd-client/Cargo.toml @@ -19,6 +19,7 @@ reqwest = { workspace = true, features = ["rustls-tls", "stream"] } schemars.workspace = true serde.workspace = true serde_json.workspace = true +sled-agent-types.workspace = true sled-hardware-types.workspace = true slog.workspace = true update-engine.workspace = true diff --git a/clients/wicketd-client/src/lib.rs b/clients/wicketd-client/src/lib.rs index 7a07ecd6a5..40b60ac612 100644 --- a/clients/wicketd-client/src/lib.rs +++ b/clients/wicketd-client/src/lib.rs @@ -67,6 +67,7 @@ progenitor::generate_api!( RotSlot = wicket_common::inventory::RotSlot, RotState = wicket_common::inventory::RotState, RouteConfig = omicron_common::api::internal::shared::RouteConfig, + RssStep = sled_agent_types::rack_ops::RssStep, SpComponentCaboose = wicket_common::inventory::SpComponentCaboose, SpComponentInfo = wicket_common::inventory::SpComponentInfo, SpIdentifier = wicket_common::inventory::SpIdentifier, diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index bd928001bb..6b4d2093a1 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -1070,11 +1070,15 @@ "enum": [ "initializing" ] + }, + "step": { + "$ref": "#/components/schemas/RssStep" } }, "required": [ "id", - "status" + "status", + "step" ] }, { @@ -1280,6 +1284,235 @@ "nexthop" ] }, + "RssStep": { + "description": "Steps we go through during initial rack setup. Keep this list in order that they happen.", + "oneOf": [ + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "requested" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "starting" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "load_existing_plan" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "create_sled_plan" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "init_trust_quorum" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "network_config_update" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "sled_init" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "ensure_storage" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "init_dns" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "configure_dns" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "init_ntp" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "wait_for_time_sync" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "wait_for_database" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "cluster_init" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "zones_init" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "nexus_handoff" + ] + } + }, + "required": [ + "status" + ] + } + ] + }, "SemverVersion": { "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 87cfe045d3..6d17d9c071 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -2680,7 +2680,7 @@ ] }, "RackOperationStatus": { - "description": "Current status of any rack-level operation being performed by this bootstrap agent.\n\n
JSON schema\n\n```json { \"description\": \"Current status of any rack-level operation being performed by this bootstrap agent.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initializing\" ] } } }, { \"description\": \"`id` will be none if the rack was already initialized on startup.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_panicked\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"resetting\" ] } } }, { \"description\": \"`reset_id` will be None if the rack is in an uninitialized-on-startup, or Some if it is in an uninitialized state due to a reset operation completing.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"reset_id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"uninitialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_panicked\" ] } } } ] } ```
", + "description": "Current status of any rack-level operation being performed by this bootstrap agent.\n\n
JSON schema\n\n```json { \"description\": \"Current status of any rack-level operation being performed by this bootstrap agent.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"id\", \"status\", \"step\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initializing\" ] }, \"step\": { \"$ref\": \"#/components/schemas/RssStep\" } } }, { \"description\": \"`id` will be none if the rack was already initialized on startup.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackInitKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"initialization_panicked\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"resetting\" ] } } }, { \"description\": \"`reset_id` will be None if the rack is in an uninitialized-on-startup, or Some if it is in an uninitialized state due to a reset operation completing.\", \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"reset_id\": { \"allOf\": [ { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" } ] }, \"status\": { \"type\": \"string\", \"enum\": [ \"uninitialized\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"message\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"message\": { \"type\": \"string\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_failed\" ] } } }, { \"type\": \"object\", \"required\": [ \"id\", \"status\" ], \"properties\": { \"id\": { \"$ref\": \"#/components/schemas/TypedUuidForRackResetKind\" }, \"status\": { \"type\": \"string\", \"enum\": [ \"reset_panicked\" ] } } } ] } ```
", "oneOf": [ { "type": "object", @@ -2693,11 +2693,15 @@ "enum": [ "initializing" ] + }, + "step": { + "$ref": "#/components/schemas/RssStep" } }, "required": [ "id", - "status" + "status", + "step" ] }, { @@ -3150,6 +3154,235 @@ "nexthop" ] }, + "RssStep": { + "description": "Steps we go through during initial rack setup. Keep this list in order that they happen.\n\n
JSON schema\n\n```json { \"description\": \"Steps we go through during initial rack setup. Keep this list in order that they happen.\", \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"requested\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"starting\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"load_existing_plan\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"create_sled_plan\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"init_trust_quorum\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"network_config_update\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"sled_init\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"ensure_storage\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"init_dns\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"configure_dns\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"init_ntp\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"wait_for_time_sync\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"wait_for_database\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"cluster_init\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"zones_init\" ] } } }, { \"type\": \"object\", \"required\": [ \"status\" ], \"properties\": { \"status\": { \"type\": \"string\", \"enum\": [ \"nexus_handoff\" ] } } } ] } ```
", + "oneOf": [ + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "requested" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "starting" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "load_existing_plan" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "create_sled_plan" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "init_trust_quorum" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "network_config_update" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "sled_init" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "ensure_storage" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "init_dns" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "configure_dns" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "init_ntp" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "wait_for_time_sync" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "wait_for_database" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "cluster_init" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "zones_init" + ] + } + }, + "required": [ + "status" + ] + }, + { + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "nexus_handoff" + ] + } + }, + "required": [ + "status" + ] + } + ] + }, "SemverVersion": { "type": "string", "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" diff --git a/sled-agent/src/bootstrap/rack_ops.rs b/sled-agent/src/bootstrap/rack_ops.rs index 3eb00b419a..cd59aa4849 100644 --- a/sled-agent/src/bootstrap/rack_ops.rs +++ b/sled-agent/src/bootstrap/rack_ops.rs @@ -10,7 +10,7 @@ use bootstore::schemes::v0 as bootstore; use omicron_uuid_kinds::RackInitUuid; use omicron_uuid_kinds::RackResetUuid; use sled_agent_types::rack_init::RackInitializeRequest; -use sled_agent_types::rack_ops::RackOperationStatus; +use sled_agent_types::rack_ops::{RackOperationStatus, RssStep}; use sled_storage::manager::StorageHandle; use slog::Logger; use std::mem; @@ -19,6 +19,7 @@ use std::sync::Arc; use std::sync::Mutex; use tokio::sync::oneshot; use tokio::sync::oneshot::error::TryRecvError; +use tokio::sync::watch; #[derive(Debug, Clone, thiserror::Error)] pub enum RssAccessError { @@ -62,7 +63,7 @@ impl RssAccess { let mut status = self.status.lock().unwrap(); match &mut *status { - RssStatus::Initializing { id, completion } => { + RssStatus::Initializing { id, completion, step_rx } => { let id = *id; // This is our only chance to notice the initialization task has // panicked: if it dropped the sending half of `completion` @@ -75,7 +76,11 @@ impl RssAccess { } Err(TryRecvError::Empty) => { // Initialization task is still running - RackOperationStatus::Initializing { id } + // Update the step we are on. + RackOperationStatus::Initializing { + id, + step: *step_rx.borrow(), + } } Err(TryRecvError::Closed) => { // Initialization task has panicked! @@ -171,9 +176,9 @@ impl RssAccess { RssStatus::Uninitialized { .. } => { let (completion_tx, completion) = oneshot::channel(); let id = RackInitUuid::new_v4(); - *status = RssStatus::Initializing { id, completion }; + let (step_tx, step_rx) = watch::channel(RssStep::Requested); + *status = RssStatus::Initializing { id, completion, step_rx }; mem::drop(status); - let parent_log = parent_log.clone(); let storage_manager = storage_manager.clone(); let bootstore_node_handle = bootstore_node_handle.clone(); @@ -185,6 +190,7 @@ impl RssAccess { storage_manager, bootstore_node_handle, request, + step_tx, ) .await; let new_status = match result { @@ -284,6 +290,9 @@ enum RssStatus { Initializing { id: RackInitUuid, completion: oneshot::Receiver<()>, + // Used by the RSS task to update us with what step it is on. + // This holds the current RSS step. + step_rx: watch::Receiver, }, Resetting { id: RackResetUuid, @@ -313,6 +322,7 @@ async fn rack_initialize( storage_manager: StorageHandle, bootstore_node_handle: bootstore::NodeHandle, request: RackInitializeRequest, + step_tx: watch::Sender, ) -> Result<(), SetupServiceError> { RssHandle::run_rss( parent_log, @@ -320,6 +330,7 @@ async fn rack_initialize( global_zone_bootstrap_ip, storage_manager, bootstore_node_handle, + step_tx, ) .await } diff --git a/sled-agent/src/bootstrap/rss_handle.rs b/sled-agent/src/bootstrap/rss_handle.rs index eee7eed085..0cf6054ca2 100644 --- a/sled-agent/src/bootstrap/rss_handle.rs +++ b/sled-agent/src/bootstrap/rss_handle.rs @@ -15,6 +15,7 @@ use omicron_common::backoff::retry_notify; use omicron_common::backoff::retry_policy_local; use omicron_common::backoff::BackoffError; use sled_agent_types::rack_init::RackInitializeRequest; +use sled_agent_types::rack_ops::RssStep; use sled_agent_types::sled::StartSledAgentRequest; use sled_storage::manager::StorageHandle; use slog::Logger; @@ -22,6 +23,7 @@ use std::net::Ipv6Addr; use std::net::SocketAddrV6; use tokio::sync::mpsc; use tokio::sync::oneshot; +use tokio::sync::watch; use tokio::task::JoinHandle; pub(super) struct RssHandle { @@ -48,6 +50,7 @@ impl RssHandle { our_bootstrap_address: Ipv6Addr, storage_manager: StorageHandle, bootstore: bootstore::NodeHandle, + step_tx: watch::Sender, ) -> Result<(), SetupServiceError> { let (tx, rx) = rss_channel(our_bootstrap_address); @@ -57,6 +60,7 @@ impl RssHandle { storage_manager, tx, bootstore, + step_tx, ); let log = log.new(o!("component" => "BootstrapAgentRssHandler")); rx.await_local_rss_request(&log).await; diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 3f73e55d0f..17b75c4334 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -118,6 +118,7 @@ use sled_agent_types::early_networking::{ use sled_agent_types::rack_init::{ BootstrapAddressDiscovery, RackInitializeRequest as Config, }; +use sled_agent_types::rack_ops::RssStep; use sled_agent_types::sled::StartSledAgentRequest; use sled_agent_types::time_sync::TimeSync; use sled_hardware_types::underlay::BootstrapInterface; @@ -130,8 +131,25 @@ use std::iter; use std::net::{Ipv6Addr, SocketAddrV6}; use std::time::Duration; use thiserror::Error; +use tokio::sync::watch; use uuid::Uuid; +/// For tracking the current RSS step and sending notifications about it. +pub struct RssProgress { + step_tx: watch::Sender, +} + +impl RssProgress { + pub fn new(step_tx: watch::Sender) -> Self { + step_tx.send_replace(RssStep::Starting); + RssProgress { step_tx } + } + + pub fn update(&mut self, new_step: RssStep) { + self.step_tx.send_replace(new_step); + } +} + /// Describes errors which may occur while operating the setup service. #[derive(Error, Debug)] pub enum SetupServiceError { @@ -224,6 +242,7 @@ impl RackSetupService { storage_manager: StorageHandle, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, + step_tx: watch::Sender, ) -> Self { let handle = tokio::task::spawn(async move { let svc = ServiceInner::new(log.clone()); @@ -233,6 +252,7 @@ impl RackSetupService { &storage_manager, local_bootstrap_agent, bootstore, + step_tx, ) .await { @@ -1049,8 +1069,10 @@ impl ServiceInner { storage_manager: &StorageHandle, local_bootstrap_agent: BootstrapAgentHandle, bootstore: bootstore::NodeHandle, + step_tx: watch::Sender, ) -> Result<(), SetupServiceError> { info!(self.log, "Injecting RSS configuration: {:#?}", config); + let mut rss_step = RssProgress::new(step_tx); let resolver = DnsResolver::new_from_subnet( self.log.new(o!("component" => "DnsResolver")), @@ -1081,6 +1103,7 @@ impl ServiceInner { "RSS configuration looks like it has already been applied", ); + rss_step.update(RssStep::LoadExistingPlan); let sled_plan = SledPlan::load(&self.log, storage_manager) .await? .expect("Sled plan should exist if completed marker exists"); @@ -1100,6 +1123,7 @@ impl ServiceInner { let nexus_address = resolver.lookup_socket_v6(ServiceName::Nexus).await?; + rss_step.update(RssStep::NexusHandoff); self.handoff_to_nexus( &config, &sled_plan, @@ -1113,6 +1137,7 @@ impl ServiceInner { info!(self.log, "RSS configuration has not been fully applied yet"); } + rss_step.update(RssStep::CreateSledPlan); // Wait for either: // - All the peers to re-load an old plan (if one exists) // - Enough peers to create a new plan (if one does not exist) @@ -1163,6 +1188,7 @@ impl ServiceInner { }; let config = &plan.config; + rss_step.update(RssStep::InitTrustQuorum); // Initialize the trust quorum if there are peers configured. if let Some(peers) = &config.trust_quorum_peers { let initial_membership: BTreeSet<_> = @@ -1185,8 +1211,10 @@ impl ServiceInner { }, }; info!(self.log, "Writing Rack Network Configuration to bootstore"); + rss_step.update(RssStep::NetworkConfigUpdate); bootstore.update_network_config(early_network_config.into()).await?; + rss_step.update(RssStep::SledInit); // Forward the sled initialization requests to our sled-agent. local_bootstrap_agent .initialize_sleds( @@ -1223,6 +1251,7 @@ impl ServiceInner { .await? }; + rss_step.update(RssStep::EnsureStorage); // Before we can ask for any services, we need to ensure that storage is // operational. self.ensure_storage_config_at_least(&service_plan).await?; @@ -1239,7 +1268,9 @@ impl ServiceInner { matches!(zone_type, OmicronZoneType::InternalDns { .. }) }, ); + rss_step.update(RssStep::InitDns); self.ensure_zone_config_at_least(v2generator.sled_configs()).await?; + rss_step.update(RssStep::ConfigureDns); self.initialize_internal_dns_records(&service_plan).await?; // Ask MGS in each switch zone which switch it is. @@ -1247,6 +1278,7 @@ impl ServiceInner { .lookup_switch_zone_underlay_addrs(&resolver) .await; + rss_step.update(RssStep::InitNtp); // Next start up the NTP services. let v3generator = v2generator.new_version_with( DeployStepVersion::V3_DNS_AND_NTP, @@ -1260,11 +1292,13 @@ impl ServiceInner { ); self.ensure_zone_config_at_least(v3generator.sled_configs()).await?; + rss_step.update(RssStep::WaitForTimeSync); // Wait until time is synchronized on all sleds before proceeding. self.wait_for_timesync(&sled_addresses).await?; info!(self.log, "Finished setting up Internal DNS and NTP"); + rss_step.update(RssStep::WaitForDatabase); // Wait until Cockroach has been initialized before running Nexus. let v4generator = v3generator.new_version_with( DeployStepVersion::V4_COCKROACHDB, @@ -1276,9 +1310,11 @@ impl ServiceInner { // Now that datasets and zones have started for CockroachDB, // perform one-time initialization of the cluster. + rss_step.update(RssStep::ClusterInit); self.initialize_cockroach(&service_plan).await?; // Issue the rest of the zone initialization requests. + rss_step.update(RssStep::ZonesInit); let v5generator = v4generator .new_version_with(DeployStepVersion::V5_EVERYTHING, &|_| true); self.ensure_zone_config_at_least(v5generator.sled_configs()).await?; @@ -1296,6 +1332,7 @@ impl ServiceInner { let nexus_address = resolver.lookup_socket_v6(ServiceName::Nexus).await?; + rss_step.update(RssStep::NexusHandoff); // At this point, even if we reboot, we must not try to manage sleds, // services, or DNS records. self.handoff_to_nexus( diff --git a/sled-agent/types/Cargo.toml b/sled-agent/types/Cargo.toml index e01d40db28..b33cdc8651 100644 --- a/sled-agent/types/Cargo.toml +++ b/sled-agent/types/Cargo.toml @@ -29,6 +29,7 @@ serde_json.workspace = true sha3.workspace = true sled-hardware-types.workspace = true slog.workspace = true +strum.workspace = true thiserror.workspace = true toml.workspace = true uuid.workspace = true diff --git a/sled-agent/types/src/rack_ops.rs b/sled-agent/types/src/rack_ops.rs index d8c0fa1c88..3ff56a02f0 100644 --- a/sled-agent/types/src/rack_ops.rs +++ b/sled-agent/types/src/rack_ops.rs @@ -5,16 +5,16 @@ use omicron_uuid_kinds::{RackInitUuid, RackResetUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use strum::{EnumCount, EnumIter, IntoEnumIterator}; /// Current status of any rack-level operation being performed by this bootstrap /// agent. -#[derive( - Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, -)] +#[derive(Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] #[serde(tag = "status", rename_all = "snake_case")] pub enum RackOperationStatus { Initializing { id: RackInitUuid, + step: RssStep, }, /// `id` will be none if the rack was already initialized on startup. Initialized { @@ -44,3 +44,55 @@ pub enum RackOperationStatus { id: RackResetUuid, }, } + +/// Steps we go through during initial rack setup. +/// Keep this list in order that they happen. +#[derive( + Copy, + Clone, + Debug, + Deserialize, + EnumCount, + EnumIter, + Eq, + Hash, + JsonSchema, + Ord, + PartialEq, + PartialOrd, + Serialize, +)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum RssStep { + Requested, + Starting, + LoadExistingPlan, + CreateSledPlan, + InitTrustQuorum, + NetworkConfigUpdate, + SledInit, + EnsureStorage, + InitDns, + ConfigureDns, + InitNtp, + WaitForTimeSync, + WaitForDatabase, + ClusterInit, + ZonesInit, + NexusHandoff, +} + +impl RssStep { + pub fn max_step(&self) -> usize { + RssStep::COUNT + } + + pub fn index(&self) -> usize { + for (index, variant) in RssStep::iter().enumerate() { + if *self == variant { + return index; + } + } + return 0; + } +} diff --git a/wicket/src/ui/panes/rack_setup.rs b/wicket/src/ui/panes/rack_setup.rs index cc6a2c5621..cbf66a1cf3 100644 --- a/wicket/src/ui/panes/rack_setup.rs +++ b/wicket/src/ui/panes/rack_setup.rs @@ -431,11 +431,21 @@ fn draw_rack_status_details_popup( style::plain_text(), )])); } - Ok(RackOperationStatus::Initializing { id }) => { + Ok(RackOperationStatus::Initializing { id, step }) => { body.lines.push(Line::from(vec![ status, Span::styled("Initializing", style::plain_text()), ])); + let max = step.max_step(); + let index = step.index(); + body.lines.push(Line::from(vec![Span::styled( + format!("Current step: {}/{}", index, max), + style::plain_text(), + )])); + body.lines.push(Line::from(vec![Span::styled( + format!("Current operation: {:?}", step), + style::plain_text(), + )])); body.lines.push(Line::from(vec![Span::styled( format!("Current operation ID: {}", id), style::plain_text(), @@ -632,8 +642,11 @@ fn rss_config_text<'a>( Ok(RackOperationStatus::Initialized { .. }) => { Span::styled("Initialized", ok_style) } - Ok(RackOperationStatus::Initializing { .. }) => { - Span::styled("Initializing", warn_style) + Ok(RackOperationStatus::Initializing { step, .. }) => { + let max = step.max_step(); + let index = step.index(); + let msg = format!("Initializing: Step {}/{}", index, max); + Span::styled(msg, warn_style) } Ok(RackOperationStatus::Resetting { .. }) => { Span::styled("Resetting", warn_style)