From 0a5b91a24c785cbf03e09a0659eedb0a9dba5cc5 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Wed, 27 Sep 2023 16:13:43 -0400 Subject: [PATCH 1/9] Add an SMF property for rack_id to MGS (#4145) A follow up commit will plumb the rack_id along with any discovered sleds to a new "hardware_sled" table in CRDB used for sled addition to an already initialized rack. This is going to be required when adding sleds to initialized racks, in order to create a `StartSledAgentRequest` for new sleds. --- gateway-test-utils/src/setup.rs | 5 ++- gateway/src/bin/mgs.rs | 41 ++++++++++++++++++++---- gateway/src/context.rs | 14 +++++++- gateway/src/lib.rs | 38 ++++++++++++++++------ package-manifest.toml | 10 ++++-- sled-agent/src/services.rs | 18 +++++++++++ smf/mgs-sim/manifest.xml | 57 --------------------------------- smf/mgs/manifest.xml | 6 ++++ 8 files changed, 110 insertions(+), 79 deletions(-) delete mode 100644 smf/mgs-sim/manifest.xml diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index b947f32a19..e789f8de78 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -142,13 +142,12 @@ pub async fn test_setup_with_config( } // Start gateway server - let rack_id = Uuid::parse_str(RACK_UUID).unwrap(); + let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap()); - let args = MgsArguments { id: Uuid::new_v4(), addresses }; + let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id }; let server = omicron_gateway::Server::start( server_config.clone(), args, - rack_id, log.clone(), ) .await diff --git a/gateway/src/bin/mgs.rs b/gateway/src/bin/mgs.rs index fe047b766b..cb9070a9a5 100644 --- a/gateway/src/bin/mgs.rs +++ b/gateway/src/bin/mgs.rs @@ -56,6 +56,7 @@ enum Args { struct ConfigProperties { id: Uuid, addresses: Vec, + rack_id: Option, } #[tokio::main] @@ -91,15 +92,17 @@ async fn do_run() -> Result<(), CmdError> { )) })?; - let (id, addresses) = if id_and_address_from_smf { + let (id, addresses, rack_id) = if id_and_address_from_smf { let config = read_smf_config()?; - (config.id, config.addresses) + (config.id, config.addresses, config.rack_id) } else { - // Clap ensures these are present if `id_and_address_from_smf` - // is false, so we can safely unwrap. - (id.unwrap(), vec![address.unwrap()]) + // Does it matter if `rack_id` is always `None` in this case? + let rack_id = None; + // Clap ensures the first two fields are present if + // `id_and_address_from_smf` is false, so we can safely unwrap. + (id.unwrap(), vec![address.unwrap()], rack_id) }; - let args = MgsArguments { id, addresses }; + let args = MgsArguments { id, addresses, rack_id }; let mut server = start_server(config, args).await.map_err(CmdError::Failure)?; @@ -114,6 +117,7 @@ async fn do_run() -> Result<(), CmdError> { .to_string() )); } + server.set_rack_id(new_config.rack_id); server .adjust_dropshot_addresses(&new_config.addresses) .await @@ -148,6 +152,9 @@ fn read_smf_config() -> Result { // Name of the property within CONFIG_PG for our server addresses. const PROP_ADDR: &str = "address"; + // Name of the property within CONFIG_PG for our rack ID. + const PROP_RACK_ID: &str = "rack_id"; + // This function is pretty boilerplate-y; we can reduce it by using this // error type to help us construct a `CmdError::Failure(_)` string. It // assumes (for the purposes of error messages) any property being fetched @@ -210,6 +217,26 @@ fn read_smf_config() -> Result { )) })?; + let prop_rack_id = config + .get_property(PROP_RACK_ID) + .map_err(|err| Error::GetProperty { prop: PROP_RACK_ID, err })? + .ok_or_else(|| Error::MissingProperty { prop: PROP_RACK_ID })? + .value() + .map_err(|err| Error::GetValue { prop: PROP_RACK_ID, err })? + .ok_or(Error::MissingValue { prop: PROP_RACK_ID })? + .as_string() + .map_err(|err| Error::ValueAsString { prop: PROP_RACK_ID, err })?; + + let rack_id = if prop_rack_id.as_str() == "unknown" { + None + } else { + Some(Uuid::try_parse(&prop_rack_id).map_err(|err| { + CmdError::Failure(format!( + "failed to parse `{CONFIG_PG}/{PROP_RACK_ID}` ({prop_rack_id:?}) as a UUID: {err}" + )) + })?) + }; + let prop_addr = config .get_property(PROP_ADDR) .map_err(|err| Error::GetProperty { prop: PROP_ADDR, err })? @@ -236,7 +263,7 @@ fn read_smf_config() -> Result { "no addresses specified by `{CONFIG_PG}/{PROP_ADDR}`" ))) } else { - Ok(ConfigProperties { id: prop_id, addresses }) + Ok(ConfigProperties { id: prop_id, addresses, rack_id }) } } diff --git a/gateway/src/context.rs b/gateway/src/context.rs index e78e5c7842..939bb9b6b9 100644 --- a/gateway/src/context.rs +++ b/gateway/src/context.rs @@ -6,13 +6,16 @@ use crate::error::StartupError; use crate::management_switch::ManagementSwitch; use crate::management_switch::SwitchConfig; use gateway_sp_comms::InMemoryHostPhase2Provider; -use slog::Logger; +use slog::{info, Logger}; use std::sync::Arc; +use std::sync::OnceLock; +use uuid::Uuid; /// Shared state used by API request handlers pub struct ServerContext { pub mgmt_switch: ManagementSwitch, pub host_phase2_provider: Arc, + pub rack_id: OnceLock, pub log: Logger, } @@ -20,15 +23,24 @@ impl ServerContext { pub async fn new( host_phase2_provider: Arc, switch_config: SwitchConfig, + rack_id_config: Option, log: &Logger, ) -> Result, StartupError> { let mgmt_switch = ManagementSwitch::new(switch_config, &host_phase2_provider, log) .await?; + let rack_id = if let Some(id) = rack_id_config { + info!(log, "Setting rack_id"; "rack_id" => %id); + OnceLock::from(id) + } else { + OnceLock::new() + }; + Ok(Arc::new(ServerContext { mgmt_switch, host_phase2_provider, + rack_id, log: log.clone(), })) } diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index 871b05719a..10fcf1539c 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -58,6 +58,7 @@ pub fn run_openapi() -> Result<(), String> { pub struct MgsArguments { pub id: Uuid, pub addresses: Vec, + pub rack_id: Option, } type HttpServer = dropshot::HttpServer>; @@ -125,7 +126,6 @@ impl Server { pub async fn start( config: Config, args: MgsArguments, - _rack_id: Uuid, log: Logger, ) -> Result { if args.addresses.is_empty() { @@ -146,12 +146,14 @@ impl Server { Arc::new(InMemoryHostPhase2Provider::with_capacity( config.host_phase2_recovery_image_cache_max_images, )); - let apictx = - ServerContext::new(host_phase2_provider, config.switch, &log) - .await - .map_err(|error| { - format!("initializing server context: {}", error) - })?; + let apictx = ServerContext::new( + host_phase2_provider, + config.switch, + args.rack_id, + &log, + ) + .await + .map_err(|error| format!("initializing server context: {}", error))?; let mut http_servers = HashMap::with_capacity(args.addresses.len()); let all_servers_shutdown = FuturesUnordered::new(); @@ -283,6 +285,25 @@ impl Server { Ok(()) } + /// The rack_id will be set on a refresh of the SMF property when the sled + /// agent starts. + pub fn set_rack_id(&self, rack_id: Option) { + if let Some(rack_id) = rack_id { + let val = self.apictx.rack_id.get_or_init(|| rack_id); + if *val != rack_id { + error!( + self.apictx.log, + "Ignoring attempted change to rack ID"; + "current_rack_id" => %val, + "ignored_new_rack_id" => %rack_id); + } else { + info!(self.apictx.log, "Set rack_id"; "rack_id" => %rack_id); + } + } else { + warn!(self.apictx.log, "SMF refresh called without a rack id"); + } + } + // TODO does MGS register itself with oximeter? // Register the Nexus server as a metric producer with `oximeter. // pub async fn register_as_producer(&self) { @@ -313,8 +334,7 @@ pub async fn start_server( } else { debug!(log, "registered DTrace probes"); } - let rack_id = Uuid::new_v4(); - let server = Server::start(config, args, rack_id, log).await?; + let server = Server::start(config, args, log).await?; // server.register_as_producer().await; Ok(server) } diff --git a/package-manifest.toml b/package-manifest.toml index b2e1552f3c..4dc0f6b616 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -259,7 +259,10 @@ service_name = "mgs" only_for_targets.image = "standard" only_for_targets.switch = "stub" source.type = "local" -source.paths = [ { from = "smf/mgs-sim", to = "/var/svc/manifest/site/mgs" } ] +source.paths = [ + { from = "smf/mgs/manifest.xml", to = "/var/svc/manifest/site/mgs/manifest.xml" }, + { from = "smf/mgs-sim/config.toml", to = "/var/svc/manifest/site/mgs/config.toml" } +] output.intermediate_only = true output.type = "zone" @@ -268,7 +271,10 @@ service_name = "mgs" only_for_targets.image = "standard" only_for_targets.switch = "softnpu" source.type = "local" -source.paths = [ { from = "smf/mgs-sim", to = "/var/svc/manifest/site/mgs" } ] +source.paths = [ + { from = "smf/mgs/manifest.xml", to = "/var/svc/manifest/site/mgs/manifest.xml" }, + { from = "smf/mgs-sim/config.toml", to = "/var/svc/manifest/site/mgs/config.toml" } +] output.intermediate_only = true output.type = "zone" diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 9868bb4f0d..96cdf8222b 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1645,6 +1645,10 @@ impl ServiceManager { } } + if let Some(info) = self.inner.sled_info.get() { + smfh.setprop("config/rack_id", info.rack_id)?; + } + smfh.refresh()?; } ServiceType::SpSim => { @@ -2754,6 +2758,20 @@ impl ServiceManager { &format!("[{address}]:{MGS_PORT}"), )?; + // It should be impossible for the `sled_info` not to be set here, + // as the underlay is set at the same time. + if let Some(info) = self.inner.sled_info.get() { + smfh.setprop("config/rack_id", info.rack_id)?; + } else { + error!( + self.inner.log, + concat!( + "rack_id not present,", + " even though underlay address exists" + ) + ); + } + smfh.refresh()?; } ServiceType::Dendrite { .. } => { diff --git a/smf/mgs-sim/manifest.xml b/smf/mgs-sim/manifest.xml deleted file mode 100644 index 1e9b752773..0000000000 --- a/smf/mgs-sim/manifest.xml +++ /dev/null @@ -1,57 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/smf/mgs/manifest.xml b/smf/mgs/manifest.xml index 1e9b752773..125c32ce2b 100644 --- a/smf/mgs/manifest.xml +++ b/smf/mgs/manifest.xml @@ -31,6 +31,12 @@ "unknown" with a UUID as a part of starting us. --> + +