Skip to content

Commit

Permalink
Add an SMF property for rack_id to MGS (#4145)
Browse files Browse the repository at this point in the history
A follow up commit will plumb the rack_id along with any discovered
sleds to a new "hardware_sled" table in CRDB used for sled addition to
an already initialized rack. This is going to be required when adding
sleds to initialized racks, in order to create a `StartSledAgentRequest`
for new sleds.
  • Loading branch information
andrewjstone authored Sep 27, 2023
1 parent 9a1f6bc commit 0a5b91a
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 79 deletions.
5 changes: 2 additions & 3 deletions gateway-test-utils/src/setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,12 @@ pub async fn test_setup_with_config(
}

// Start gateway server
let rack_id = Uuid::parse_str(RACK_UUID).unwrap();
let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap());

let args = MgsArguments { id: Uuid::new_v4(), addresses };
let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id };
let server = omicron_gateway::Server::start(
server_config.clone(),
args,
rack_id,
log.clone(),
)
.await
Expand Down
41 changes: 34 additions & 7 deletions gateway/src/bin/mgs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ enum Args {
struct ConfigProperties {
id: Uuid,
addresses: Vec<SocketAddrV6>,
rack_id: Option<Uuid>,
}

#[tokio::main]
Expand Down Expand Up @@ -91,15 +92,17 @@ async fn do_run() -> Result<(), CmdError> {
))
})?;

let (id, addresses) = if id_and_address_from_smf {
let (id, addresses, rack_id) = if id_and_address_from_smf {
let config = read_smf_config()?;
(config.id, config.addresses)
(config.id, config.addresses, config.rack_id)
} else {
// Clap ensures these are present if `id_and_address_from_smf`
// is false, so we can safely unwrap.
(id.unwrap(), vec![address.unwrap()])
// Does it matter if `rack_id` is always `None` in this case?
let rack_id = None;
// Clap ensures the first two fields are present if
// `id_and_address_from_smf` is false, so we can safely unwrap.
(id.unwrap(), vec![address.unwrap()], rack_id)
};
let args = MgsArguments { id, addresses };
let args = MgsArguments { id, addresses, rack_id };
let mut server =
start_server(config, args).await.map_err(CmdError::Failure)?;

Expand All @@ -114,6 +117,7 @@ async fn do_run() -> Result<(), CmdError> {
.to_string()
));
}
server.set_rack_id(new_config.rack_id);
server
.adjust_dropshot_addresses(&new_config.addresses)
.await
Expand Down Expand Up @@ -148,6 +152,9 @@ fn read_smf_config() -> Result<ConfigProperties, CmdError> {
// Name of the property within CONFIG_PG for our server addresses.
const PROP_ADDR: &str = "address";

// Name of the property within CONFIG_PG for our rack ID.
const PROP_RACK_ID: &str = "rack_id";

// This function is pretty boilerplate-y; we can reduce it by using this
// error type to help us construct a `CmdError::Failure(_)` string. It
// assumes (for the purposes of error messages) any property being fetched
Expand Down Expand Up @@ -210,6 +217,26 @@ fn read_smf_config() -> Result<ConfigProperties, CmdError> {
))
})?;

let prop_rack_id = config
.get_property(PROP_RACK_ID)
.map_err(|err| Error::GetProperty { prop: PROP_RACK_ID, err })?
.ok_or_else(|| Error::MissingProperty { prop: PROP_RACK_ID })?
.value()
.map_err(|err| Error::GetValue { prop: PROP_RACK_ID, err })?
.ok_or(Error::MissingValue { prop: PROP_RACK_ID })?
.as_string()
.map_err(|err| Error::ValueAsString { prop: PROP_RACK_ID, err })?;

let rack_id = if prop_rack_id.as_str() == "unknown" {
None
} else {
Some(Uuid::try_parse(&prop_rack_id).map_err(|err| {
CmdError::Failure(format!(
"failed to parse `{CONFIG_PG}/{PROP_RACK_ID}` ({prop_rack_id:?}) as a UUID: {err}"
))
})?)
};

let prop_addr = config
.get_property(PROP_ADDR)
.map_err(|err| Error::GetProperty { prop: PROP_ADDR, err })?
Expand All @@ -236,7 +263,7 @@ fn read_smf_config() -> Result<ConfigProperties, CmdError> {
"no addresses specified by `{CONFIG_PG}/{PROP_ADDR}`"
)))
} else {
Ok(ConfigProperties { id: prop_id, addresses })
Ok(ConfigProperties { id: prop_id, addresses, rack_id })
}
}

Expand Down
14 changes: 13 additions & 1 deletion gateway/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,41 @@ use crate::error::StartupError;
use crate::management_switch::ManagementSwitch;
use crate::management_switch::SwitchConfig;
use gateway_sp_comms::InMemoryHostPhase2Provider;
use slog::Logger;
use slog::{info, Logger};
use std::sync::Arc;
use std::sync::OnceLock;
use uuid::Uuid;

/// Shared state used by API request handlers
pub struct ServerContext {
pub mgmt_switch: ManagementSwitch,
pub host_phase2_provider: Arc<InMemoryHostPhase2Provider>,
pub rack_id: OnceLock<Uuid>,
pub log: Logger,
}

impl ServerContext {
pub async fn new(
host_phase2_provider: Arc<InMemoryHostPhase2Provider>,
switch_config: SwitchConfig,
rack_id_config: Option<Uuid>,
log: &Logger,
) -> Result<Arc<Self>, StartupError> {
let mgmt_switch =
ManagementSwitch::new(switch_config, &host_phase2_provider, log)
.await?;

let rack_id = if let Some(id) = rack_id_config {
info!(log, "Setting rack_id"; "rack_id" => %id);
OnceLock::from(id)
} else {
OnceLock::new()
};

Ok(Arc::new(ServerContext {
mgmt_switch,
host_phase2_provider,
rack_id,
log: log.clone(),
}))
}
Expand Down
38 changes: 29 additions & 9 deletions gateway/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ pub fn run_openapi() -> Result<(), String> {
pub struct MgsArguments {
pub id: Uuid,
pub addresses: Vec<SocketAddrV6>,
pub rack_id: Option<Uuid>,
}

type HttpServer = dropshot::HttpServer<Arc<ServerContext>>;
Expand Down Expand Up @@ -125,7 +126,6 @@ impl Server {
pub async fn start(
config: Config,
args: MgsArguments,
_rack_id: Uuid,
log: Logger,
) -> Result<Server, String> {
if args.addresses.is_empty() {
Expand All @@ -146,12 +146,14 @@ impl Server {
Arc::new(InMemoryHostPhase2Provider::with_capacity(
config.host_phase2_recovery_image_cache_max_images,
));
let apictx =
ServerContext::new(host_phase2_provider, config.switch, &log)
.await
.map_err(|error| {
format!("initializing server context: {}", error)
})?;
let apictx = ServerContext::new(
host_phase2_provider,
config.switch,
args.rack_id,
&log,
)
.await
.map_err(|error| format!("initializing server context: {}", error))?;

let mut http_servers = HashMap::with_capacity(args.addresses.len());
let all_servers_shutdown = FuturesUnordered::new();
Expand Down Expand Up @@ -283,6 +285,25 @@ impl Server {
Ok(())
}

/// The rack_id will be set on a refresh of the SMF property when the sled
/// agent starts.
pub fn set_rack_id(&self, rack_id: Option<Uuid>) {
if let Some(rack_id) = rack_id {
let val = self.apictx.rack_id.get_or_init(|| rack_id);
if *val != rack_id {
error!(
self.apictx.log,
"Ignoring attempted change to rack ID";
"current_rack_id" => %val,
"ignored_new_rack_id" => %rack_id);
} else {
info!(self.apictx.log, "Set rack_id"; "rack_id" => %rack_id);
}
} else {
warn!(self.apictx.log, "SMF refresh called without a rack id");
}
}

// TODO does MGS register itself with oximeter?
// Register the Nexus server as a metric producer with `oximeter.
// pub async fn register_as_producer(&self) {
Expand Down Expand Up @@ -313,8 +334,7 @@ pub async fn start_server(
} else {
debug!(log, "registered DTrace probes");
}
let rack_id = Uuid::new_v4();
let server = Server::start(config, args, rack_id, log).await?;
let server = Server::start(config, args, log).await?;
// server.register_as_producer().await;
Ok(server)
}
10 changes: 8 additions & 2 deletions package-manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,10 @@ service_name = "mgs"
only_for_targets.image = "standard"
only_for_targets.switch = "stub"
source.type = "local"
source.paths = [ { from = "smf/mgs-sim", to = "/var/svc/manifest/site/mgs" } ]
source.paths = [
{ from = "smf/mgs/manifest.xml", to = "/var/svc/manifest/site/mgs/manifest.xml" },
{ from = "smf/mgs-sim/config.toml", to = "/var/svc/manifest/site/mgs/config.toml" }
]
output.intermediate_only = true
output.type = "zone"

Expand All @@ -268,7 +271,10 @@ service_name = "mgs"
only_for_targets.image = "standard"
only_for_targets.switch = "softnpu"
source.type = "local"
source.paths = [ { from = "smf/mgs-sim", to = "/var/svc/manifest/site/mgs" } ]
source.paths = [
{ from = "smf/mgs/manifest.xml", to = "/var/svc/manifest/site/mgs/manifest.xml" },
{ from = "smf/mgs-sim/config.toml", to = "/var/svc/manifest/site/mgs/config.toml" }
]
output.intermediate_only = true
output.type = "zone"

Expand Down
18 changes: 18 additions & 0 deletions sled-agent/src/services.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1645,6 +1645,10 @@ impl ServiceManager {
}
}

if let Some(info) = self.inner.sled_info.get() {
smfh.setprop("config/rack_id", info.rack_id)?;
}

smfh.refresh()?;
}
ServiceType::SpSim => {
Expand Down Expand Up @@ -2754,6 +2758,20 @@ impl ServiceManager {
&format!("[{address}]:{MGS_PORT}"),
)?;

// It should be impossible for the `sled_info` not to be set here,
// as the underlay is set at the same time.
if let Some(info) = self.inner.sled_info.get() {
smfh.setprop("config/rack_id", info.rack_id)?;
} else {
error!(
self.inner.log,
concat!(
"rack_id not present,",
" even though underlay address exists"
)
);
}

smfh.refresh()?;
}
ServiceType::Dendrite { .. } => {
Expand Down
57 changes: 0 additions & 57 deletions smf/mgs-sim/manifest.xml

This file was deleted.

6 changes: 6 additions & 0 deletions smf/mgs/manifest.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@
"unknown" with a UUID as a part of starting us.
-->
<propval name='id' type='astring' value='unknown' />
<!--
`config/rack_id` is expected to have a single value; sled-agent will replace
"unknown" with a UUID when it learns about it. This should happen in the same
refresh where the underlay address is provisioned into `config/address`.
-->
<propval name='rack_id' type='astring' value='unknown' />
<!--
`config/address` is allowed to have multiple values, so we do not seed it
with an initial `unknown` that sled-agent would need to delete.
Expand Down

0 comments on commit 0a5b91a

Please sign in to comment.