Skip to content

Commit

Permalink
Support for propolis-based softnpu device, fix multi-switch uplink up…
Browse files Browse the repository at this point in the history
…dates. (#4390)
  • Loading branch information
rcgoodfellow authored Nov 4, 2023
1 parent 649a9fb commit ea4da47
Show file tree
Hide file tree
Showing 25 changed files with 273 additions and 128 deletions.
2 changes: 2 additions & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,5 @@ impl slog::KV for FileKv {
)
}
}

pub const OMICRON_DPD_TAG: &str = "omicron";
8 changes: 4 additions & 4 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -517,12 +517,12 @@ impl RunningZone {
// services are up, so future requests to create network addresses
// or manipulate services will work.
let fmri = "svc:/milestone/single-user:default";
wait_for_service(Some(&zone.name), fmri).await.map_err(|_| {
BootError::Timeout {
wait_for_service(Some(&zone.name), fmri, zone.log.clone())
.await
.map_err(|_| BootError::Timeout {
service: fmri.to_string(),
zone: zone.name.to_string(),
}
})?;
})?;

// If the zone is self-assembling, then SMF service(s) inside the zone
// will be creating the listen address for the zone's service(s),
Expand Down
32 changes: 31 additions & 1 deletion illumos-utils/src/svc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use omicron_common::backoff;
#[cfg_attr(any(test, feature = "testing"), mockall::automock, allow(dead_code))]
mod inner {
use super::*;
use slog::{warn, Logger};

// TODO(https://www.illumos.org/issues/13837): This is a hack;
// remove me when when fixed. Ideally, the ".synchronous()" argument
Expand All @@ -27,10 +28,19 @@ mod inner {
pub async fn wait_for_service<'a, 'b>(
zone: Option<&'a str>,
fmri: &'b str,
log: Logger,
) -> Result<(), Error> {
let name = smf::PropertyName::new("restarter", "state").unwrap();

let log_notification_failure = |_error, _delay| {};
let log_notification_failure = |error, delay| {
warn!(
log,
"wait for service {:?} failed: {}. retry in {:?}",
zone,
error,
delay
);
};
backoff::retry_notify(
backoff::retry_policy_local(),
|| async {
Expand All @@ -47,6 +57,26 @@ mod inner {
== &smf::PropertyValue::Astring("online".to_string())
{
return Ok(());
} else {
// This is helpful in virtual environments where
// services take a few tries to come up. To enable,
// compile with RUSTFLAGS="--cfg svcadm_autoclear"
#[cfg(svcadm_autoclear)]
if let Some(zname) = zone {
if let Err(out) =
tokio::process::Command::new(crate::PFEXEC)
.env_clear()
.arg("svcadm")
.arg("-z")
.arg(zname)
.arg("clear")
.arg("*")
.output()
.await
{
warn!(log, "clearing service maintenance failed: {out}");
};
}
}
}
return Err(backoff::BackoffError::transient(
Expand Down
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ macro_rules! declare_saga_actions {
};
}

pub(crate) const NEXUS_DPD_TAG: &str = "nexus";
use omicron_common::OMICRON_DPD_TAG as NEXUS_DPD_TAG;

pub(crate) use __action_name;
pub(crate) use __emit_action;
Expand Down
93 changes: 57 additions & 36 deletions nexus/src/app/sagas/switch_port_settings_apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::{NexusActionContext, NEXUS_DPD_TAG};
use crate::app::map_switch_zone_addrs;
use crate::app::sagas::retry_until_known_result;
use crate::app::sagas::{
declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError,
Expand All @@ -15,15 +16,17 @@ use dpd_client::types::{
RouteSettingsV4, RouteSettingsV6,
};
use dpd_client::{Ipv4Cidr, Ipv6Cidr};
use internal_dns::ServiceName;
use ipnetwork::IpNetwork;
use mg_admin_client::types::Prefix4;
use mg_admin_client::types::{ApplyRequest, BgpPeerConfig, BgpRoute};
use mg_admin_client::types::{ApplyRequest, BgpPeerConfig};
use nexus_db_model::{SwitchLinkFec, SwitchLinkSpeed, NETWORK_KEY};
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::datastore::UpdatePrecondition;
use nexus_db_queries::{authn, db};
use nexus_types::external_api::params;
use omicron_common::api::external::{self, DataPageParams, NameOrId};
use omicron_common::address::SLED_AGENT_PORT;
use omicron_common::api::external::{self, NameOrId};
use omicron_common::api::internal::shared::{
ParseSwitchLocationError, SwitchLocation,
};
Expand All @@ -35,8 +38,8 @@ use sled_agent_client::types::{
BgpPeerConfig as OmicronBgpPeerConfig, HostPortConfig,
};
use std::collections::HashMap;
use std::net::IpAddr;
use std::net::SocketAddrV6;
use std::net::{IpAddr, Ipv6Addr};
use std::str::FromStr;
use std::sync::Arc;
use steno::ActionError;
Expand Down Expand Up @@ -177,7 +180,6 @@ pub(crate) fn api_to_dpd_port_settings(
settings: &SwitchPortSettingsCombinedResult,
) -> Result<PortSettings, String> {
let mut dpd_port_settings = PortSettings {
tag: NEXUS_DPD_TAG.into(),
links: HashMap::new(),
v4_routes: HashMap::new(),
v6_routes: HashMap::new(),
Expand All @@ -192,6 +194,7 @@ pub(crate) fn api_to_dpd_port_settings(
LinkSettings {
params: LinkCreate {
autoneg: false,
lane: Some(LinkId(0)),
kr: false,
fec: match l.fec {
SwitchLinkFec::Firecode => PortFec::Firecode,
Expand Down Expand Up @@ -283,7 +286,13 @@ async fn spa_ensure_switch_port_settings(
})?;

retry_until_known_result(log, || async {
dpd_client.port_settings_apply(&port_id, &dpd_port_settings).await
dpd_client
.port_settings_apply(
&port_id,
Some(NEXUS_DPD_TAG),
&dpd_port_settings,
)
.await
})
.await
.map_err(|e| match e {
Expand Down Expand Up @@ -331,7 +340,9 @@ async fn spa_undo_ensure_switch_port_settings(
Some(id) => id,
None => {
retry_until_known_result(log, || async {
dpd_client.port_settings_clear(&port_id).await
dpd_client
.port_settings_clear(&port_id, Some(NEXUS_DPD_TAG))
.await
})
.await
.map_err(|e| external::Error::internal_error(&e.to_string()))?;
Expand All @@ -355,7 +366,13 @@ async fn spa_undo_ensure_switch_port_settings(
})?;

retry_until_known_result(log, || async {
dpd_client.port_settings_apply(&port_id, &dpd_port_settings).await
dpd_client
.port_settings_apply(
&port_id,
Some(NEXUS_DPD_TAG),
&dpd_port_settings,
)
.await
})
.await
.map_err(|e| external::Error::internal_error(&e.to_string()))?;
Expand Down Expand Up @@ -418,22 +435,6 @@ pub(crate) async fn ensure_switch_port_bgp_settings(
))
})?;

// TODO picking the first configured address by default, but this needs
// to be something that can be specified in the API.
let nexthop = match settings.addresses.get(0) {
Some(switch_port_addr) => Ok(switch_port_addr.address.ip()),
None => Err(ActionError::action_failed(
"at least one address required for bgp peering".to_string(),
)),
}?;

let nexthop = match nexthop {
IpAddr::V4(nexthop) => Ok(nexthop),
IpAddr::V6(_) => Err(ActionError::action_failed(
"IPv6 nexthop not yet supported".to_string(),
)),
}?;

let mut prefixes = Vec::new();
for a in &announcements {
let value = match a.network.ip() {
Expand All @@ -455,7 +456,7 @@ pub(crate) async fn ensure_switch_port_bgp_settings(
connect_retry: peer.connect_retry.0.into(),
keepalive: peer.keepalive.0.into(),
resolution: BGP_SESSION_RESOLUTION,
routes: vec![BgpRoute { nexthop, prefixes }],
originate: prefixes,
};

bgp_peer_configs.push(bpc);
Expand Down Expand Up @@ -809,7 +810,7 @@ pub(crate) async fn select_mg_client(
}

pub(crate) async fn get_scrimlet_address(
_location: SwitchLocation,
location: SwitchLocation,
nexus: &Arc<Nexus>,
) -> Result<SocketAddrV6, ActionError> {
/* TODO this depends on DNS entries only coming from RSS, it's broken
Expand All @@ -826,21 +827,41 @@ pub(crate) async fn get_scrimlet_address(
))
})
*/
let opctx = &nexus.opctx_for_internal_api();
Ok(nexus
.sled_list(opctx, &DataPageParams::max_page())
let result = nexus
.resolver()
.await
.lookup_all_ipv6(ServiceName::Dendrite)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"get_scrimlet_address: failed to list sleds: {e}"
"scrimlet dns lookup failed {e}",
))
})?
.into_iter()
.find(|x| x.is_scrimlet())
.ok_or(ActionError::action_failed(
"get_scrimlet_address: no scrimlets found".to_string(),
))?
.address())
});

let mappings = match result {
Ok(addrs) => map_switch_zone_addrs(&nexus.log, addrs).await,
Err(e) => {
warn!(nexus.log, "Failed to lookup Dendrite address: {e}");
return Err(ActionError::action_failed(format!(
"switch mapping failed {e}",
)));
}
};

let addr = match mappings.get(&location) {
Some(addr) => addr,
None => {
return Err(ActionError::action_failed(format!(
"address for switch at location: {location} not found",
)));
}
};

let mut segments = addr.segments();
segments[7] = 1;
let addr = Ipv6Addr::from(segments);

Ok(SocketAddrV6::new(addr, SLED_AGENT_PORT, 0, 0))
}

#[derive(Clone, Debug)]
Expand Down
12 changes: 9 additions & 3 deletions nexus/src/app/sagas/switch_port_settings_clear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::switch_port_settings_apply::select_dendrite_client;
use super::NexusActionContext;
use super::{NexusActionContext, NEXUS_DPD_TAG};
use crate::app::sagas::retry_until_known_result;
use crate::app::sagas::switch_port_settings_apply::{
api_to_dpd_port_settings, apply_bootstore_update, bootstore_update,
Expand Down Expand Up @@ -154,7 +154,7 @@ async fn spa_clear_switch_port_settings(
let dpd_client = select_dendrite_client(&sagactx).await?;

retry_until_known_result(log, || async {
dpd_client.port_settings_clear(&port_id).await
dpd_client.port_settings_clear(&port_id, Some(NEXUS_DPD_TAG)).await
})
.await
.map_err(|e| ActionError::action_failed(e.to_string()))?;
Expand Down Expand Up @@ -197,7 +197,13 @@ async fn spa_undo_clear_switch_port_settings(
.map_err(ActionError::action_failed)?;

retry_until_known_result(log, || async {
dpd_client.port_settings_apply(&port_id, &dpd_port_settings).await
dpd_client
.port_settings_apply(
&port_id,
Some(NEXUS_DPD_TAG),
&dpd_port_settings,
)
.await
})
.await
.map_err(|e| external::Error::internal_error(&e.to_string()))?;
Expand Down
4 changes: 4 additions & 0 deletions nexus/tests/integration_tests/switch_port.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ use omicron_common::api::external::{
type ControlPlaneTestContext =
nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;

// TODO: unfortunately this test can no longer be run in the integration test
// suite because it depends on communicating with MGS which is not part
// of the infrastructure available in the integration test context.
#[ignore]
#[nexus_test]
async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
let client = &ctx.external_client;
Expand Down
20 changes: 10 additions & 10 deletions package-manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ source.repo = "maghemite"
# `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
# building `ddm-admin-client` (which will instruct you to update
# `tools/maghemite_openapi_version`).
source.commit = "d7169a61fd8833b3a1e6f46d897ca3295b2a28b6"
source.commit = "82aa17646265449ee0ede9410208e510fa4a5877"
# The SHA256 digest is automatically posted to:
# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/maghemite.sha256.txt
source.sha256 = "d871406ed926571efebdab248de08d4f1ca6c31d4f9a691ce47b186474165c57"
Expand All @@ -438,7 +438,7 @@ source.repo = "maghemite"
# `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
# building `ddm-admin-client` (which will instruct you to update
# `tools/maghemite_openapi_version`).
source.commit = "d7169a61fd8833b3a1e6f46d897ca3295b2a28b6"
source.commit = "82aa17646265449ee0ede9410208e510fa4a5877"
# The SHA256 digest is automatically posted to:
# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/mg-ddm.sha256.txt
source.sha256 = "85ec05a8726989b5cb0a567de6b0855f6f84b6f3409ac99ccaf372be5821e45d"
Expand All @@ -453,10 +453,10 @@ source.repo = "maghemite"
# `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
# building `ddm-admin-client` (which will instruct you to update
# `tools/maghemite_openapi_version`).
source.commit = "d7169a61fd8833b3a1e6f46d897ca3295b2a28b6"
source.commit = "82aa17646265449ee0ede9410208e510fa4a5877"
# The SHA256 digest is automatically posted to:
# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/mg-ddm.sha256.txt
source.sha256 = "452dfb3491e1b6d4df6be1cb689921f59623aed082e47606a78c0f44d918f66a"
source.sha256 = "1badd6adfece0a1b661f7efb9a2ca65e471f45cf9c8ecbd72b228ca174311e31"
output.type = "zone"
output.intermediate_only = true

Expand All @@ -473,8 +473,8 @@ only_for_targets.image = "standard"
# 2. Copy dendrite.tar.gz from dendrite/out to omicron/out
source.type = "prebuilt"
source.repo = "dendrite"
source.commit = "343e3a572cc02efe3f8b68f9affd008623a33966"
source.sha256 = "0808f331741e02d55e199847579dfd01f3658b21c7122cef8c3f9279f43dbab0"
source.commit = "559fad2f379900a05ced410944353c1d19100390"
source.sha256 = "ce14c1f0481b13ce47a25386a3b1e49d9570f4c1c31cad3f13c14f75b130dafa"
output.type = "zone"
output.intermediate_only = true

Expand All @@ -498,8 +498,8 @@ only_for_targets.image = "standard"
# 2. Copy the output zone image from dendrite/out to omicron/out
source.type = "prebuilt"
source.repo = "dendrite"
source.commit = "343e3a572cc02efe3f8b68f9affd008623a33966"
source.sha256 = "c359de1be5073a484d86d4c58e8656a36002ce1dc38506f97b730e21615ccae1"
source.commit = "559fad2f379900a05ced410944353c1d19100390"
source.sha256 = "1a1246e2e596f36182eb6a5e402c272d0cd91aab351c5289cc4a29cb822c8888"
output.type = "zone"
output.intermediate_only = true

Expand All @@ -516,8 +516,8 @@ only_for_targets.image = "standard"
# 2. Copy dendrite.tar.gz from dendrite/out to omicron/out/dendrite-softnpu.tar.gz
source.type = "prebuilt"
source.repo = "dendrite"
source.commit = "343e3a572cc02efe3f8b68f9affd008623a33966"
source.sha256 = "110bfbfb2cf3d3471f3e3a64d26129c7a02f6c5857f9623ebb99690728c3b2ff"
source.commit = "559fad2f379900a05ced410944353c1d19100390"
source.sha256 = "b9ff0f7f9e6193f4fa0aff77f7ec80f726f431ce1024a88021f207beb9079793"
output.type = "zone"
output.intermediate_only = true

Expand Down
1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,6 @@ machine-non-gimlet = []
switch-asic = []
switch-stub = []
switch-softnpu = []
switch-hypersoftnpu = []
rack-topology-single-sled = []
rack-topology-multi-sled = []
Loading

0 comments on commit ea4da47

Please sign in to comment.