Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for propolis-based softnpu device, fix multi-switch uplink updates. #4390

Merged
merged 6 commits into from
Nov 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,5 @@ impl slog::KV for FileKv {
)
}
}

pub const OMICRON_DPD_TAG: &str = "omicron";
8 changes: 4 additions & 4 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -517,12 +517,12 @@ impl RunningZone {
// services are up, so future requests to create network addresses
// or manipulate services will work.
let fmri = "svc:/milestone/single-user:default";
wait_for_service(Some(&zone.name), fmri).await.map_err(|_| {
BootError::Timeout {
wait_for_service(Some(&zone.name), fmri, zone.log.clone())
.await
.map_err(|_| BootError::Timeout {
service: fmri.to_string(),
zone: zone.name.to_string(),
}
})?;
})?;

// If the zone is self-assembling, then SMF service(s) inside the zone
// will be creating the listen address for the zone's service(s),
Expand Down
32 changes: 31 additions & 1 deletion illumos-utils/src/svc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use omicron_common::backoff;
#[cfg_attr(any(test, feature = "testing"), mockall::automock, allow(dead_code))]
mod inner {
use super::*;
use slog::{warn, Logger};

// TODO(https://www.illumos.org/issues/13837): This is a hack;
// remove me when when fixed. Ideally, the ".synchronous()" argument
Expand All @@ -27,10 +28,19 @@ mod inner {
pub async fn wait_for_service<'a, 'b>(
zone: Option<&'a str>,
fmri: &'b str,
log: Logger,
) -> Result<(), Error> {
let name = smf::PropertyName::new("restarter", "state").unwrap();

let log_notification_failure = |_error, _delay| {};
let log_notification_failure = |error, delay| {
warn!(
log,
"wait for service {:?} failed: {}. retry in {:?}",
zone,
error,
delay
);
};
backoff::retry_notify(
backoff::retry_policy_local(),
|| async {
Expand All @@ -47,6 +57,26 @@ mod inner {
== &smf::PropertyValue::Astring("online".to_string())
{
return Ok(());
} else {
// This is helpful in virtual environments where
// services take a few tries to come up. To enable,
// compile with RUSTFLAGS="--cfg svcadm_autoclear"
#[cfg(svcadm_autoclear)]
if let Some(zname) = zone {
if let Err(out) =
tokio::process::Command::new(crate::PFEXEC)
.env_clear()
.arg("svcadm")
.arg("-z")
.arg(zname)
.arg("clear")
.arg("*")
.output()
.await
{
warn!(log, "clearing service maintenance failed: {out}");
};
}
}
}
return Err(backoff::BackoffError::transient(
Expand Down
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ macro_rules! declare_saga_actions {
};
}

pub(crate) const NEXUS_DPD_TAG: &str = "nexus";
use omicron_common::OMICRON_DPD_TAG as NEXUS_DPD_TAG;

pub(crate) use __action_name;
pub(crate) use __emit_action;
Expand Down
93 changes: 57 additions & 36 deletions nexus/src/app/sagas/switch_port_settings_apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::{NexusActionContext, NEXUS_DPD_TAG};
use crate::app::map_switch_zone_addrs;
use crate::app::sagas::retry_until_known_result;
use crate::app::sagas::{
declare_saga_actions, ActionRegistry, NexusSaga, SagaInitError,
Expand All @@ -15,15 +16,17 @@ use dpd_client::types::{
RouteSettingsV4, RouteSettingsV6,
};
use dpd_client::{Ipv4Cidr, Ipv6Cidr};
use internal_dns::ServiceName;
use ipnetwork::IpNetwork;
use mg_admin_client::types::Prefix4;
use mg_admin_client::types::{ApplyRequest, BgpPeerConfig, BgpRoute};
use mg_admin_client::types::{ApplyRequest, BgpPeerConfig};
use nexus_db_model::{SwitchLinkFec, SwitchLinkSpeed, NETWORK_KEY};
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::datastore::UpdatePrecondition;
use nexus_db_queries::{authn, db};
use nexus_types::external_api::params;
use omicron_common::api::external::{self, DataPageParams, NameOrId};
use omicron_common::address::SLED_AGENT_PORT;
use omicron_common::api::external::{self, NameOrId};
use omicron_common::api::internal::shared::{
ParseSwitchLocationError, SwitchLocation,
};
Expand All @@ -35,8 +38,8 @@ use sled_agent_client::types::{
BgpPeerConfig as OmicronBgpPeerConfig, HostPortConfig,
};
use std::collections::HashMap;
use std::net::IpAddr;
use std::net::SocketAddrV6;
use std::net::{IpAddr, Ipv6Addr};
use std::str::FromStr;
use std::sync::Arc;
use steno::ActionError;
Expand Down Expand Up @@ -177,7 +180,6 @@ pub(crate) fn api_to_dpd_port_settings(
settings: &SwitchPortSettingsCombinedResult,
) -> Result<PortSettings, String> {
let mut dpd_port_settings = PortSettings {
tag: NEXUS_DPD_TAG.into(),
links: HashMap::new(),
v4_routes: HashMap::new(),
v6_routes: HashMap::new(),
Expand All @@ -192,6 +194,7 @@ pub(crate) fn api_to_dpd_port_settings(
LinkSettings {
params: LinkCreate {
autoneg: false,
lane: Some(LinkId(0)),
kr: false,
fec: match l.fec {
SwitchLinkFec::Firecode => PortFec::Firecode,
Expand Down Expand Up @@ -283,7 +286,13 @@ async fn spa_ensure_switch_port_settings(
})?;

retry_until_known_result(log, || async {
dpd_client.port_settings_apply(&port_id, &dpd_port_settings).await
dpd_client
.port_settings_apply(
&port_id,
Some(NEXUS_DPD_TAG),
&dpd_port_settings,
)
.await
})
.await
.map_err(|e| match e {
Expand Down Expand Up @@ -331,7 +340,9 @@ async fn spa_undo_ensure_switch_port_settings(
Some(id) => id,
None => {
retry_until_known_result(log, || async {
dpd_client.port_settings_clear(&port_id).await
dpd_client
.port_settings_clear(&port_id, Some(NEXUS_DPD_TAG))
.await
})
.await
.map_err(|e| external::Error::internal_error(&e.to_string()))?;
Expand All @@ -355,7 +366,13 @@ async fn spa_undo_ensure_switch_port_settings(
})?;

retry_until_known_result(log, || async {
dpd_client.port_settings_apply(&port_id, &dpd_port_settings).await
dpd_client
.port_settings_apply(
&port_id,
Some(NEXUS_DPD_TAG),
&dpd_port_settings,
)
.await
})
.await
.map_err(|e| external::Error::internal_error(&e.to_string()))?;
Expand Down Expand Up @@ -418,22 +435,6 @@ pub(crate) async fn ensure_switch_port_bgp_settings(
))
})?;

// TODO picking the first configured address by default, but this needs
// to be something that can be specified in the API.
let nexthop = match settings.addresses.get(0) {
Some(switch_port_addr) => Ok(switch_port_addr.address.ip()),
None => Err(ActionError::action_failed(
"at least one address required for bgp peering".to_string(),
)),
}?;

let nexthop = match nexthop {
IpAddr::V4(nexthop) => Ok(nexthop),
IpAddr::V6(_) => Err(ActionError::action_failed(
"IPv6 nexthop not yet supported".to_string(),
)),
}?;

let mut prefixes = Vec::new();
for a in &announcements {
let value = match a.network.ip() {
Expand All @@ -455,7 +456,7 @@ pub(crate) async fn ensure_switch_port_bgp_settings(
connect_retry: peer.connect_retry.0.into(),
keepalive: peer.keepalive.0.into(),
resolution: BGP_SESSION_RESOLUTION,
routes: vec![BgpRoute { nexthop, prefixes }],
originate: prefixes,
};

bgp_peer_configs.push(bpc);
Expand Down Expand Up @@ -809,7 +810,7 @@ pub(crate) async fn select_mg_client(
}

pub(crate) async fn get_scrimlet_address(
_location: SwitchLocation,
location: SwitchLocation,
nexus: &Arc<Nexus>,
) -> Result<SocketAddrV6, ActionError> {
/* TODO this depends on DNS entries only coming from RSS, it's broken
Expand All @@ -826,21 +827,41 @@ pub(crate) async fn get_scrimlet_address(
))
})
*/
let opctx = &nexus.opctx_for_internal_api();
Ok(nexus
.sled_list(opctx, &DataPageParams::max_page())
let result = nexus
.resolver()
.await
.lookup_all_ipv6(ServiceName::Dendrite)
.await
.map_err(|e| {
ActionError::action_failed(format!(
"get_scrimlet_address: failed to list sleds: {e}"
"scrimlet dns lookup failed {e}",
))
})?
.into_iter()
.find(|x| x.is_scrimlet())
.ok_or(ActionError::action_failed(
"get_scrimlet_address: no scrimlets found".to_string(),
))?
.address())
});

let mappings = match result {
Ok(addrs) => map_switch_zone_addrs(&nexus.log, addrs).await,
Err(e) => {
warn!(nexus.log, "Failed to lookup Dendrite address: {e}");
return Err(ActionError::action_failed(format!(
"switch mapping failed {e}",
)));
}
};

let addr = match mappings.get(&location) {
Some(addr) => addr,
None => {
return Err(ActionError::action_failed(format!(
"address for switch at location: {location} not found",
)));
}
};

let mut segments = addr.segments();
segments[7] = 1;
let addr = Ipv6Addr::from(segments);

Ok(SocketAddrV6::new(addr, SLED_AGENT_PORT, 0, 0))
}

#[derive(Clone, Debug)]
Expand Down
12 changes: 9 additions & 3 deletions nexus/src/app/sagas/switch_port_settings_clear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::switch_port_settings_apply::select_dendrite_client;
use super::NexusActionContext;
use super::{NexusActionContext, NEXUS_DPD_TAG};
use crate::app::sagas::retry_until_known_result;
use crate::app::sagas::switch_port_settings_apply::{
api_to_dpd_port_settings, apply_bootstore_update, bootstore_update,
Expand Down Expand Up @@ -154,7 +154,7 @@ async fn spa_clear_switch_port_settings(
let dpd_client = select_dendrite_client(&sagactx).await?;

retry_until_known_result(log, || async {
dpd_client.port_settings_clear(&port_id).await
dpd_client.port_settings_clear(&port_id, Some(NEXUS_DPD_TAG)).await
})
.await
.map_err(|e| ActionError::action_failed(e.to_string()))?;
Expand Down Expand Up @@ -197,7 +197,13 @@ async fn spa_undo_clear_switch_port_settings(
.map_err(ActionError::action_failed)?;

retry_until_known_result(log, || async {
dpd_client.port_settings_apply(&port_id, &dpd_port_settings).await
dpd_client
.port_settings_apply(
&port_id,
Some(NEXUS_DPD_TAG),
&dpd_port_settings,
)
.await
})
.await
.map_err(|e| external::Error::internal_error(&e.to_string()))?;
Expand Down
4 changes: 4 additions & 0 deletions nexus/tests/integration_tests/switch_port.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ use omicron_common::api::external::{
type ControlPlaneTestContext =
nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;

// TODO: unfortunately this test can no longer be run in the integration test
// suite because it depends on communicating with MGS which is not part
// of the infrastructure available in the integration test context.
#[ignore]
Comment on lines +27 to +30
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we unable to use the faux MGS here (the same one we use to simulate MGS for the switch zone with SoftNPU)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sure that can be made to work, but it's not a part of the current integration test infrastructure and not something I have the bandwidth to take on at the moment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally understandable. It may be something that fits into the scope of work for the next set of RPW PRs, I was curious if there were any blockers I wasn't aware of.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For what it's worth, as of (I think it was) #4291, the test suite starts a copy of MGS backed by simulated SPs. (I haven't looked at what this test requires so I'm not sure that will work but it may be an option.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool. I'll give this a shot when I merge main back into this branch.

#[nexus_test]
async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
let client = &ctx.external_client;
Expand Down
20 changes: 10 additions & 10 deletions package-manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ source.repo = "maghemite"
# `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
# building `ddm-admin-client` (which will instruct you to update
# `tools/maghemite_openapi_version`).
source.commit = "d7169a61fd8833b3a1e6f46d897ca3295b2a28b6"
source.commit = "82aa17646265449ee0ede9410208e510fa4a5877"
# The SHA256 digest is automatically posted to:
# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/maghemite.sha256.txt
source.sha256 = "d871406ed926571efebdab248de08d4f1ca6c31d4f9a691ce47b186474165c57"
Expand All @@ -438,7 +438,7 @@ source.repo = "maghemite"
# `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
# building `ddm-admin-client` (which will instruct you to update
# `tools/maghemite_openapi_version`).
source.commit = "d7169a61fd8833b3a1e6f46d897ca3295b2a28b6"
source.commit = "82aa17646265449ee0ede9410208e510fa4a5877"
# The SHA256 digest is automatically posted to:
# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/mg-ddm.sha256.txt
source.sha256 = "85ec05a8726989b5cb0a567de6b0855f6f84b6f3409ac99ccaf372be5821e45d"
Expand All @@ -453,10 +453,10 @@ source.repo = "maghemite"
# `tools/maghemite_openapi_version`. Failing to do so will cause a failure when
# building `ddm-admin-client` (which will instruct you to update
# `tools/maghemite_openapi_version`).
source.commit = "d7169a61fd8833b3a1e6f46d897ca3295b2a28b6"
source.commit = "82aa17646265449ee0ede9410208e510fa4a5877"
# The SHA256 digest is automatically posted to:
# https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image/<commit>/mg-ddm.sha256.txt
source.sha256 = "452dfb3491e1b6d4df6be1cb689921f59623aed082e47606a78c0f44d918f66a"
source.sha256 = "1badd6adfece0a1b661f7efb9a2ca65e471f45cf9c8ecbd72b228ca174311e31"
output.type = "zone"
output.intermediate_only = true

Expand All @@ -473,8 +473,8 @@ only_for_targets.image = "standard"
# 2. Copy dendrite.tar.gz from dendrite/out to omicron/out
source.type = "prebuilt"
source.repo = "dendrite"
source.commit = "343e3a572cc02efe3f8b68f9affd008623a33966"
source.sha256 = "0808f331741e02d55e199847579dfd01f3658b21c7122cef8c3f9279f43dbab0"
source.commit = "559fad2f379900a05ced410944353c1d19100390"
source.sha256 = "ce14c1f0481b13ce47a25386a3b1e49d9570f4c1c31cad3f13c14f75b130dafa"
output.type = "zone"
output.intermediate_only = true

Expand All @@ -498,8 +498,8 @@ only_for_targets.image = "standard"
# 2. Copy the output zone image from dendrite/out to omicron/out
source.type = "prebuilt"
source.repo = "dendrite"
source.commit = "343e3a572cc02efe3f8b68f9affd008623a33966"
source.sha256 = "c359de1be5073a484d86d4c58e8656a36002ce1dc38506f97b730e21615ccae1"
source.commit = "559fad2f379900a05ced410944353c1d19100390"
source.sha256 = "1a1246e2e596f36182eb6a5e402c272d0cd91aab351c5289cc4a29cb822c8888"
output.type = "zone"
output.intermediate_only = true

Expand All @@ -516,8 +516,8 @@ only_for_targets.image = "standard"
# 2. Copy dendrite.tar.gz from dendrite/out to omicron/out/dendrite-softnpu.tar.gz
source.type = "prebuilt"
source.repo = "dendrite"
source.commit = "343e3a572cc02efe3f8b68f9affd008623a33966"
source.sha256 = "110bfbfb2cf3d3471f3e3a64d26129c7a02f6c5857f9623ebb99690728c3b2ff"
source.commit = "559fad2f379900a05ced410944353c1d19100390"
source.sha256 = "b9ff0f7f9e6193f4fa0aff77f7ec80f726f431ce1024a88021f207beb9079793"
output.type = "zone"
output.intermediate_only = true

Expand Down
1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,6 @@ machine-non-gimlet = []
switch-asic = []
switch-stub = []
switch-softnpu = []
switch-hypersoftnpu = []
rack-topology-single-sled = []
rack-topology-multi-sled = []
Loading
Loading