diff --git a/Cargo.lock b/Cargo.lock index b7c1bb70bd..b8855cb0a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5320,6 +5320,7 @@ dependencies = [ "key-manager", "libc", "macaddr", + "mg-admin-client", "nexus-client 0.1.0", "omicron-common 0.1.0", "omicron-test-utils", diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index 784da8fcc6..155fbf971b 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -103,6 +103,17 @@ pub struct BgpPeerConfig { pub port: String, /// Address of the peer. pub addr: Ipv4Addr, + /// How long to keep a session alive without a keepalive in seconds. + /// Defaults to 6. + pub hold_time: Option<u64>, + /// How long to keep a peer in idle after a state machine reset in seconds. + pub idle_hold_time: Option<u64>, + /// How long to delay sending open messages to a peer. In seconds. + pub delay_open: Option<u64>, + /// The interval in seconds between peer connection retry attempts. + pub connect_retry: Option<u64>, + /// The interval to send keepalive messages at. + pub keepalive: Option<u64>, } #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index d40b09d2be..1676f44083 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -280,6 +280,15 @@ mod test { let message = regex::Regex::new(r"os error \d+") .unwrap() .replace_all(&e, "os error <>"); + // Communication errors differ based on the configuration of the + // machine running the test. For example whether or not the machine + // has IPv6 configured will determine if an error is network + // unreachable or a timeout due to sending a packet to a known + // discard prefix. So just key in on the communication error in a + // general sense. 
+ let message = regex::Regex::new(r"Communication Error.*") + .unwrap() + .replace_all(&message, "Communication Error <>"); write!(&mut s, "error: {}\n", message).unwrap(); } diff --git a/nexus/inventory/tests/output/collector_errors.txt b/nexus/inventory/tests/output/collector_errors.txt index f231cc7d97..4404046253 100644 --- a/nexus/inventory/tests/output/collector_errors.txt +++ b/nexus/inventory/tests/output/collector_errors.txt @@ -41,4 +41,4 @@ cabooses found: RotSlotB baseboard part "FAKE_SIM_SIDECAR" serial "SimSidecar1": board "SimSidecarRot" errors: -error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error: error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error sending request for url (http://[100::1]:12345/ignition): error trying to connect: tcp connect error: Network is unreachable (os error <>): error trying to connect: tcp connect error: Network is unreachable (os error <>): tcp connect error: Network is unreachable (os error <>): Network is unreachable (os error <>) +error: MGS "http://[100::1]:12345": listing ignition targets: Communication Error <> diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index bed690f839..163f3bd5bb 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -675,10 +675,15 @@ impl super::Nexus { addresses: info.addresses.iter().map(|a| a.address).collect(), bgp_peers: peer_info .iter() - .map(|(_p, asn, addr)| BgpPeerConfig { + .map(|(p, asn, addr)| BgpPeerConfig { addr: *addr, asn: *asn, port: port.port_name.clone(), + hold_time: Some(p.hold_time.0.into()), + connect_retry: Some(p.connect_retry.0.into()), + delay_open: Some(p.delay_open.0.into()), + idle_hold_time: Some(p.idle_hold_time.0.into()), + keepalive: Some(p.keepalive.0.into()), }) .collect(), switch: port.switch_location.parse().unwrap(), diff --git a/nexus/src/app/sagas/switch_port_settings_apply.rs 
b/nexus/src/app/sagas/switch_port_settings_apply.rs index 830792826e..0c06d6ff83 100644 --- a/nexus/src/app/sagas/switch_port_settings_apply.rs +++ b/nexus/src/app/sagas/switch_port_settings_apply.rs @@ -962,6 +962,11 @@ pub(crate) async fn bootstore_update( asn: *asn, port: switch_port_name.into(), addr, + hold_time: Some(p.hold_time.0.into()), + connect_retry: Some(p.connect_retry.0.into()), + delay_open: Some(p.delay_open.0.into()), + idle_hold_time: Some(p.idle_hold_time.0.into()), + keepalive: Some(p.keepalive.0.into()), }), IpAddr::V6(_) => { warn!(opctx.log, "IPv6 peers not yet supported"); diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 6dcf756737..362a7e91d8 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -277,6 +277,41 @@ "format": "uint32", "minimum": 0 }, + "connect_retry": { + "nullable": true, + "description": "The interval in seconds between peer connection retry attempts.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "delay_open": { + "nullable": true, + "description": "How long to delay sending open messages to a peer. In seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "hold_time": { + "nullable": true, + "description": "How long to keep a session alive without a keepalive in seconds. 
Defaults to 6.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "idle_hold_time": { + "nullable": true, + "description": "How long to keep a peer in idle after a state machine reset in seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "keepalive": { + "nullable": true, + "description": "The interval to send keepalive messages at.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, "port": { "description": "Switch port the peer is reachable on.", "type": "string" diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 411c52ddff..633058d485 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -803,6 +803,41 @@ "format": "uint32", "minimum": 0 }, + "connect_retry": { + "nullable": true, + "description": "The interval in seconds between peer connection retry attempts.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "delay_open": { + "nullable": true, + "description": "How long to delay sending open messages to a peer. In seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "hold_time": { + "nullable": true, + "description": "How long to keep a session alive without a keepalive in seconds. 
Defaults to 6.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "idle_hold_time": { + "nullable": true, + "description": "How long to keep a peer in idle after a state machine reset in seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "keepalive": { + "nullable": true, + "description": "The interval to send keepalive messages at.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, "port": { "description": "Switch port the peer is reachable on.", "type": "string" diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 486662853c..23512573e6 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -974,6 +974,41 @@ "format": "uint32", "minimum": 0 }, + "connect_retry": { + "nullable": true, + "description": "The interval in seconds between peer connection retry attempts.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "delay_open": { + "nullable": true, + "description": "How long to delay sending open messages to a peer. In seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "hold_time": { + "nullable": true, + "description": "How long to keep a session alive without a keepalive in seconds. 
Defaults to 6.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "idle_hold_time": { + "nullable": true, + "description": "How long to keep a peer in idle after a state machine reset in seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "keepalive": { + "nullable": true, + "description": "The interval to send keepalive messages at.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, "port": { "description": "Switch port the peer is reachable on.", "type": "string" diff --git a/openapi/wicketd.json b/openapi/wicketd.json index e0b37f1ba2..a98e7302d9 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -861,6 +861,41 @@ "format": "uint32", "minimum": 0 }, + "connect_retry": { + "nullable": true, + "description": "The interval in seconds between peer connection retry attempts.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "delay_open": { + "nullable": true, + "description": "How long to delay sending open messages to a peer. In seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "hold_time": { + "nullable": true, + "description": "How long to keep a session alive without a keepalive in seconds. 
Defaults to 6.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "idle_hold_time": { + "nullable": true, + "description": "How long to keep a peer in idle after a state machine reset in seconds.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "keepalive": { + "nullable": true, + "description": "The interval to send keepalive messages at.", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, "port": { "description": "Switch port the peer is reachable on.", "type": "string" diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index 0534c79aef..2ce8ae3bdc 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -132,6 +132,51 @@ "format": "uint32", "minimum": 0.0 }, + "connect_retry": { + "description": "The interval in seconds between peer connection retry attempts.", + "type": [ + "integer", + "null" + ], + "format": "uint64", + "minimum": 0.0 + }, + "delay_open": { + "description": "How long to delay sending open messages to a peer. In seconds.", + "type": [ + "integer", + "null" + ], + "format": "uint64", + "minimum": 0.0 + }, + "hold_time": { + "description": "How long to keep a session alive without a keepalive in seconds. 
Defaults to 6.", + "type": [ + "integer", + "null" + ], + "format": "uint64", + "minimum": 0.0 + }, + "idle_hold_time": { + "description": "How long to keep a peer in idle after a state machine reset in seconds.", + "type": [ + "integer", + "null" + ], + "format": "uint64", + "minimum": 0.0 + }, + "keepalive": { + "description": "The interval to send keepalive messages at.", + "type": [ + "integer", + "null" + ], + "format": "uint64", + "minimum": 0.0 + }, "port": { "description": "Switch port the peer is reachable on.", "type": "string" diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 3889be5eff..030a15f6bd 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -41,6 +41,7 @@ itertools.workspace = true key-manager.workspace = true libc.workspace = true macaddr.workspace = true +mg-admin-client.workspace = true nexus-client.workspace = true omicron-common.workspace = true once_cell.workspace = true diff --git a/sled-agent/src/bootstrap/early_networking.rs b/sled-agent/src/bootstrap/early_networking.rs index a8aa978f9d..bec309dc27 100644 --- a/sled-agent/src/bootstrap/early_networking.rs +++ b/sled-agent/src/bootstrap/early_networking.rs @@ -17,7 +17,9 @@ use gateway_client::Client as MgsClient; use internal_dns::resolver::{ResolveError, Resolver as DnsResolver}; use internal_dns::ServiceName; use ipnetwork::{IpNetwork, Ipv6Network}; -use omicron_common::address::{Ipv6Subnet, MGS_PORT}; +use mg_admin_client::types::{ApplyRequest, BgpPeerConfig, Prefix4}; +use mg_admin_client::Client as MgdClient; +use omicron_common::address::{Ipv6Subnet, MGD_PORT, MGS_PORT}; use omicron_common::address::{DDMD_PORT, DENDRITE_PORT}; use omicron_common::api::internal::shared::{ PortConfigV1, PortFec, PortSpeed, RackNetworkConfig, RackNetworkConfigV1, @@ -37,6 +39,7 @@ use std::time::{Duration, Instant}; use thiserror::Error; static BOUNDARY_SERVICES_ADDR: &str = "fd00:99::1"; +const BGP_SESSION_RESOLUTION: u64 = 100; /// Errors that can occur during early 
network setup #[derive(Error, Debug)] @@ -55,6 +58,12 @@ pub enum EarlyNetworkSetupError { #[error("Error during DNS lookup: {0}")] DnsResolver(#[from] ResolveError), + + #[error("BGP configuration error: {0}")] + BgpConfigurationError(String), + + #[error("MGD error: {0}")] + MgdError(String), } enum LookupSwitchZoneAddrsResult { @@ -453,6 +462,67 @@ impl<'a> EarlyNetworkSetup<'a> { ddmd_client.advertise_prefix(Ipv6Subnet::new(ipv6_entry.addr)); } + let mgd = MgdClient::new( + &self.log, + SocketAddrV6::new(switch_zone_underlay_ip, MGD_PORT, 0, 0).into(), + ) + .map_err(|e| { + EarlyNetworkSetupError::MgdError(format!( + "initialize mgd client: {e}" + )) + })?; + + // Iterate through ports and apply BGP config. + for port in &our_ports { + let mut bgp_peer_configs = Vec::new(); + for peer in &port.bgp_peers { + let config = rack_network_config + .bgp + .iter() + .find(|x| x.asn == peer.asn) + .ok_or(EarlyNetworkSetupError::BgpConfigurationError( + format!( + "asn {} referenced by peer undefined", + peer.asn + ), + ))?; + + let bpc = BgpPeerConfig { + asn: peer.asn, + name: format!("{}", peer.addr), + host: format!("{}:179", peer.addr), + hold_time: peer.hold_time.unwrap_or(6), + idle_hold_time: peer.idle_hold_time.unwrap_or(3), + delay_open: peer.delay_open.unwrap_or(0), + connect_retry: peer.connect_retry.unwrap_or(3), + keepalive: peer.keepalive.unwrap_or(2), + resolution: BGP_SESSION_RESOLUTION, + originate: config + .originate + .iter() + .map(|x| Prefix4 { length: x.prefix(), value: x.ip() }) + .collect(), + }; + bgp_peer_configs.push(bpc); + } + + if bgp_peer_configs.is_empty() { + continue; + } + + mgd.inner + .bgp_apply(&ApplyRequest { + peer_group: port.port.clone(), + peers: bgp_peer_configs, + }) + .await + .map_err(|e| { + EarlyNetworkSetupError::BgpConfigurationError(format!( + "BGP peer configuration failed: {e}", + )) + })?; + } + Ok(our_ports) } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 
5657c7e69a..362d93479d 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -612,6 +612,11 @@ impl ServiceInner { addr: b.addr, asn: b.asn, port: b.port.clone(), + hold_time: b.hold_time, + connect_retry: b.connect_retry, + delay_open: b.delay_open, + idle_hold_time: b.idle_hold_time, + keepalive: b.keepalive, }) .collect(), }) diff --git a/tools/ci_download_softnpu_machinery b/tools/ci_download_softnpu_machinery index cb5ea40210..3efb030063 100755 --- a/tools/ci_download_softnpu_machinery +++ b/tools/ci_download_softnpu_machinery @@ -15,7 +15,7 @@ OUT_DIR="out/npuzone" # Pinned commit for softnpu ASIC simulator SOFTNPU_REPO="softnpu" -SOFTNPU_COMMIT="c1c42398c82b0220c8b5fa3bfba9c7a3bcaa0943" +SOFTNPU_COMMIT="dec63e67156fe6e958991bbfa090629868115ab5" # This is the softnpu ASIC simulator echo "fetching npuzone" diff --git a/tools/create_virtual_hardware.sh b/tools/create_virtual_hardware.sh index 1db40208f7..884d356222 100755 --- a/tools/create_virtual_hardware.sh +++ b/tools/create_virtual_hardware.sh @@ -63,8 +63,9 @@ function ensure_softnpu_zone { --omicron-zone \ --ports sc0_0,tfportrear0_0 \ --ports sc0_1,tfportqsfp0_0 \ - --sidecar-lite-branch main - } + --sidecar-lite-commit f0585a29fb0285f7a1220c1118856b0e5c1f75c5 \ + --softnpu-commit dec63e67156fe6e958991bbfa090629868115ab5 + } "$SOURCE_DIR"/scrimlet/softnpu-init.sh success "softnpu zone exists" } diff --git a/wicket/src/cli/rack_setup/config_toml.rs b/wicket/src/cli/rack_setup/config_toml.rs index e087c9aa7c..c33c1f72fb 100644 --- a/wicket/src/cli/rack_setup/config_toml.rs +++ b/wicket/src/cli/rack_setup/config_toml.rs @@ -274,6 +274,36 @@ fn populate_network_table( "port", Value::String(Formatted::new(p.port.to_string())), ); + if let Some(x) = p.hold_time { + peer.insert( + "hold_time", + Value::Integer(Formatted::new(x as i64)), + ); + } + if let Some(x) = p.connect_retry { + peer.insert( + "connect_retry", + Value::Integer(Formatted::new(x as i64)), + ); + } + 
if let Some(x) = p.delay_open { + peer.insert( + "delay_open", + Value::Integer(Formatted::new(x as i64)), + ); + } + if let Some(x) = p.idle_hold_time { + peer.insert( + "idle_hold_time", + Value::Integer(Formatted::new(x as i64)), + ); + } + if let Some(x) = p.keepalive { + peer.insert( + "keepalive", + Value::Integer(Formatted::new(x as i64)), + ); + } peers.push(Value::InlineTable(peer)); } uplink @@ -389,6 +419,11 @@ mod tests { asn: p.asn, port: p.port.clone(), addr: p.addr, + hold_time: p.hold_time, + connect_retry: p.connect_retry, + delay_open: p.delay_open, + idle_hold_time: p.idle_hold_time, + keepalive: p.keepalive, }) .collect(), port: config.port.clone(), @@ -486,6 +521,11 @@ mod tests { asn: 47, addr: "10.2.3.4".parse().unwrap(), port: "port0".into(), + hold_time: Some(6), + connect_retry: Some(3), + delay_open: Some(0), + idle_hold_time: Some(3), + keepalive: Some(2), }], uplink_port_speed: PortSpeed::Speed400G, uplink_port_fec: PortFec::Firecode, diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs index a96acc56a0..0aaea427f3 100644 --- a/wicketd/src/rss_config.rs +++ b/wicketd/src/rss_config.rs @@ -521,6 +521,11 @@ fn validate_rack_network_config( addr: p.addr, asn: p.asn, port: p.port.clone(), + hold_time: p.hold_time, + connect_retry: p.connect_retry, + delay_open: p.delay_open, + idle_hold_time: p.idle_hold_time, + keepalive: p.keepalive, }) .collect(), switch: match config.switch {