diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 1805da8ad8..afc56f40ca 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@f7c663c03b51ed0d93e9cec22a575d3f02175989 # v2 + uses: taiki-e/install-action@6b385b7509c65e9d1b7d6b72244f7e275a7f5cef # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date diff --git a/Cargo.lock b/Cargo.lock index 7df14512fe..b730cbda97 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2276,9 +2276,9 @@ checksum = "aa9a19cbb55df58761df49b23516a86d432839add4af60fc256da840f66ed35b" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] @@ -3040,9 +3040,9 @@ dependencies = [ [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -5099,9 +5099,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.57" +version = "0.10.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bac25ee399abb46215765b1cb35bc0212377e58a061560d8b29b024fd0430e7c" +checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" dependencies = [ "bitflags 2.4.0", "cfg-if 1.0.0", @@ -5131,9 +5131,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.93" +version = "0.9.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d" +checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" dependencies = [ "cc", "libc", @@ -5631,9 +5631,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pest" @@ -9107,12 +9107,12 @@ dependencies = [ [[package]] name = "url" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "143b538f18257fac9cad154828a57c6bf5157e1aa604d4816b5995bf6de87ae5" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", - "idna 0.4.0", + "idna 0.5.0", "percent-encoding", ] diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 6667f759e4..3ecba7e710 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -225,7 +225,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, - kind: s.kind.map(Into::into), + kind: s.kind.into(), interval: s.interval.into(), } } diff --git a/clients/oximeter-client/src/lib.rs b/clients/oximeter-client/src/lib.rs index 
8a03304e06..11aa1452f8 100644 --- a/clients/oximeter-client/src/lib.rs +++ b/clients/oximeter-client/src/lib.rs @@ -43,7 +43,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, - kind: s.kind.map(Into::into), + kind: s.kind.into(), interval: s.interval.into(), } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 1daa85dbe7..780e60b1a2 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -103,7 +103,7 @@ pub struct ProducerEndpoint { /// A unique ID for this producer. pub id: Uuid, /// The kind of producer. - pub kind: Option<ProducerKind>, + pub kind: ProducerKind, /// The IP address and port at which `oximeter` can collect metrics from the /// producer. pub address: SocketAddr, diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index 155fbf971b..15ab4c66ce 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -140,6 +140,8 @@ pub struct PortConfigV1 { pub uplink_port_fec: PortFec, /// BGP peers on this port pub bgp_peers: Vec<BgpPeerConfig>, + /// Whether or not to set autonegotiation + pub autoneg: bool, } impl From<UplinkConfig> for PortConfigV1 { @@ -155,6 +157,7 @@ impl From<UplinkConfig> for PortConfigV1 { uplink_port_speed: value.uplink_port_speed, uplink_port_fec: value.uplink_port_fec, bgp_peers: vec![], + autoneg: false, } } } @@ -260,7 +263,7 @@ pub enum ExternalPortDiscovery { } /// Switchport Speed options -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] +#[derive(Copy, Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] #[serde(rename_all = "snake_case")] pub enum PortSpeed { #[serde(alias = "0G")] @@ -284,7 +287,7 @@ pub enum PortSpeed { } /// Switchport FEC options -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] +#[derive(Copy, Clone, Debug, Deserialize, Serialize, PartialEq, JsonSchema)] #[serde(rename_all = "snake_case")] pub enum PortFec { Firecode, diff --git a/nexus/db-model/src/producer_endpoint.rs b/nexus/db-model/src/producer_endpoint.rs index 52a69e0508..f282f6f08f 100644 --- a/nexus/db-model/src/producer_endpoint.rs +++ b/nexus/db-model/src/producer_endpoint.rs @@ -52,7 +52,7 @@ pub struct ProducerEndpoint { #[diesel(embed)] identity: ProducerEndpointIdentity, - pub kind: Option<ProducerKind>, + pub kind: ProducerKind, pub ip: ipnetwork::IpNetwork, pub port: SqlU16, pub interval: f64, @@ -69,7 +69,7 @@ impl ProducerEndpoint { ) -> Self { Self { identity: ProducerEndpointIdentity::new(endpoint.id), - kind: endpoint.kind.map(Into::into), + kind: endpoint.kind.into(), ip: endpoint.address.ip().into(), port: endpoint.address.port().into(), base_route: endpoint.base_route.clone(), diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 6527da3637..be345032ac 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -146,6 +146,7 @@ table! { mtu -> Int4, fec -> crate::SwitchLinkFecEnum, speed -> crate::SwitchLinkSpeedEnum, + autoneg -> Bool, } } @@ -399,7 +400,7 @@ table! { id -> Uuid, time_created -> Timestamptz, time_modified -> Timestamptz, - kind -> Nullable<crate::ProducerKindEnum>, + kind -> crate::ProducerKindEnum, ip -> Inet, port -> Int4, interval -> Float8, @@ -1300,7 +1301,7 @@ table! { /// /// This should be updated whenever the schema is changed.
For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(15, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(17, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/nexus/db-model/src/switch_port.rs b/nexus/db-model/src/switch_port.rs index 44588899b6..6ff8612d2f 100644 --- a/nexus/db-model/src/switch_port.rs +++ b/nexus/db-model/src/switch_port.rs @@ -355,6 +355,7 @@ pub struct SwitchPortLinkConfig { pub mtu: SqlU16, pub fec: SwitchLinkFec, pub speed: SwitchLinkSpeed, + pub autoneg: bool, } impl SwitchPortLinkConfig { @@ -365,6 +366,7 @@ impl SwitchPortLinkConfig { mtu: u16, fec: SwitchLinkFec, speed: SwitchLinkSpeed, + autoneg: bool, ) -> Self { Self { port_settings_id, @@ -372,6 +374,7 @@ impl SwitchPortLinkConfig { link_name, fec, speed, + autoneg, mtu: mtu.into(), } } diff --git a/nexus/db-queries/src/db/column_walker.rs b/nexus/db-queries/src/db/column_walker.rs new file mode 100644 index 0000000000..64c3b450c8 --- /dev/null +++ b/nexus/db-queries/src/db/column_walker.rs @@ -0,0 +1,112 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! CTE utility for iterating over all columns in a table. + +use diesel::prelude::*; +use std::marker::PhantomData; + +/// Used to iterate over a tuple of columns ("T"). +/// +/// Diesel exposes "AllColumns" as a tuple, which is difficult to iterate over +/// -- after all, all the types are distinct. However, each of these types +/// implements "Column", so we can use a macro to provide a +/// "conversion-to-iterator" implementation for our expected tuples. +pub(crate) struct ColumnWalker<T> { + remaining: PhantomData<T>, +} + +impl<T> ColumnWalker<T> { + pub fn new() -> Self { + Self { remaining: PhantomData } + } +} + +macro_rules! impl_column_walker { + ( $len:literal $($column:ident)+ ) => ( + impl<$($column: Column),+> IntoIterator for ColumnWalker<($($column,)+)> { + type Item = &'static str; + type IntoIter = std::array::IntoIter<Self::Item, $len>; + + fn into_iter(self) -> Self::IntoIter { + [$($column::NAME,)+].into_iter() + } + } + ); +} + +// implementations for 1 - 32 columns +impl_column_walker! { 1 A } +impl_column_walker! { 2 A B } +impl_column_walker! { 3 A B C } +impl_column_walker! { 4 A B C D } +impl_column_walker! { 5 A B C D E } +impl_column_walker! { 6 A B C D E F } +impl_column_walker! { 7 A B C D E F G } +impl_column_walker! { 8 A B C D E F G H } +impl_column_walker! { 9 A B C D E F G H I } +impl_column_walker! { 10 A B C D E F G H I J } +impl_column_walker! { 11 A B C D E F G H I J K } +impl_column_walker! { 12 A B C D E F G H I J K L } +impl_column_walker! { 13 A B C D E F G H I J K L M } +impl_column_walker! { 14 A B C D E F G H I J K L M N } +impl_column_walker! { 15 A B C D E F G H I J K L M N O } +impl_column_walker! { 16 A B C D E F G H I J K L M N O P } +impl_column_walker! { 17 A B C D E F G H I J K L M N O P Q } +impl_column_walker! { 18 A B C D E F G H I J K L M N O P Q R } +impl_column_walker! { 19 A B C D E F G H I J K L M N O P Q R S } +impl_column_walker! { 20 A B C D E F G H I J K L M N O P Q R S T } +impl_column_walker! { 21 A B C D E F G H I J K L M N O P Q R S T U } +impl_column_walker! { 22 A B C D E F G H I J K L M N O P Q R S T U V } +impl_column_walker! { 23 A B C D E F G H I J K L M N O P Q R S T U V W } +impl_column_walker!
{ 24 A B C D E F G H I J K L M N O P Q R S T U V W X } +impl_column_walker! { 25 A B C D E F G H I J K L M N O P Q R S T U V W X Y } +impl_column_walker! { 26 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z } +impl_column_walker! { 27 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A1 } +impl_column_walker! { 28 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A1 B1 } +impl_column_walker! { 29 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A1 B1 C1 } +impl_column_walker! { 30 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A1 B1 C1 D1 } +impl_column_walker! { 31 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A1 B1 C1 D1 E1 } +impl_column_walker! { 32 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A1 B1 C1 D1 E1 F1 } + +#[cfg(test)] +mod test { + use super::*; + + table! { + test_schema.test_table (id) { + id -> Uuid, + value -> Int4, + time_deleted -> Nullable<Timestamptz>, + } + } + + // We can convert all of a table's columns into an iterable format. + #[test] + fn test_walk_table() { + let all_columns = + ColumnWalker::<<test_table::table as Table>::AllColumns>::new(); + + let mut iter = all_columns.into_iter(); + assert_eq!(iter.next(), Some("id")); + assert_eq!(iter.next(), Some("value")); + assert_eq!(iter.next(), Some("time_deleted")); + assert_eq!(iter.next(), None); + } + + // We can, if we want to, also make a ColumnWalker out of an arbitrary tuple + // of columns. + #[test] + fn test_walk_columns() { + let all_columns = ColumnWalker::<( + test_table::columns::id, + test_table::columns::value, + )>::new(); + + let mut iter = all_columns.into_iter(); + assert_eq!(iter.next(), Some("id")); + assert_eq!(iter.next(), Some("value")); + assert_eq!(iter.next(), None); + } +} diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index d7319347f0..6bd4e61f70 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -234,6 +234,7 @@ impl DataStore { c.mtu, c.fec.into(), c.speed.into(), + c.autoneg, )); } result.link_lldp = @@ -304,39 +305,41 @@ impl DataStore { .await?; let mut bgp_peer_config = Vec::new(); - for (interface_name, p) in &params.bgp_peers { - use db::schema::bgp_config; - let bgp_config_id = match &p.bgp_config { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - bgp_config_dsl::bgp_config - .filter(bgp_config::time_deleted.is_null()) - .filter(bgp_config::name.eq(name)) - .select(bgp_config::id) - .limit(1) - .first_async::<Uuid>(&conn) - .await - .map_err(|_| - TxnError::CustomError( - SwitchPortSettingsCreateError::BgpConfigNotFound, - ) - )? - } - }; + for (interface_name, peer_config) in &params.bgp_peers { + for p in &peer_config.peers { + use db::schema::bgp_config; + let bgp_config_id = match &p.bgp_config { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + bgp_config_dsl::bgp_config + .filter(bgp_config::time_deleted.is_null()) + .filter(bgp_config::name.eq(name)) + .select(bgp_config::id) + .limit(1) + .first_async::<Uuid>(&conn) + .await + .map_err(|_| + TxnError::CustomError( + SwitchPortSettingsCreateError::BgpConfigNotFound, + ) + )?
+ } + }; - bgp_peer_config.push(SwitchPortBgpPeerConfig::new( - psid, - bgp_config_id, - interface_name.clone(), - p.addr.into(), - p.hold_time.into(), - p.idle_hold_time.into(), - p.delay_open.into(), - p.connect_retry.into(), - p.keepalive.into(), - )); + bgp_peer_config.push(SwitchPortBgpPeerConfig::new( + psid, + bgp_config_id, + interface_name.clone(), + p.addr.into(), + p.hold_time.into(), + p.idle_hold_time.into(), + p.delay_open.into(), + p.connect_retry.into(), + p.keepalive.into(), + )); + } } result.bgp_peers = diesel::insert_into( @@ -1152,8 +1155,8 @@ mod test { use crate::db::datastore::{datastore_test, UpdatePrecondition}; use nexus_test_utils::db::test_setup_database; use nexus_types::external_api::params::{ - BgpAnnounceSetCreate, BgpConfigCreate, BgpPeerConfig, SwitchPortConfig, - SwitchPortGeometry, SwitchPortSettingsCreate, + BgpAnnounceSetCreate, BgpConfigCreate, BgpPeer, BgpPeerConfig, + SwitchPortConfig, SwitchPortGeometry, SwitchPortSettingsCreate, }; use omicron_common::api::external::{ IdentityMetadataCreateParams, Name, NameOrId, @@ -1217,19 +1220,21 @@ mod test { bgp_peers: HashMap::from([( "phy0".into(), BgpPeerConfig { - bgp_announce_set: NameOrId::Name( - "test-announce-set".parse().unwrap(), - ), - bgp_config: NameOrId::Name( - "test-bgp-config".parse().unwrap(), - ), - interface_name: "qsfp0".into(), - addr: "192.168.1.1".parse().unwrap(), - hold_time: 0, - idle_hold_time: 0, - delay_open: 0, - connect_retry: 0, - keepalive: 0, + peers: vec![BgpPeer { + bgp_announce_set: NameOrId::Name( + "test-announce-set".parse().unwrap(), + ), + bgp_config: NameOrId::Name( + "test-bgp-config".parse().unwrap(), + ), + interface_name: "qsfp0".into(), + addr: "192.168.1.1".parse().unwrap(), + hold_time: 0, + idle_hold_time: 0, + delay_open: 0, + connect_retry: 0, + keepalive: 0, + }], }, )]), addresses: HashMap::new(), diff --git a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs index 83856e10c7..c5c2751723 100644 --- a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs +++ b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs @@ -124,10 +124,12 @@ impl DataStore { .get_result_async(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - assert!( - collection.is_empty(), - "Collection deleted while non-empty: {collection:?}" - ); + + if !collection.is_empty() { + return Err(Error::internal_error(&format!( + "Collection deleted while non-empty: {collection:?}" + ))); + } Ok(()) } diff --git a/nexus/db-queries/src/db/mod.rs b/nexus/db-queries/src/db/mod.rs index 8b7424a056..b7c7079b54 100644 --- a/nexus/db-queries/src/db/mod.rs +++ b/nexus/db-queries/src/db/mod.rs @@ -12,6 +12,7 @@ pub mod collection_attach; pub mod collection_detach; pub mod collection_detach_many; pub mod collection_insert; +mod column_walker; mod config; mod cte_utils; // This is marked public for use by the integration tests diff --git a/nexus/db-queries/src/db/update_and_check.rs b/nexus/db-queries/src/db/update_and_check.rs index d6bf14c083..fed79d5254 100644 --- a/nexus/db-queries/src/db/update_and_check.rs +++ b/nexus/db-queries/src/db/update_and_check.rs @@ -4,6 +4,7 @@ //! CTE implementation for "UPDATE with extended return status". 
+use super::column_walker::ColumnWalker; use super::pool::DbConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::associations::HasTable; @@ -21,7 +22,7 @@ use std::marker::PhantomData; /// allows referencing generics with names (and extending usage /// without re-stating those generic parameters everywhere). pub trait UpdateStatementExt { - type Table: QuerySource; + type Table: Table + QuerySource; type WhereClause; type Changeset; @@ -32,7 +33,7 @@ pub trait UpdateStatementExt { impl<T, U, V> UpdateStatementExt for UpdateStatement<T, U, V> where - T: QuerySource, + T: Table + QuerySource, { type Table = T; type WhereClause = U; @@ -201,11 +202,11 @@ where /// /// ```text /// // WITH found AS (SELECT <primary key> FROM T WHERE <condition>) -/// // updated AS (UPDATE T SET <values> RETURNING *) +/// // updated AS (UPDATE T SET <values> RETURNING <primary key>) /// // SELECT /// // found.<primary key> /// // updated.<primary key> -/// // found.* +/// // found.<all columns> /// // FROM /// // found /// // LEFT JOIN @@ -217,41 +218,48 @@ impl<US, K, Q> QueryFragment<Pg> for UpdateAndQueryStatement<US, K, Q> where US: UpdateStatementExt, US::Table: HasTable<Table = US::Table> + Table, + ColumnWalker<<<US as UpdateStatementExt>::Table as Table>::AllColumns>: + IntoIterator<Item = &'static str>, PrimaryKey<US>: diesel::Column, UpdateStatement<US::Table, US::WhereClause, US::Changeset>: QueryFragment<Pg>, { fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, Pg>) -> QueryResult<()> { + let primary_key = <PrimaryKey<US> as Column>::NAME; + out.push_sql("WITH found AS ("); self.find_subquery.walk_ast(out.reborrow())?; out.push_sql("), updated AS ("); self.update_statement.walk_ast(out.reborrow())?; - // TODO: Only need primary? Or would we actually want - // to pass the returned rows back through the result? - out.push_sql(" RETURNING *) "); + out.push_sql(" RETURNING "); + out.push_identifier(primary_key)?; + out.push_sql(") "); out.push_sql("SELECT"); - let name = <PrimaryKey<US> as Column>::NAME; out.push_sql(" found."); - out.push_identifier(name)?; + out.push_identifier(primary_key)?; out.push_sql(", updated."); - out.push_identifier(name)?; - // TODO: I'd prefer to list all columns explicitly. But how? - // The types exist within Table::AllColumns, and each one - // has a name as "::Name". - // But Table::AllColumns is a tuple, which makes iteration - // a pain. - // - // TODO: Technically, we're repeating the PK here. - out.push_sql(", found.*"); + out.push_identifier(primary_key)?; + + // List all the "found" columns explicitly. + // This admittedly repeats the primary key, but that keeps the query + // "simple" since it returns all columns in the same order as + // AllColumns.
+ let all_columns = ColumnWalker::< + <<US as UpdateStatementExt>::Table as Table>::AllColumns, + >::new(); + for column in all_columns.into_iter() { + out.push_sql(", found."); + out.push_identifier(column)?; + } out.push_sql(" FROM found LEFT JOIN updated ON"); out.push_sql(" found."); - out.push_identifier(name)?; + out.push_identifier(primary_key)?; out.push_sql(" = "); out.push_sql("updated."); - out.push_identifier(name)?; + out.push_identifier(primary_key)?; Ok(()) } diff --git a/nexus/src/app/oximeter.rs b/nexus/src/app/oximeter.rs index 66f39a32b6..a168b35293 100644 --- a/nexus/src/app/oximeter.rs +++ b/nexus/src/app/oximeter.rs @@ -127,9 +127,7 @@ impl super::Nexus { for producer in producers.into_iter() { let producer_info = oximeter_client::types::ProducerEndpoint { id: producer.id(), - kind: producer - .kind - .map(|kind| nexus::ProducerKind::from(kind).into()), + kind: nexus::ProducerKind::from(producer.kind).into(), address: SocketAddr::new( producer.ip.ip(), producer.port.try_into().unwrap(), @@ -152,7 +150,7 @@ impl super::Nexus { pub(crate) async fn register_as_producer(&self, address: SocketAddr) { let producer_endpoint = nexus::ProducerEndpoint { id: self.id, - kind: Some(nexus::ProducerKind::Service), + kind: nexus::ProducerKind::Service, address, base_route: String::from("/metrics/collect"), interval: Duration::from_secs(10), diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 984ece2d0c..95283faa1c 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -23,10 +23,16 @@ use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::params::Address; use nexus_types::external_api::params::AddressConfig; use nexus_types::external_api::params::AddressLotBlockCreate; +use nexus_types::external_api::params::BgpAnnounceSetCreate; +use nexus_types::external_api::params::BgpAnnouncementCreate; +use nexus_types::external_api::params::BgpConfigCreate; +use nexus_types::external_api::params::BgpPeer; +use nexus_types::external_api::params::LinkConfig; +use nexus_types::external_api::params::LldpServiceConfig; use nexus_types::external_api::params::RouteConfig; use nexus_types::external_api::params::SwitchPortConfig; use nexus_types::external_api::params::{ - AddressLotCreate, LoopbackAddressCreate, Route, SiloCreate, + AddressLotCreate, BgpPeerConfig, LoopbackAddressCreate, Route, SiloCreate, SwitchPortSettingsCreate, }; use nexus_types::external_api::shared::Baseboard; @@ -51,8 +57,8 @@ use sled_agent_client::types::EarlyNetworkConfigBody; use sled_agent_client::types::StartSledAgentRequest; use sled_agent_client::types::StartSledAgentRequestBody; use sled_agent_client::types::{ - BgpConfig, BgpPeerConfig, EarlyNetworkConfig, PortConfigV1, - RackNetworkConfigV1, RouteConfig as SledRouteConfig, + BgpConfig, BgpPeerConfig as SledBgpPeerConfig, EarlyNetworkConfig, + PortConfigV1, RackNetworkConfigV1, RouteConfig as SledRouteConfig, }; use std::collections::BTreeMap; use std::collections::BTreeSet; @@ -406,6 +412,108 @@ impl super::Nexus { Error::internal_error(&format!("unable to retrieve authz_address_lot for infra address_lot: {e}")) })?; + let mut bgp_configs = HashMap::new(); + + for bgp_config in &rack_network_config.bgp { + bgp_configs.insert(bgp_config.asn, bgp_config.clone()); + + let bgp_config_name: Name = + format!("as{}", bgp_config.asn).parse().unwrap(); + + let announce_set_name: Name = + format!("as{}-announce", bgp_config.asn).parse().unwrap(); + + let address_lot_name: Name = + format!("as{}-lot", bgp_config.asn).parse().unwrap(); + + self.db_datastore
+ .address_lot_create( + &opctx, + &AddressLotCreate { + identity: IdentityMetadataCreateParams { + name: address_lot_name, + description: format!( + "Address lot for announce set in as {}", + bgp_config.asn + ), + }, + kind: AddressLotKind::Infra, + blocks: bgp_config + .originate + .iter() + .map(|o| AddressLotBlockCreate { + first_address: o.network().into(), + last_address: o.broadcast().into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + Error::internal_error(&format!( + "unable to create address lot for BGP as {}: {}", + bgp_config.asn, e + )) + })?; + + self.db_datastore + .bgp_create_announce_set( + &opctx, + &BgpAnnounceSetCreate { + identity: IdentityMetadataCreateParams { + name: announce_set_name.clone(), + description: format!( + "Announce set for AS {}", + bgp_config.asn + ), + }, + announcement: bgp_config + .originate + .iter() + .map(|x| BgpAnnouncementCreate { + address_lot_block: NameOrId::Name( + format!("as{}", bgp_config.asn) + .parse() + .unwrap(), + ), + network: IpNetwork::from(*x).into(), + }) + .collect(), + }, + ) + .await + .map_err(|e| { + Error::internal_error(&format!( + "unable to create bgp announce set for as {}: {}", + bgp_config.asn, e + )) + })?; + + self.db_datastore + .bgp_config_set( + &opctx, + &BgpConfigCreate { + identity: IdentityMetadataCreateParams { + name: bgp_config_name, + description: format!( + "BGP config for AS {}", + bgp_config.asn + ), + }, + asn: bgp_config.asn, + bgp_announce_set_id: announce_set_name.into(), + vrf: None, + }, + ) + .await + .map_err(|e| { + Error::internal_error(&format!( + "unable to set bgp config for as {}: {}", + bgp_config.asn, e + )) + })?; + } + for (idx, uplink_config) in rack_network_config.ports.iter().enumerate() { @@ -503,6 +611,43 @@ impl super::Nexus { .routes .insert("phy0".to_string(), RouteConfig { routes }); + let peers: Vec = uplink_config + .bgp_peers + .iter() + .map(|r| BgpPeer { + bgp_announce_set: NameOrId::Name( + format!("as{}-announce", r.asn).parse().unwrap(), + ), + bgp_config: NameOrId::Name( + format!("as{}", r.asn).parse().unwrap(), + ), + interface_name: "phy0".into(), + addr: r.addr.into(), + hold_time: r.hold_time.unwrap_or(6) as u32, + idle_hold_time: r.idle_hold_time.unwrap_or(3) as u32, + delay_open: r.delay_open.unwrap_or(0) as u32, + connect_retry: r.connect_retry.unwrap_or(3) as u32, + keepalive: r.keepalive.unwrap_or(2) as u32, + }) + .collect(); + + port_settings_params + .bgp_peers + .insert("phy0".to_string(), BgpPeerConfig { peers }); + + let link = LinkConfig { + mtu: 1500, //TODO https://github.com/oxidecomputer/omicron/issues/2274 + lldp: LldpServiceConfig { + enabled: false, + lldp_config: None, + }, + fec: uplink_config.uplink_port_fec.into(), + speed: uplink_config.uplink_port_speed.into(), + autoneg: uplink_config.autoneg, + }; + + port_settings_params.links.insert("phy".to_string(), link); + match self .db_datastore .switch_port_settings_create( @@ -658,7 +803,7 @@ impl super::Nexus { addresses: info.addresses.iter().map(|a| a.address).collect(), bgp_peers: peer_info .iter() - .map(|(p, asn, addr)| BgpPeerConfig { + .map(|(p, asn, addr)| SledBgpPeerConfig { addr: *addr, asn: *asn, port: port.port_name.clone(), @@ -673,16 +818,21 @@ impl super::Nexus { port: port.port_name.clone(), uplink_port_fec: info .links - .get(0) //TODO breakout support + .get(0) //TODO https://github.com/oxidecomputer/omicron/issues/3062 .map(|l| l.fec) .unwrap_or(SwitchLinkFec::None) .into(), uplink_port_speed: info .links - .get(0) //TODO breakout support + .get(0) //TODO 
https://github.com/oxidecomputer/omicron/issues/3062 .map(|l| l.speed) .unwrap_or(SwitchLinkSpeed::Speed100G) .into(), + autoneg: info + .links + .get(0) //TODO breakout support + .map(|l| l.autoneg) + .unwrap_or(false), }; ports.push(p); diff --git a/nexus/src/app/sagas/switch_port_settings_common.rs b/nexus/src/app/sagas/switch_port_settings_common.rs index b328c6d1ac..9132645782 100644 --- a/nexus/src/app/sagas/switch_port_settings_common.rs +++ b/nexus/src/app/sagas/switch_port_settings_common.rs @@ -55,7 +55,7 @@ pub(crate) fn api_to_dpd_port_settings( link_id.to_string(), LinkSettings { params: LinkCreate { - autoneg: false, + autoneg: l.autoneg, lane: Some(LinkId(0)), kr: false, fec: match l.fec { @@ -251,6 +251,7 @@ pub(crate) async fn bootstore_update( .map(|l| l.speed) .unwrap_or(SwitchLinkSpeed::Speed100G) .into(), + autoneg: settings.links.get(0).map(|l| l.autoneg).unwrap_or(false), bgp_peers: peer_info .iter() .filter_map(|(p, asn)| { diff --git a/nexus/src/app/test_interfaces.rs b/nexus/src/app/test_interfaces.rs index 6161a9a1c1..581b9a89bb 100644 --- a/nexus/src/app/test_interfaces.rs +++ b/nexus/src/app/test_interfaces.rs @@ -10,10 +10,9 @@ use sled_agent_client::Client as SledAgentClient; use std::sync::Arc; use uuid::Uuid; +pub use super::update::HostPhase1Updater; pub use super::update::MgsClients; -pub use super::update::RotUpdateError; pub use super::update::RotUpdater; -pub use super::update::SpUpdateError; pub use super::update::SpUpdater; pub use super::update::UpdateProgress; pub use gateway_client::types::SpType; diff --git a/nexus/src/app/update/common_sp_update.rs b/nexus/src/app/update/common_sp_update.rs new file mode 100644 index 0000000000..69a5b132a2 --- /dev/null +++ b/nexus/src/app/update/common_sp_update.rs @@ -0,0 +1,239 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Module containing implementation details shared amongst all MGS-to-SP-driven +//! updates. + +use super::MgsClients; +use super::UpdateProgress; +use gateway_client::types::SpType; +use gateway_client::types::SpUpdateStatus; +use slog::Logger; +use std::time::Duration; +use tokio::sync::watch; +use uuid::Uuid; + +type GatewayClientError = gateway_client::Error; + +/// Error type returned when an update to a component managed by the SP fails. +/// +/// Note that the SP manages itself, as well, so "SP component" here includes +/// the SP. +#[derive(Debug, thiserror::Error)] +pub enum SpComponentUpdateError { + #[error("error communicating with MGS")] + MgsCommunication(#[from] GatewayClientError), + #[error("different update is now preparing ({0})")] + DifferentUpdatePreparing(Uuid), + #[error("different update is now in progress ({0})")] + DifferentUpdateInProgress(Uuid), + #[error("different update is now complete ({0})")] + DifferentUpdateComplete(Uuid), + #[error("different update is now aborted ({0})")] + DifferentUpdateAborted(Uuid), + #[error("different update failed ({0})")] + DifferentUpdateFailed(Uuid), + #[error("update status lost (did the SP reset?)")] + UpdateStatusLost, + #[error("update was aborted")] + UpdateAborted, + #[error("update failed (error code {0})")] + UpdateFailedWithCode(u32), + #[error("update failed (error message {0})")] + UpdateFailedWithMessage(String), +} + +pub(super) trait SpComponentUpdater { + /// The target component. + /// + /// Should be produced via `SpComponent::const_as_str()`. 
+ fn component(&self) -> &'static str; + + /// The type of the target SP. + fn target_sp_type(&self) -> SpType; + + /// The slot number of the target SP. + fn target_sp_slot(&self) -> u32; + + /// The target firmware slot for the component. + fn firmware_slot(&self) -> u16; + + /// The ID of this update. + fn update_id(&self) -> Uuid; + + /// The update payload data to send to MGS. + // TODO-performance This has to be convertible into a `reqwest::Body`, so we + // return an owned Vec<u8>. That requires all our implementors to clone the data + // at least once; maybe we should use `Bytes` instead (which is cheap to + // clone and also convertible into a reqwest::Body)? + fn update_data(&self) -> Vec<u8>; + + /// The sending half of the watch channel to report update progress. + fn progress(&self) -> &watch::Sender<Option<UpdateProgress>>; + + /// Logger to use while performing this update. + fn logger(&self) -> &Logger; +} + +pub(super) async fn deliver_update( + updater: &(dyn SpComponentUpdater + Send + Sync), + mgs_clients: &mut MgsClients, +) -> Result<(), SpComponentUpdateError> { + // How frequently do we poll MGS for the update progress? + const STATUS_POLL_INTERVAL: Duration = Duration::from_secs(3); + + // Start the update. + mgs_clients + .try_all_serially(updater.logger(), |client| async move { + client + .sp_component_update( + updater.target_sp_type(), + updater.target_sp_slot(), + updater.component(), + updater.firmware_slot(), + &updater.update_id(), + reqwest::Body::from(updater.update_data()), + ) + .await?; + updater.progress().send_replace(Some(UpdateProgress::Started)); + info!( + updater.logger(), "update started"; + "mgs_addr" => client.baseurl(), + ); + Ok(()) + }) + .await?; + + // Wait for the update to complete. + loop { + let status = mgs_clients + .try_all_serially(updater.logger(), |client| async move { + let update_status = client + .sp_component_update_status( + updater.target_sp_type(), + updater.target_sp_slot(), + updater.component(), + ) + .await?; + + debug!( + updater.logger(), "got update status"; + "mgs_addr" => client.baseurl(), + "status" => ?update_status, + ); + + Ok(update_status) + }) + .await?; + + if status_is_complete( + status.into_inner(), + updater.update_id(), + updater.progress(), + updater.logger(), + )? { + updater.progress().send_replace(Some(UpdateProgress::InProgress { + progress: Some(1.0), + })); + return Ok(()); + } + + tokio::time::sleep(STATUS_POLL_INTERVAL).await; + } +} + +fn status_is_complete( + status: SpUpdateStatus, + update_id: Uuid, + progress_tx: &watch::Sender<Option<UpdateProgress>>, + log: &Logger, +) -> Result<bool, SpComponentUpdateError> { + match status { + // For `Preparing` and `InProgress`, we could check the progress + // information returned by these steps and try to check that + // we're still _making_ progress, but every Nexus instance needs + // to do that anyway in case we (or the MGS instance delivering + // the update) crash, so we'll omit that check here. Instead, we + // just sleep and we'll poll again shortly.
+ SpUpdateStatus::Preparing { id, progress } => { + if id == update_id { + let progress = progress.and_then(|progress| { + if progress.current > progress.total { + warn!( + log, "nonsense preparing progress"; + "current" => progress.current, + "total" => progress.total, + ); + None + } else if progress.total == 0 { + None + } else { + Some( + f64::from(progress.current) + / f64::from(progress.total), + ) + } + }); + progress_tx + .send_replace(Some(UpdateProgress::Preparing { progress })); + Ok(false) + } else { + Err(SpComponentUpdateError::DifferentUpdatePreparing(id)) + } + } + SpUpdateStatus::InProgress { id, bytes_received, total_bytes } => { + if id == update_id { + let progress = if bytes_received > total_bytes { + warn!( + log, "nonsense update progress"; + "bytes_received" => bytes_received, + "total_bytes" => total_bytes, + ); + None + } else if total_bytes == 0 { + None + } else { + Some(f64::from(bytes_received) / f64::from(total_bytes)) + }; + progress_tx.send_replace(Some(UpdateProgress::InProgress { + progress, + })); + Ok(false) + } else { + Err(SpComponentUpdateError::DifferentUpdateInProgress(id)) + } + } + SpUpdateStatus::Complete { id } => { + if id == update_id { + Ok(true) + } else { + Err(SpComponentUpdateError::DifferentUpdateComplete(id)) + } + } + SpUpdateStatus::None => Err(SpComponentUpdateError::UpdateStatusLost), + SpUpdateStatus::Aborted { id } => { + if id == update_id { + Err(SpComponentUpdateError::UpdateAborted) + } else { + Err(SpComponentUpdateError::DifferentUpdateAborted(id)) + } + } + SpUpdateStatus::Failed { code, id } => { + if id == update_id { + Err(SpComponentUpdateError::UpdateFailedWithCode(code)) + } else { + Err(SpComponentUpdateError::DifferentUpdateFailed(id)) + } + } + SpUpdateStatus::RotError { id, message } => { + if id == update_id { + Err(SpComponentUpdateError::UpdateFailedWithMessage(format!( + "rot error: {message}" + ))) + } else { + Err(SpComponentUpdateError::DifferentUpdateFailed(id)) + } + } + } +} diff --git a/nexus/src/app/update/host_phase1_updater.rs b/nexus/src/app/update/host_phase1_updater.rs new file mode 100644 index 0000000000..fb013d0ffe --- /dev/null +++ b/nexus/src/app/update/host_phase1_updater.rs @@ -0,0 +1,177 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Module containing types for updating host OS phase1 images via MGS. + +use super::common_sp_update::deliver_update; +use super::common_sp_update::SpComponentUpdater; +use super::MgsClients; +use super::SpComponentUpdateError; +use super::UpdateProgress; +use gateway_client::types::SpComponentFirmwareSlot; +use gateway_client::types::SpType; +use gateway_client::SpComponent; +use slog::Logger; +use tokio::sync::watch; +use uuid::Uuid; + +type GatewayClientError = gateway_client::Error; + +pub struct HostPhase1Updater { + log: Logger, + progress: watch::Sender>, + sp_type: SpType, + sp_slot: u32, + target_host_slot: u16, + update_id: Uuid, + // TODO-clarity maybe a newtype for this? TBD how we get this from + // wherever it's stored, which might give us a stronger type already. 
+ phase1_data: Vec, +} + +impl HostPhase1Updater { + pub fn new( + sp_type: SpType, + sp_slot: u32, + target_host_slot: u16, + update_id: Uuid, + phase1_data: Vec, + log: &Logger, + ) -> Self { + let log = log.new(slog::o!( + "component" => "HostPhase1Updater", + "sp_type" => format!("{sp_type:?}"), + "sp_slot" => sp_slot, + "target_host_slot" => target_host_slot, + "update_id" => format!("{update_id}"), + )); + let progress = watch::Sender::new(None); + Self { + log, + progress, + sp_type, + sp_slot, + target_host_slot, + update_id, + phase1_data, + } + } + + pub fn progress_watcher(&self) -> watch::Receiver> { + self.progress.subscribe() + } + + /// Drive this host phase 1 update to completion (or failure). + /// + /// Only one MGS instance is required to drive an update; however, if + /// multiple MGS instances are available and passed to this method and an + /// error occurs communicating with one instance, `HostPhase1Updater` will + /// try the remaining instances before failing. + pub async fn update( + mut self, + mgs_clients: &mut MgsClients, + ) -> Result<(), SpComponentUpdateError> { + // The async block below wants a `&self` reference, but we take `self` + // for API clarity (to start a new update, the caller should construct a + // new instance of the updater). Create a `&self` ref that we use + // through the remainder of this method. + let me = &self; + + // Prior to delivering the update, ensure the correct target slot is + // activated. + // + // TODO-correctness Should we be doing this, or should a higher level + // executor set this up before calling us? + mgs_clients + .try_all_serially(&self.log, |client| async move { + me.mark_target_slot_active(&client).await + }) + .await?; + + // Deliver and drive the update to completion + deliver_update(&mut self, mgs_clients).await?; + + // Unlike SP and RoT updates, we have nothing to do after delivery of + // the update completes; signal to any watchers that we're done. + self.progress.send_replace(Some(UpdateProgress::Complete)); + + // wait for any progress watchers to be dropped before we return; + // otherwise, they'll get `RecvError`s when trying to check the current + // status + self.progress.closed().await; + + Ok(()) + } + + async fn mark_target_slot_active( + &self, + client: &gateway_client::Client, + ) -> Result<(), GatewayClientError> { + // TODO-correctness Should we always persist this choice? + let persist = true; + + let slot = self.firmware_slot(); + + // TODO-correctness Until + // https://github.com/oxidecomputer/hubris/issues/1172 is fixed, the + // host must be in A2 for this operation to succeed. After it is fixed, + // there will still be a window while a host is booting where this + // operation can fail. How do we handle this? + client + .sp_component_active_slot_set( + self.sp_type, + self.sp_slot, + self.component(), + persist, + &SpComponentFirmwareSlot { slot }, + ) + .await?; + + // TODO-correctness Should we send some kind of update to + // `self.progress`? We haven't actually started delivering an update + // yet, but it seems weird to give no indication that we have + // successfully (potentially) modified the state of the target sled. 
+ + info!( + self.log, "host phase1 target slot marked active"; + "mgs_addr" => client.baseurl(), + ); + + Ok(()) + } +} + +impl SpComponentUpdater for HostPhase1Updater { + fn component(&self) -> &'static str { + SpComponent::HOST_CPU_BOOT_FLASH.const_as_str() + } + + fn target_sp_type(&self) -> SpType { + self.sp_type + } + + fn target_sp_slot(&self) -> u32 { + self.sp_slot + } + + fn firmware_slot(&self) -> u16 { + self.target_host_slot + } + + fn update_id(&self) -> Uuid { + self.update_id + } + + fn update_data(&self) -> Vec<u8> { + self.phase1_data.clone() + } + + fn progress(&self) -> &watch::Sender<Option<UpdateProgress>> { + &self.progress + } + + fn logger(&self) -> &Logger { + &self.log + } +} diff --git a/nexus/src/app/update/mgs_clients.rs b/nexus/src/app/update/mgs_clients.rs index 5915505829..4b200a1819 100644 --- a/nexus/src/app/update/mgs_clients.rs +++ b/nexus/src/app/update/mgs_clients.rs @@ -5,53 +5,14 @@ //! Module providing support for handling failover between multiple MGS clients use futures::Future; -use gateway_client::types::SpType; -use gateway_client::types::SpUpdateStatus; use gateway_client::Client; use slog::Logger; use std::collections::VecDeque; use std::sync::Arc; -use uuid::Uuid; pub(super) type GatewayClientError = gateway_client::Error<gateway_client::types::Error>; -pub(super) enum PollUpdateStatus { - Preparing { progress: Option<f64> }, - InProgress { progress: Option<f64> }, - Complete, -} - -#[derive(Debug, thiserror::Error)] -pub enum UpdateStatusError { - #[error("different update is now preparing ({0})")] - DifferentUpdatePreparing(Uuid), - #[error("different update is now in progress ({0})")] - DifferentUpdateInProgress(Uuid), - #[error("different update is now complete ({0})")] - DifferentUpdateComplete(Uuid), - #[error("different update is now aborted ({0})")] - DifferentUpdateAborted(Uuid), - #[error("different update failed ({0})")] - DifferentUpdateFailed(Uuid), - #[error("update status lost (did the SP reset?)")] - UpdateStatusLost, - #[error("update was aborted")] - UpdateAborted, - #[error("update failed (error code {0})")] - UpdateFailedWithCode(u32), - #[error("update failed (error message {0})")] - UpdateFailedWithMessage(String), -} - -#[derive(Debug, thiserror::Error)] -pub(super) enum PollUpdateStatusError { - #[error(transparent)] - StatusError(#[from] UpdateStatusError), - #[error(transparent)] - ClientError(#[from] GatewayClientError), -} - #[derive(Debug, Clone)] pub struct MgsClients { clients: VecDeque<Arc<Client>>, } @@ -130,111 +91,4 @@ impl MgsClients { // errors. Return the error from the last MGS we tried. Err(GatewayClientError::CommunicationError(last_err.unwrap())) } - - /// Poll for the status of an expected-to-be-in-progress update. - pub(super) async fn poll_update_status( - &mut self, - sp_type: SpType, - sp_slot: u32, - component: &'static str, - update_id: Uuid, - log: &Logger, - ) -> Result<PollUpdateStatus, PollUpdateStatusError> { - let update_status = self - .try_all_serially(log, |client| async move { - let update_status = client - .sp_component_update_status(sp_type, sp_slot, component) - .await?; - - debug!( - log, "got update status"; - "mgs_addr" => client.baseurl(), - "status" => ?update_status, - ); - - Ok(update_status) - }) - .await?
- .into_inner(); - - match update_status { - SpUpdateStatus::Preparing { id, progress } => { - if id == update_id { - let progress = progress.and_then(|progress| { - if progress.current > progress.total { - warn!( - log, "nonsense preparing progress"; - "current" => progress.current, - "total" => progress.total, - ); - None - } else if progress.total == 0 { - None - } else { - Some( - f64::from(progress.current) - / f64::from(progress.total), - ) - } - }); - Ok(PollUpdateStatus::Preparing { progress }) - } else { - Err(UpdateStatusError::DifferentUpdatePreparing(id).into()) - } - } - SpUpdateStatus::InProgress { id, bytes_received, total_bytes } => { - if id == update_id { - let progress = if bytes_received > total_bytes { - warn!( - log, "nonsense update progress"; - "bytes_received" => bytes_received, - "total_bytes" => total_bytes, - ); - None - } else if total_bytes == 0 { - None - } else { - Some(f64::from(bytes_received) / f64::from(total_bytes)) - }; - Ok(PollUpdateStatus::InProgress { progress }) - } else { - Err(UpdateStatusError::DifferentUpdateInProgress(id).into()) - } - } - SpUpdateStatus::Complete { id } => { - if id == update_id { - Ok(PollUpdateStatus::Complete) - } else { - Err(UpdateStatusError::DifferentUpdateComplete(id).into()) - } - } - SpUpdateStatus::None => { - Err(UpdateStatusError::UpdateStatusLost.into()) - } - SpUpdateStatus::Aborted { id } => { - if id == update_id { - Err(UpdateStatusError::UpdateAborted.into()) - } else { - Err(UpdateStatusError::DifferentUpdateAborted(id).into()) - } - } - SpUpdateStatus::Failed { code, id } => { - if id == update_id { - Err(UpdateStatusError::UpdateFailedWithCode(code).into()) - } else { - Err(UpdateStatusError::DifferentUpdateFailed(id).into()) - } - } - SpUpdateStatus::RotError { id, message } => { - if id == update_id { - Err(UpdateStatusError::UpdateFailedWithMessage(format!( - "rot error: {message}" - )) - .into()) - } else { - Err(UpdateStatusError::DifferentUpdateFailed(id).into()) - } - } - } - } } diff --git a/nexus/src/app/update/mod.rs b/nexus/src/app/update/mod.rs index 7d5c642822..5075e421ae 100644 --- a/nexus/src/app/update/mod.rs +++ b/nexus/src/app/update/mod.rs @@ -26,13 +26,17 @@ use std::path::Path; use tokio::io::AsyncWriteExt; use uuid::Uuid; +mod common_sp_update; +mod host_phase1_updater; mod mgs_clients; mod rot_updater; mod sp_updater; -pub use mgs_clients::{MgsClients, UpdateStatusError}; -pub use rot_updater::{RotUpdateError, RotUpdater}; -pub use sp_updater::{SpUpdateError, SpUpdater}; +pub use common_sp_update::SpComponentUpdateError; +pub use host_phase1_updater::HostPhase1Updater; +pub use mgs_clients::MgsClients; +pub use rot_updater::RotUpdater; +pub use sp_updater::SpUpdater; #[derive(Debug, PartialEq, Clone)] pub enum UpdateProgress { diff --git a/nexus/src/app/update/rot_updater.rs b/nexus/src/app/update/rot_updater.rs index d7d21e3b3a..12126a7de9 100644 --- a/nexus/src/app/update/rot_updater.rs +++ b/nexus/src/app/update/rot_updater.rs @@ -4,40 +4,21 @@ //! Module containing types for updating RoTs via MGS. 
-use super::mgs_clients::PollUpdateStatusError; +use super::common_sp_update::deliver_update; +use super::common_sp_update::SpComponentUpdater; use super::MgsClients; +use super::SpComponentUpdateError; use super::UpdateProgress; -use super::UpdateStatusError; -use crate::app::update::mgs_clients::PollUpdateStatus; use gateway_client::types::RotSlot; use gateway_client::types::SpComponentFirmwareSlot; use gateway_client::types::SpType; use gateway_client::SpComponent; use slog::Logger; -use std::time::Duration; use tokio::sync::watch; use uuid::Uuid; type GatewayClientError = gateway_client::Error; -#[derive(Debug, thiserror::Error)] -pub enum RotUpdateError { - #[error("error communicating with MGS")] - MgsCommunication(#[from] GatewayClientError), - - #[error("failed checking update status: {0}")] - PollUpdateStatus(#[from] UpdateStatusError), -} - -impl From for RotUpdateError { - fn from(err: PollUpdateStatusError) -> Self { - match err { - PollUpdateStatusError::StatusError(err) => err.into(), - PollUpdateStatusError::ClientError(err) => err.into(), - } - } -} - pub struct RotUpdater { log: Logger, progress: watch::Sender>, @@ -89,9 +70,14 @@ impl RotUpdater { /// error occurs communicating with one instance, `RotUpdater` will try the /// remaining instances before failing. pub async fn update( - self, - mut mgs_clients: MgsClients, - ) -> Result<(), RotUpdateError> { + mut self, + mgs_clients: &mut MgsClients, + ) -> Result<(), SpComponentUpdateError> { + // Deliver and drive the update to "completion" (which isn't really + // complete for the RoT, since we still have to do the steps below after + // the delivery of the update completes). + deliver_update(&mut self, mgs_clients).await?; + // The async blocks below want `&self` references, but we take `self` // for API clarity (to start a new update, the caller should construct a // new updater). Create a `&self` ref that we use through the remainder @@ -100,23 +86,13 @@ impl RotUpdater { mgs_clients .try_all_serially(&self.log, |client| async move { - me.start_update_one_mgs(&client).await - }) - .await?; - - // `wait_for_update_completion` uses `try_all_mgs_clients` internally, - // so we don't wrap it here. - me.wait_for_update_completion(&mut mgs_clients).await?; - - mgs_clients - .try_all_serially(&self.log, |client| async move { - me.mark_target_slot_active_one_mgs(&client).await + me.mark_target_slot_active(&client).await }) .await?; mgs_clients .try_all_serially(&self.log, |client| async move { - me.finalize_update_via_reset_one_mgs(&client).await + me.finalize_update_via_reset(&client).await }) .await?; @@ -128,82 +104,7 @@ impl RotUpdater { Ok(()) } - async fn start_update_one_mgs( - &self, - client: &gateway_client::Client, - ) -> Result<(), GatewayClientError> { - let firmware_slot = self.target_rot_slot.as_u16(); - - // Start the update. - client - .sp_component_update( - self.sp_type, - self.sp_slot, - SpComponent::ROT.const_as_str(), - firmware_slot, - &self.update_id, - reqwest::Body::from(self.rot_hubris_archive.clone()), - ) - .await?; - - self.progress.send_replace(Some(UpdateProgress::Started)); - - info!( - self.log, "RoT update started"; - "mgs_addr" => client.baseurl(), - ); - - Ok(()) - } - - async fn wait_for_update_completion( - &self, - mgs_clients: &mut MgsClients, - ) -> Result<(), RotUpdateError> { - // How frequently do we poll MGS for the update progress? 
- const STATUS_POLL_INTERVAL: Duration = Duration::from_secs(3); - - loop { - let status = mgs_clients - .poll_update_status( - self.sp_type, - self.sp_slot, - SpComponent::ROT.const_as_str(), - self.update_id, - &self.log, - ) - .await?; - - // For `Preparing` and `InProgress`, we could check the progress - // information returned by these steps and try to check that - // we're still _making_ progress, but every Nexus instance needs - // to do that anyway in case we (or the MGS instance delivering - // the update) crash, so we'll omit that check here. Instead, we - // just sleep and we'll poll again shortly. - match status { - PollUpdateStatus::Preparing { progress } => { - self.progress.send_replace(Some( - UpdateProgress::Preparing { progress }, - )); - } - PollUpdateStatus::InProgress { progress } => { - self.progress.send_replace(Some( - UpdateProgress::InProgress { progress }, - )); - } - PollUpdateStatus::Complete => { - self.progress.send_replace(Some( - UpdateProgress::InProgress { progress: Some(1.0) }, - )); - return Ok(()); - } - } - - tokio::time::sleep(STATUS_POLL_INTERVAL).await; - } - } - - async fn mark_target_slot_active_one_mgs( + async fn mark_target_slot_active( &self, client: &gateway_client::Client, ) -> Result<(), GatewayClientError> { @@ -211,13 +112,13 @@ impl RotUpdater { // tell it to persist our choice. let persist = true; - let slot = self.target_rot_slot.as_u16(); + let slot = self.firmware_slot(); client .sp_component_active_slot_set( self.sp_type, self.sp_slot, - SpComponent::ROT.const_as_str(), + self.component(), persist, &SpComponentFirmwareSlot { slot }, ) @@ -236,16 +137,12 @@ impl RotUpdater { Ok(()) } - async fn finalize_update_via_reset_one_mgs( + async fn finalize_update_via_reset( &self, client: &gateway_client::Client, ) -> Result<(), GatewayClientError> { client - .sp_component_reset( - self.sp_type, - self.sp_slot, - SpComponent::ROT.const_as_str(), - ) + .sp_component_reset(self.sp_type, self.sp_slot, self.component()) .await?; self.progress.send_replace(Some(UpdateProgress::Complete)); @@ -258,15 +155,39 @@ impl RotUpdater { } } -trait RotSlotAsU16 { - fn as_u16(&self) -> u16; -} +impl SpComponentUpdater for RotUpdater { + fn component(&self) -> &'static str { + SpComponent::ROT.const_as_str() + } + + fn target_sp_type(&self) -> SpType { + self.sp_type + } -impl RotSlotAsU16 for RotSlot { - fn as_u16(&self) -> u16 { - match self { + fn target_sp_slot(&self) -> u32 { + self.sp_slot + } + + fn firmware_slot(&self) -> u16 { + match self.target_rot_slot { RotSlot::A => 0, RotSlot::B => 1, } } + + fn update_id(&self) -> Uuid { + self.update_id + } + + fn update_data(&self) -> Vec { + self.rot_hubris_archive.clone() + } + + fn progress(&self) -> &watch::Sender> { + &self.progress + } + + fn logger(&self) -> &Logger { + &self.log + } } diff --git a/nexus/src/app/update/sp_updater.rs b/nexus/src/app/update/sp_updater.rs index 419a733441..2a6ddc6de6 100644 --- a/nexus/src/app/update/sp_updater.rs +++ b/nexus/src/app/update/sp_updater.rs @@ -4,39 +4,19 @@ //! Module containing types for updating SPs via MGS. 
-use crate::app::update::mgs_clients::PollUpdateStatus; - -use super::mgs_clients::PollUpdateStatusError; +use super::common_sp_update::deliver_update; +use super::common_sp_update::SpComponentUpdater; use super::MgsClients; +use super::SpComponentUpdateError; use super::UpdateProgress; -use super::UpdateStatusError; use gateway_client::types::SpType; use gateway_client::SpComponent; use slog::Logger; -use std::time::Duration; use tokio::sync::watch; use uuid::Uuid; type GatewayClientError = gateway_client::Error; -#[derive(Debug, thiserror::Error)] -pub enum SpUpdateError { - #[error("error communicating with MGS")] - MgsCommunication(#[from] GatewayClientError), - - #[error("failed checking update status: {0}")] - PollUpdateStatus(#[from] UpdateStatusError), -} - -impl From for SpUpdateError { - fn from(err: PollUpdateStatusError) -> Self { - match err { - PollUpdateStatusError::StatusError(err) => err.into(), - PollUpdateStatusError::ClientError(err) => err.into(), - } - } -} - pub struct SpUpdater { log: Logger, progress: watch::Sender>, @@ -77,10 +57,15 @@ impl SpUpdater { /// error occurs communicating with one instance, `SpUpdater` will try the /// remaining instances before failing. pub async fn update( - self, - mut mgs_clients: MgsClients, - ) -> Result<(), SpUpdateError> { - // The async blocks below want `&self` references, but we take `self` + mut self, + mgs_clients: &mut MgsClients, + ) -> Result<(), SpComponentUpdateError> { + // Deliver and drive the update to "completion" (which isn't really + // complete for the SP, since we still have to reset it after the + // delivery of the update completes). + deliver_update(&mut self, mgs_clients).await?; + + // The async block below wants a `&self` reference, but we take `self` // for API clarity (to start a new SP update, the caller should // construct a new `SpUpdater`). Create a `&self` ref that we use // through the remainder of this method. @@ -88,17 +73,7 @@ impl SpUpdater { mgs_clients .try_all_serially(&self.log, |client| async move { - me.start_update_one_mgs(&client).await - }) - .await?; - - // `wait_for_update_completion` uses `try_all_mgs_clients` internally, - // so we don't wrap it here. - me.wait_for_update_completion(&mut mgs_clients).await?; - - mgs_clients - .try_all_serially(&self.log, |client| async move { - me.finalize_update_via_reset_one_mgs(&client).await + me.finalize_update_via_reset(&client).await }) .await?; @@ -110,102 +85,57 @@ impl SpUpdater { Ok(()) } - async fn start_update_one_mgs( + async fn finalize_update_via_reset( &self, client: &gateway_client::Client, ) -> Result<(), GatewayClientError> { - // The SP has two firmware slots, but they're aren't individually - // labled. We always request an update to slot 0, which means "the - // inactive slot". - let firmware_slot = 0; - - // Start the update. client - .sp_component_update( - self.sp_type, - self.sp_slot, - SpComponent::SP_ITSELF.const_as_str(), - firmware_slot, - &self.update_id, - reqwest::Body::from(self.sp_hubris_archive.clone()), - ) + .sp_component_reset(self.sp_type, self.sp_slot, self.component()) .await?; - self.progress.send_replace(Some(UpdateProgress::Started)); - + self.progress.send_replace(Some(UpdateProgress::Complete)); info!( - self.log, "SP update started"; + self.log, "SP update complete"; "mgs_addr" => client.baseurl(), ); Ok(()) } +} - async fn wait_for_update_completion( - &self, - mgs_clients: &mut MgsClients, - ) -> Result<(), SpUpdateError> { - // How frequently do we poll MGS for the update progress? 
- const STATUS_POLL_INTERVAL: Duration = Duration::from_secs(3); - - loop { - let status = mgs_clients - .poll_update_status( - self.sp_type, - self.sp_slot, - SpComponent::SP_ITSELF.const_as_str(), - self.update_id, - &self.log, - ) - .await?; - - // For `Preparing` and `InProgress`, we could check the progress - // information returned by these steps and try to check that - // we're still _making_ progress, but every Nexus instance needs - // to do that anyway in case we (or the MGS instance delivering - // the update) crash, so we'll omit that check here. Instead, we - // just sleep and we'll poll again shortly. - match status { - PollUpdateStatus::Preparing { progress } => { - self.progress.send_replace(Some( - UpdateProgress::Preparing { progress }, - )); - } - PollUpdateStatus::InProgress { progress } => { - self.progress.send_replace(Some( - UpdateProgress::InProgress { progress }, - )); - } - PollUpdateStatus::Complete => { - self.progress.send_replace(Some( - UpdateProgress::InProgress { progress: Some(1.0) }, - )); - return Ok(()); - } - } - - tokio::time::sleep(STATUS_POLL_INTERVAL).await; - } +impl SpComponentUpdater for SpUpdater { + fn component(&self) -> &'static str { + SpComponent::SP_ITSELF.const_as_str() } - async fn finalize_update_via_reset_one_mgs( - &self, - client: &gateway_client::Client, - ) -> Result<(), GatewayClientError> { - client - .sp_component_reset( - self.sp_type, - self.sp_slot, - SpComponent::SP_ITSELF.const_as_str(), - ) - .await?; + fn target_sp_type(&self) -> SpType { + self.sp_type + } - self.progress.send_replace(Some(UpdateProgress::Complete)); - info!( - self.log, "SP update complete"; - "mgs_addr" => client.baseurl(), - ); + fn target_sp_slot(&self) -> u32 { + self.sp_slot + } - Ok(()) + fn firmware_slot(&self) -> u16 { + // The SP has two firmware slots, but they aren't individually + // labeled. We always request an update to slot 0, which means "the + // inactive slot". + 0 + } + + fn update_id(&self) -> Uuid { + self.update_id + } + + fn update_data(&self) -> Vec<u8> { + self.sp_hubris_archive.clone() + } + + fn progress(&self) -> &watch::Sender<Option<UpdateProgress>> { + &self.progress + } + + fn logger(&self) -> &Logger { + &self.log } } diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 1e7de6132b..52ff8910f9 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -1093,7 +1093,7 @@ pub async fn start_producer_server( let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, - kind: Some(ProducerKind::Service), + kind: ProducerKind::Service, address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_secs(1), diff --git a/nexus/tests/integration_tests/host_phase1_updater.rs b/nexus/tests/integration_tests/host_phase1_updater.rs new file mode 100644 index 0000000000..01d546636e --- /dev/null +++ b/nexus/tests/integration_tests/host_phase1_updater.rs @@ -0,0 +1,584 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Tests `HostPhase1Updater`'s delivery of updates to host phase 1 flash via +//! MGS to SP.
+
+use gateway_client::types::SpType;
+use gateway_messages::{SpPort, UpdateInProgressStatus, UpdateStatus};
+use gateway_test_utils::setup as mgs_setup;
+use omicron_nexus::app::test_interfaces::{
+    HostPhase1Updater, MgsClients, UpdateProgress,
+};
+use rand::RngCore;
+use sp_sim::SimulatedSp;
+use std::mem;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::io::AsyncWriteExt;
+use tokio::net::TcpListener;
+use tokio::net::TcpStream;
+use tokio::sync::mpsc;
+use uuid::Uuid;
+
+fn make_fake_host_phase1_image() -> Vec<u8> {
+    let mut image = vec![0; 128];
+    rand::thread_rng().fill_bytes(&mut image);
+    image
+}
+
+#[tokio::test]
+async fn test_host_phase1_updater_updates_sled() {
+    // Start MGS + Sim SP.
+    let mgstestctx = mgs_setup::test_setup(
+        "test_host_phase1_updater_updates_sled",
+        SpPort::One,
+    )
+    .await;
+
+    // Configure an MGS client.
+    let mut mgs_clients =
+        MgsClients::from_clients([gateway_client::Client::new(
+            &mgstestctx.client.url("/").to_string(),
+            mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")),
+        )]);
+
+    for target_host_slot in [0, 1] {
+        // Configure and instantiate a `HostPhase1Updater`.
+        let sp_type = SpType::Sled;
+        let sp_slot = 0;
+        let update_id = Uuid::new_v4();
+        let phase1_data = make_fake_host_phase1_image();
+
+        let host_phase1_updater = HostPhase1Updater::new(
+            sp_type,
+            sp_slot,
+            target_host_slot,
+            update_id,
+            phase1_data.clone(),
+            &mgstestctx.logctx.log,
+        );
+
+        // Run the update.
+        host_phase1_updater
+            .update(&mut mgs_clients)
+            .await
+            .expect("update failed");
+
+        // Ensure the SP received the complete update.
+        let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize]
+            .last_host_phase1_update_data(target_host_slot)
+            .await
+            .expect("simulated host phase1 did not receive an update");
+
+        assert_eq!(
+            phase1_data.as_slice(),
+            &*last_update_image,
+            "simulated host phase1 update contents (len {}) \
+            do not match test generated fake image (len {})",
+            last_update_image.len(),
+            phase1_data.len(),
+        );
+    }
+
+    mgstestctx.teardown().await;
+}
+
+#[tokio::test]
+async fn test_host_phase1_updater_remembers_successful_mgs_instance() {
+    // Start MGS + Sim SP.
+    let mgstestctx = mgs_setup::test_setup(
+        "test_host_phase1_updater_remembers_successful_mgs_instance",
+        SpPort::One,
+    )
+    .await;
+
+    // Also start a local TCP server that we will claim is an MGS instance, but
+    // it will close connections immediately after accepting them. This will
+    // allow us to count how many connections it receives, while simultaneously
+    // causing errors in the HostPhase1Updater when it attempts to use this
+    // "MGS".
+    let (failing_mgs_task, failing_mgs_addr, failing_mgs_conn_counter) = {
+        let socket = TcpListener::bind("[::1]:0").await.unwrap();
+        let addr = socket.local_addr().unwrap();
+        let conn_count = Arc::new(AtomicUsize::new(0));
+
+        let task = {
+            let conn_count = Arc::clone(&conn_count);
+            tokio::spawn(async move {
+                loop {
+                    let (mut stream, _peer) = socket.accept().await.unwrap();
+                    conn_count.fetch_add(1, Ordering::SeqCst);
+                    stream.shutdown().await.unwrap();
+                }
+            })
+        };
+
+        (task, addr, conn_count)
+    };
+
+    // Order the MGS clients such that the bogus MGS that immediately closes
+    // connections comes first.
`HostPhase1Updater` should remember that the
+    // second MGS instance succeeds, and only send subsequent requests to it: we
+    // should only see a single attempted connection to the bogus MGS, even
+    // though delivering an update requires a bare minimum of three requests
+    // (set the active flash slot, post the update, and check the status) and
+    // often more (if repeated queries are required to wait for completion).
+    let mut mgs_clients = MgsClients::from_clients([
+        gateway_client::Client::new(
+            &format!("http://{failing_mgs_addr}"),
+            mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")),
+        ),
+        gateway_client::Client::new(
+            &mgstestctx.client.url("/").to_string(),
+            mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")),
+        ),
+    ]);
+
+    let sp_type = SpType::Sled;
+    let sp_slot = 0;
+    let target_host_slot = 0;
+    let update_id = Uuid::new_v4();
+    let phase1_data = make_fake_host_phase1_image();
+
+    let host_phase1_updater = HostPhase1Updater::new(
+        sp_type,
+        sp_slot,
+        target_host_slot,
+        update_id,
+        phase1_data.clone(),
+        &mgstestctx.logctx.log,
+    );
+
+    host_phase1_updater.update(&mut mgs_clients).await.expect("update failed");
+
+    let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize]
+        .last_host_phase1_update_data(target_host_slot)
+        .await
+        .expect("simulated host phase1 did not receive an update");
+
+    assert_eq!(
+        phase1_data.as_slice(),
+        &*last_update_image,
+        "simulated host phase1 update contents (len {}) \
+        do not match test generated fake image (len {})",
+        last_update_image.len(),
+        phase1_data.len(),
+    );
+
+    // Check that our bogus MGS only received a single connection attempt.
+    // (After HostPhase1Updater failed to talk to this instance, it should have
+    // fallen back to the valid one for all further requests.)
+    assert_eq!(
+        failing_mgs_conn_counter.load(Ordering::SeqCst),
+        1,
+        "bogus MGS instance didn't receive the expected number of connections"
+    );
+    failing_mgs_task.abort();
+
+    mgstestctx.teardown().await;
+}
+
+#[tokio::test]
+async fn test_host_phase1_updater_switches_mgs_instances_on_failure() {
+    enum MgsProxy {
+        One(TcpStream),
+        Two(TcpStream),
+    }
+
+    // Start MGS + Sim SP.
+    let mgstestctx = mgs_setup::test_setup(
+        "test_host_phase1_updater_switches_mgs_instances_on_failure",
+        SpPort::One,
+    )
+    .await;
+    let mgs_bind_addr = mgstestctx.client.bind_address;
+
+    let spawn_mgs_proxy_task = |mut stream: TcpStream| {
+        tokio::spawn(async move {
+            let mut mgs_stream = TcpStream::connect(mgs_bind_addr)
+                .await
+                .expect("failed to connect to MGS");
+            tokio::io::copy_bidirectional(&mut stream, &mut mgs_stream)
+                .await
+                .expect("failed to proxy connection to MGS");
+        })
+    };
+
+    // Start two MGS proxy tasks; when each receives an incoming TCP connection,
+    // it forwards that `TcpStream` along the `mgs_proxy_connections` channel
+    // along with a tag of which proxy it is. We'll use this below to flip flop
+    // between MGS "instances" (really these two proxies).
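
// Illustrative sketch (hypothetical code, not the real `MgsClients`
// implementation) of the try-and-remember behavior this test exercises:
// clients are tried in order, a failing client is rotated to the back, and
// whichever client last succeeded stays at the front for later requests.

use std::collections::VecDeque;

fn try_all_serially<T, R, E>(
    clients: &mut VecDeque<T>,
    mut op: impl FnMut(&T) -> Result<R, E>,
) -> Result<R, E> {
    let mut last_err = None;
    for _ in 0..clients.len() {
        match op(&clients[0]) {
            // Success: this client stays at the front for the next request.
            Ok(v) => return Ok(v),
            // Failure: demote this client and try the next one.
            Err(e) => {
                last_err = Some(e);
                clients.rotate_left(1);
            }
        }
    }
    Err(last_err.expect("no MGS clients to try"))
}
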
+ let (mgs_proxy_connections_tx, mut mgs_proxy_connections_rx) = + mpsc::unbounded_channel(); + let (mgs_proxy_one_task, mgs_proxy_one_addr) = { + let socket = TcpListener::bind("[::1]:0").await.unwrap(); + let addr = socket.local_addr().unwrap(); + let mgs_proxy_connections_tx = mgs_proxy_connections_tx.clone(); + let task = tokio::spawn(async move { + loop { + let (stream, _peer) = socket.accept().await.unwrap(); + mgs_proxy_connections_tx.send(MgsProxy::One(stream)).unwrap(); + } + }); + (task, addr) + }; + let (mgs_proxy_two_task, mgs_proxy_two_addr) = { + let socket = TcpListener::bind("[::1]:0").await.unwrap(); + let addr = socket.local_addr().unwrap(); + let task = tokio::spawn(async move { + loop { + let (stream, _peer) = socket.accept().await.unwrap(); + mgs_proxy_connections_tx.send(MgsProxy::Two(stream)).unwrap(); + } + }); + (task, addr) + }; + + // Disable connection pooling so each request gets a new TCP connection. + let client = + reqwest::Client::builder().pool_max_idle_per_host(0).build().unwrap(); + + // Configure two MGS clients pointed at our two proxy tasks. + let mut mgs_clients = MgsClients::from_clients([ + gateway_client::Client::new_with_client( + &format!("http://{mgs_proxy_one_addr}"), + client.clone(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")), + ), + gateway_client::Client::new_with_client( + &format!("http://{mgs_proxy_two_addr}"), + client, + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient2")), + ), + ]); + + let sp_type = SpType::Sled; + let sp_slot = 0; + let target_host_slot = 0; + let update_id = Uuid::new_v4(); + let phase1_data = make_fake_host_phase1_image(); + + let host_phase1_updater = HostPhase1Updater::new( + sp_type, + sp_slot, + target_host_slot, + update_id, + phase1_data.clone(), + &mgstestctx.logctx.log, + ); + + // Spawn the actual update task. + let mut update_task = tokio::spawn(async move { + host_phase1_updater.update(&mut mgs_clients).await + }); + + // Loop over incoming requests. We expect this sequence: + // + // 1. Connection arrives on the first proxy + // 2. We spawn a task to service that request, and set `should_swap` + // 3. Connection arrives on the first proxy + // 4. We drop that connection, flip `expected_proxy`, and clear + // `should_swap` + // 5. Connection arrives on the second proxy + // 6. We spawn a task to service that request, and set `should_swap` + // 7. Connection arrives on the second proxy + // 8. We drop that connection, flip `expected_proxy`, and clear + // `should_swap` + // + // ... repeat until the update is complete. + let mut expected_proxy = 0; + let mut proxy_one_count = 0; + let mut proxy_two_count = 0; + let mut total_requests_handled = 0; + let mut should_swap = false; + loop { + tokio::select! { + Some(proxy_stream) = mgs_proxy_connections_rx.recv() => { + let stream = match proxy_stream { + MgsProxy::One(stream) => { + assert_eq!(expected_proxy, 0); + proxy_one_count += 1; + stream + } + MgsProxy::Two(stream) => { + assert_eq!(expected_proxy, 1); + proxy_two_count += 1; + stream + } + }; + + // Should we trigger `HostPhase1Updater` to swap to the other + // MGS (proxy)? If so, do that by dropping this connection + // (which will cause a client failure) and note that we expect + // the next incoming request to come on the other proxy. + if should_swap { + mem::drop(stream); + expected_proxy ^= 1; + should_swap = false; + } else { + // Otherwise, handle this connection. 
+                    total_requests_handled += 1;
+                    spawn_mgs_proxy_task(stream);
+                    should_swap = true;
+                }
+            }
+
+            result = &mut update_task => {
+                match result {
+                    Ok(Ok(())) => {
+                        mgs_proxy_one_task.abort();
+                        mgs_proxy_two_task.abort();
+                        break;
+                    }
+                    Ok(Err(err)) => panic!("update failed: {err}"),
+                    Err(err) => panic!("update task panicked: {err}"),
+                }
+            }
+        }
+    }
+
+    // A host flash update requires a minimum of 3 requests to MGS: set the
+    // active flash slot, post the update, and check the status. There may be
+    // more requests if the update is not yet complete when the status is
+    // checked, but we can just check that each of our proxies received at least
+    // 2 incoming requests; based on our outline above, if we got the minimum of
+    // 3 requests, it would look like this:
+    //
+    // 1. POST set-active-slot -> first proxy (success)
+    // 2. POST update -> first proxy (fail)
+    // 3. POST update retry -> second proxy (success)
+    // 4. GET status -> second proxy (fail)
+    // 5. GET status retry -> first proxy (success)
+    //
+    // This pattern would repeat if multiple status requests were required, so
+    // we always expect the first proxy to see exactly one more connection
+    // attempt than the second (because it went first before they started
+    // swapping), and the two together should see a total of one less than
+    // double the number of successful requests required.
+    assert!(total_requests_handled >= 3);
+    assert_eq!(proxy_one_count, proxy_two_count + 1);
+    assert_eq!(
+        (proxy_one_count + proxy_two_count + 1) / 2,
+        total_requests_handled
+    );
+
+    let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize]
+        .last_host_phase1_update_data(target_host_slot)
+        .await
+        .expect("simulated host phase1 did not receive an update");
+
+    assert_eq!(
+        phase1_data.as_slice(),
+        &*last_update_image,
+        "simulated host phase1 update contents (len {}) \
+        do not match test generated fake image (len {})",
+        last_update_image.len(),
+        phase1_data.len(),
+    );
+
+    mgstestctx.teardown().await;
+}
+
+#[tokio::test]
+async fn test_host_phase1_updater_delivers_progress() {
+    // Start MGS + Sim SP.
+    let mgstestctx = mgs_setup::test_setup(
+        "test_host_phase1_updater_delivers_progress",
+        SpPort::One,
+    )
+    .await;
+
+    // Configure an MGS client.
+    let mut mgs_clients =
+        MgsClients::from_clients([gateway_client::Client::new(
+            &mgstestctx.client.url("/").to_string(),
+            mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")),
+        )]);
+
+    let sp_type = SpType::Sled;
+    let sp_slot = 0;
+    let target_host_slot = 0;
+    let update_id = Uuid::new_v4();
+    let phase1_data = make_fake_host_phase1_image();
+
+    let host_phase1_updater = HostPhase1Updater::new(
+        sp_type,
+        sp_slot,
+        target_host_slot,
+        update_id,
+        phase1_data.clone(),
+        &mgstestctx.logctx.log,
+    );
+
+    let phase1_data_len = phase1_data.len() as u32;
+
+    // Subscribe to update progress, and check that there is no status yet; we
+    // haven't started the update.
+    let mut progress = host_phase1_updater.progress_watcher();
+    assert_eq!(*progress.borrow_and_update(), None);
+
+    // Install a semaphore on the requests our target SP will receive so we can
+    // inspect progress messages without racing.
+    let target_sp = &mgstestctx.simrack.gimlets[sp_slot as usize];
+    let sp_accept_sema = target_sp.install_udp_accept_semaphore().await;
+    let mut sp_responses = target_sp.responses_sent_count().unwrap();
+
+    // Spawn the update on a background task so we can watch `progress` as it is
+    // applied.
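
// Illustrative sketch of the watch-channel pattern used below (hypothetical
// payload type; the real channel carries `Option<UpdateProgress>`): the
// updater publishes values with `send_replace`, and an observer awaits
// `changed()` and reads them with `borrow_and_update`.

#[tokio::main]
async fn main() {
    let (tx, mut rx) = tokio::sync::watch::channel(None::<u8>);
    let sender = tokio::spawn(async move {
        for pct in [0, 50, 100] {
            tx.send_replace(Some(pct));
        }
    });
    // `changed()` resolves once per observed transition and coalesces any
    // intermediate values; `borrow_and_update()` marks the value as seen.
    while rx.changed().await.is_ok() {
        if *rx.borrow_and_update() == Some(100) {
            break;
        }
    }
    sender.await.unwrap();
}
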
+    let do_update_task = tokio::spawn(async move {
+        host_phase1_updater.update(&mut mgs_clients).await
+    });
+
+    // Allow the SP to respond to 2 messages: the message to activate the target
+    // flash slot and the "prepare update" message that triggers the start of an
+    // update, then ensure we see the "started" progress.
+    sp_accept_sema.send(2).unwrap();
+    progress.changed().await.unwrap();
+    assert_eq!(*progress.borrow_and_update(), Some(UpdateProgress::Started));
+
+    // Ensure our simulated SP is in the state we expect: it's prepared for an
+    // update but has not yet received any data.
+    assert_eq!(
+        target_sp.current_update_status().await,
+        UpdateStatus::InProgress(UpdateInProgressStatus {
+            id: update_id.into(),
+            bytes_received: 0,
+            total_size: phase1_data_len,
+        })
+    );
+
+    // Record the number of responses the SP has sent; we'll use
+    // `sp_responses.changed()` in the loop below, and want to mark whatever
+    // value this watch channel currently has as seen.
+    sp_responses.borrow_and_update();
+
+    // At this point, there are two clients racing each other to talk to our
+    // simulated SP:
+    //
+    // 1. MGS is trying to deliver the update
+    // 2. `host_phase1_updater` is trying to poll (via MGS) for update status
+    //
+    // and we want to ensure that we see any relevant progress reports from
+    // `host_phase1_updater`. We'll let one MGS -> SP message through at a time
+    // (waiting until our SP has responded by waiting for a change to
+    // `sp_responses`) then check its update state: if it changed, the packet we
+    // let through was data from MGS; otherwise, it was a status request from
+    // `host_phase1_updater`.
+    //
+    // This loop will continue until either:
+    //
+    // 1. We see an `UpdateStatus::InProgress` message indicating 100% delivery,
+    //    at which point we break out of the loop
+    // 2. We time out waiting for the previous step (by timing out for either
+    //    the SP to process a request or `host_phase1_updater` to realize
+    //    there's been progress), at which point we panic and fail this test.
+    let mut prev_bytes_received = 0;
+    let mut expect_progress_change = false;
+    loop {
+        // Allow the SP to accept and respond to a single UDP packet.
+        sp_accept_sema.send(1).unwrap();
+
+        // Wait until the SP has sent a response, with a safety rail that we
+        // haven't screwed up our untangle-the-race logic: if we don't see the
+        // SP process any new messages after several seconds, our test is
+        // broken, so fail.
+        tokio::time::timeout(Duration::from_secs(10), sp_responses.changed())
+            .await
+            .expect("timeout waiting for SP response count to change")
+            .expect("sp response count sender dropped");
+
+        // Inspect the SP's in-memory update state; we expect only `InProgress`
+        // or `Complete`, and in either case we note whether we expect to see
+        // status changes from `host_phase1_updater`.
+        match target_sp.current_update_status().await {
+            UpdateStatus::InProgress(sp_progress) => {
+                if sp_progress.bytes_received > prev_bytes_received {
+                    prev_bytes_received = sp_progress.bytes_received;
+                    expect_progress_change = true;
+                    continue;
+                }
+            }
+            UpdateStatus::Complete(_) => {
+                if prev_bytes_received < phase1_data_len {
+                    break;
+                }
+            }
+            status @ (UpdateStatus::None
+                | UpdateStatus::Preparing(_)
+                | UpdateStatus::SpUpdateAuxFlashChckScan { .. }
+                | UpdateStatus::Aborted(_)
+                | UpdateStatus::Failed { .. }
+                | UpdateStatus::RotError { ..
}) => { + panic!("unexpected status {status:?}"); + } + } + + // If we get here, the most recent packet did _not_ change the SP's + // internal update state, so it was a status request from + // `host_phase1_updater`. If we expect the updater to see new progress, + // wait for that change here. + if expect_progress_change { + // Safety rail that we haven't screwed up our untangle-the-race + // logic: if we don't see a new progress after several seconds, our + // test is broken, so fail. + tokio::time::timeout(Duration::from_secs(10), progress.changed()) + .await + .expect("progress timeout") + .expect("progress watch sender dropped"); + let status = progress.borrow_and_update().clone().unwrap(); + expect_progress_change = false; + + assert!( + matches!(status, UpdateProgress::InProgress { .. }), + "unexpected progress status {status:?}" + ); + } + } + + // We know the SP has received a complete update, but `HostPhase1Updater` + // may still need to request status to realize that; release the socket + // semaphore so the SP can respond. + sp_accept_sema.send(usize::MAX).unwrap(); + + // Unlike the SP and RoT cases, there are no MGS/SP steps in between the + // update completing and `HostPhase1Updater` sending + // `UpdateProgress::Complete`. Therefore, it's a race whether we'll see + // some number of `InProgress` status before `Complete`, but we should + // quickly move to `Complete`. + loop { + tokio::time::timeout(Duration::from_secs(10), progress.changed()) + .await + .expect("progress timeout") + .expect("progress watch sender dropped"); + let status = progress.borrow_and_update().clone().unwrap(); + match status { + UpdateProgress::Complete => break, + UpdateProgress::InProgress { .. } => continue, + _ => panic!("unexpected progress status {status:?}"), + } + } + + // drop our progress receiver so `do_update_task` can complete + mem::drop(progress); + + do_update_task.await.expect("update task panicked").expect("update failed"); + + let last_update_image = target_sp + .last_host_phase1_update_data(target_host_slot) + .await + .expect("simulated host phase1 did not receive an update"); + + assert_eq!( + phase1_data.as_slice(), + &*last_update_image, + "simulated host phase1 update contents (len {}) \ + do not match test generated fake image (len {})", + last_update_image.len(), + phase1_data.len(), + ); + + mgstestctx.teardown().await; +} diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index 87c5c74f0f..4d7b41cfa8 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -12,6 +12,7 @@ mod commands; mod console_api; mod device_auth; mod disks; +mod host_phase1_updater; mod images; mod initialization; mod instances; diff --git a/nexus/tests/integration_tests/oximeter.rs b/nexus/tests/integration_tests/oximeter.rs index e97f36daf4..7dc453d713 100644 --- a/nexus/tests/integration_tests/oximeter.rs +++ b/nexus/tests/integration_tests/oximeter.rs @@ -361,7 +361,7 @@ async fn test_oximeter_collector_reregistration_gets_all_assignments() { ids.insert(id); let info = ProducerEndpoint { id, - kind: Some(ProducerKind::Service), + kind: ProducerKind::Service, address: SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 12345), base_route: String::from("/collect"), interval: Duration::from_secs(1), diff --git a/nexus/tests/integration_tests/rot_updater.rs b/nexus/tests/integration_tests/rot_updater.rs index 750f9571d0..2e6d65f8b1 100644 --- a/nexus/tests/integration_tests/rot_updater.rs +++ 
b/nexus/tests/integration_tests/rot_updater.rs @@ -45,10 +45,11 @@ async fn test_rot_updater_updates_sled() { .await; // Configure an MGS client. - let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( - &mgstestctx.client.url("/").to_string(), - mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )]); + let mut mgs_clients = + MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); // Configure and instantiate an `RotUpdater`. let sp_type = SpType::Sled; @@ -67,7 +68,7 @@ async fn test_rot_updater_updates_sled() { ); // Run the update. - rot_updater.update(mgs_clients).await.expect("update failed"); + rot_updater.update(&mut mgs_clients).await.expect("update failed"); // Ensure the RoT received the complete update. let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] @@ -97,10 +98,11 @@ async fn test_rot_updater_updates_switch() { .await; // Configure an MGS client. - let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( - &mgstestctx.client.url("/").to_string(), - mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )]); + let mut mgs_clients = + MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); let sp_type = SpType::Switch; let sp_slot = 0; @@ -117,7 +119,7 @@ async fn test_rot_updater_updates_switch() { &mgstestctx.logctx.log, ); - rot_updater.update(mgs_clients).await.expect("update failed"); + rot_updater.update(&mut mgs_clients).await.expect("update failed"); let last_update_image = mgstestctx.simrack.sidecars[sp_slot as usize] .last_rot_update_data() @@ -177,7 +179,7 @@ async fn test_rot_updater_remembers_successful_mgs_instance() { // delivering an update requires a bare minimum of three requests (start the // update, query the status, reset the RoT) and often more (if repeated // queries are required to wait for completion). - let mgs_clients = MgsClients::from_clients([ + let mut mgs_clients = MgsClients::from_clients([ gateway_client::Client::new( &format!("http://{failing_mgs_addr}"), mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")), @@ -203,7 +205,7 @@ async fn test_rot_updater_remembers_successful_mgs_instance() { &mgstestctx.logctx.log, ); - rot_updater.update(mgs_clients).await.expect("update failed"); + rot_updater.update(&mut mgs_clients).await.expect("update failed"); let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] .last_rot_update_data() @@ -295,7 +297,7 @@ async fn test_rot_updater_switches_mgs_instances_on_failure() { reqwest::Client::builder().pool_max_idle_per_host(0).build().unwrap(); // Configure two MGS clients pointed at our two proxy tasks. - let mgs_clients = MgsClients::from_clients([ + let mut mgs_clients = MgsClients::from_clients([ gateway_client::Client::new_with_client( &format!("http://{mgs_proxy_one_addr}"), client.clone(), @@ -324,7 +326,8 @@ async fn test_rot_updater_switches_mgs_instances_on_failure() { ); // Spawn the actual update task. - let mut update_task = tokio::spawn(rot_updater.update(mgs_clients)); + let mut update_task = + tokio::spawn(async move { rot_updater.update(&mut mgs_clients).await }); // Loop over incoming requests. We expect this sequence: // @@ -447,10 +450,11 @@ async fn test_rot_updater_delivers_progress() { .await; // Configure an MGS client. 
- let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( - &mgstestctx.client.url("/").to_string(), - mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )]); + let mut mgs_clients = + MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); let sp_type = SpType::Sled; let sp_slot = 0; @@ -483,10 +487,11 @@ async fn test_rot_updater_delivers_progress() { // Spawn the update on a background task so we can watch `progress` as it is // applied. - let do_update_task = tokio::spawn(rot_updater.update(mgs_clients)); + let do_update_task = + tokio::spawn(async move { rot_updater.update(&mut mgs_clients).await }); // Allow the SP to respond to 1 message: the "prepare update" messages that - // trigger the start of an update, then ensure we see the "started" + // triggers the start of an update, then ensure we see the "started" // progress. sp_accept_sema.send(1).unwrap(); progress.changed().await.unwrap(); @@ -556,7 +561,6 @@ async fn test_rot_updater_delivers_progress() { UpdateStatus::Complete(_) => { if prev_bytes_received < rot_image_len { prev_bytes_received = rot_image_len; - continue; } } status @ (UpdateStatus::None diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs index 89735ac3d9..1b6764e609 100644 --- a/nexus/tests/integration_tests/sp_updater.rs +++ b/nexus/tests/integration_tests/sp_updater.rs @@ -46,10 +46,11 @@ async fn test_sp_updater_updates_sled() { .await; // Configure an MGS client. - let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( - &mgstestctx.client.url("/").to_string(), - mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )]); + let mut mgs_clients = + MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); // Configure and instantiate an `SpUpdater`. let sp_type = SpType::Sled; @@ -66,7 +67,7 @@ async fn test_sp_updater_updates_sled() { ); // Run the update. - sp_updater.update(mgs_clients).await.expect("update failed"); + sp_updater.update(&mut mgs_clients).await.expect("update failed"); // Ensure the SP received the complete update. let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] @@ -96,10 +97,11 @@ async fn test_sp_updater_updates_switch() { .await; // Configure an MGS client. - let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( - &mgstestctx.client.url("/").to_string(), - mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )]); + let mut mgs_clients = + MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); let sp_type = SpType::Switch; let sp_slot = 0; @@ -114,7 +116,7 @@ async fn test_sp_updater_updates_switch() { &mgstestctx.logctx.log, ); - sp_updater.update(mgs_clients).await.expect("update failed"); + sp_updater.update(&mut mgs_clients).await.expect("update failed"); let last_update_image = mgstestctx.simrack.sidecars[sp_slot as usize] .last_sp_update_data() @@ -174,7 +176,7 @@ async fn test_sp_updater_remembers_successful_mgs_instance() { // delivering an update requires a bare minimum of three requests (start the // update, query the status, reset the SP) and often more (if repeated // queries are required to wait for completion). 
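
// A small worked example (hypothetical helper, mirroring the comment above):
// with a serial try-and-remember strategy, a first client that always fails
// is contacted exactly once, no matter how many requests the update needs.

fn connections_to_bad_client(successful_requests: u32) -> u32 {
    // The first request tries the bad client (one connection), fails over,
    // and every later request goes straight to the remembered good client.
    if successful_requests > 0 { 1 } else { 0 }
}

fn main() {
    // The minimum of three requests (start the update, query the status,
    // reset the SP) still yields a single connection to the bad instance.
    assert_eq!(connections_to_bad_client(3), 1);
}
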
- let mgs_clients = MgsClients::from_clients([ + let mut mgs_clients = MgsClients::from_clients([ gateway_client::Client::new( &format!("http://{failing_mgs_addr}"), mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient1")), @@ -198,7 +200,7 @@ async fn test_sp_updater_remembers_successful_mgs_instance() { &mgstestctx.logctx.log, ); - sp_updater.update(mgs_clients).await.expect("update failed"); + sp_updater.update(&mut mgs_clients).await.expect("update failed"); let last_update_image = mgstestctx.simrack.gimlets[sp_slot as usize] .last_sp_update_data() @@ -290,7 +292,7 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() { reqwest::Client::builder().pool_max_idle_per_host(0).build().unwrap(); // Configure two MGS clients pointed at our two proxy tasks. - let mgs_clients = MgsClients::from_clients([ + let mut mgs_clients = MgsClients::from_clients([ gateway_client::Client::new_with_client( &format!("http://{mgs_proxy_one_addr}"), client.clone(), @@ -317,7 +319,8 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() { ); // Spawn the actual update task. - let mut update_task = tokio::spawn(sp_updater.update(mgs_clients)); + let mut update_task = + tokio::spawn(async move { sp_updater.update(&mut mgs_clients).await }); // Loop over incoming requests. We expect this sequence: // @@ -436,10 +439,11 @@ async fn test_sp_updater_delivers_progress() { .await; // Configure an MGS client. - let mgs_clients = MgsClients::from_clients([gateway_client::Client::new( - &mgstestctx.client.url("/").to_string(), - mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), - )]); + let mut mgs_clients = + MgsClients::from_clients([gateway_client::Client::new( + &mgstestctx.client.url("/").to_string(), + mgstestctx.logctx.log.new(slog::o!("component" => "MgsClient")), + )]); let sp_type = SpType::Sled; let sp_slot = 0; @@ -470,10 +474,11 @@ async fn test_sp_updater_delivers_progress() { // Spawn the update on a background task so we can watch `progress` as it is // applied. - let do_update_task = tokio::spawn(sp_updater.update(mgs_clients)); + let do_update_task = + tokio::spawn(async move { sp_updater.update(&mut mgs_clients).await }); // Allow the SP to respond to 2 messages: the caboose check and the "prepare - // update" messages that trigger the start of an update, then ensure we see + // update" messages that triggers the start of an update, then ensure we see // the "started" progress. 
     sp_accept_sema.send(2).unwrap();
     progress.changed().await.unwrap();
@@ -543,7 +548,6 @@ async fn test_sp_updater_delivers_progress() {
             UpdateStatus::Complete(_) => {
                 if prev_bytes_received < sp_image_len {
                     prev_bytes_received = sp_image_len;
-                    continue;
                 }
             }
             status @ (UpdateStatus::None
diff --git a/nexus/tests/integration_tests/switch_port.rs b/nexus/tests/integration_tests/switch_port.rs
index d163fc6b06..df4d96c6d1 100644
--- a/nexus/tests/integration_tests/switch_port.rs
+++ b/nexus/tests/integration_tests/switch_port.rs
@@ -10,7 +10,7 @@ use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder};
 use nexus_test_utils_macros::nexus_test;
 use nexus_types::external_api::params::{
     Address, AddressConfig, AddressLotBlockCreate, AddressLotCreate,
-    BgpAnnounceSetCreate, BgpAnnouncementCreate, BgpConfigCreate,
+    BgpAnnounceSetCreate, BgpAnnouncementCreate, BgpConfigCreate, BgpPeer,
     BgpPeerConfig, LinkConfig, LinkFec, LinkSpeed, LldpServiceConfig, Route,
     RouteConfig, SwitchInterfaceConfig, SwitchInterfaceKind,
     SwitchPortApplySettings, SwitchPortSettingsCreate,
@@ -118,6 +118,7 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
             lldp: LldpServiceConfig { enabled: false, lldp_config: None },
             fec: LinkFec::None,
             speed: LinkSpeed::Speed100G,
+            autoneg: false,
         },
     );
     // interfaces
@@ -252,15 +253,17 @@ async fn test_port_settings_basic_crud(ctx: &ControlPlaneTestContext) {
     settings.bgp_peers.insert(
         "phy0".into(),
         BgpPeerConfig {
-            bgp_config: NameOrId::Name("as47".parse().unwrap()), //TODO
-            bgp_announce_set: NameOrId::Name("instances".parse().unwrap()), //TODO
-            interface_name: "phy0".to_string(),
-            addr: "1.2.3.4".parse().unwrap(),
-            hold_time: 6,
-            idle_hold_time: 6,
-            delay_open: 0,
-            connect_retry: 3,
-            keepalive: 2,
+            peers: vec![BgpPeer {
+                bgp_config: NameOrId::Name("as47".parse().unwrap()),
+                bgp_announce_set: NameOrId::Name("instances".parse().unwrap()),
+                interface_name: "phy0".to_string(),
+                addr: "1.2.3.4".parse().unwrap(),
+                hold_time: 6,
+                idle_hold_time: 6,
+                delay_open: 0,
+                connect_retry: 3,
+                keepalive: 2,
+            }],
         },
     );
     let _created: SwitchPortSettingsView = NexusRequest::objects_post(
diff --git a/nexus/types/src/external_api/params.rs b/nexus/types/src/external_api/params.rs
index a5f1f3f874..3303d38367 100644
--- a/nexus/types/src/external_api/params.rs
+++ b/nexus/types/src/external_api/params.rs
@@ -1354,6 +1354,18 @@ pub enum LinkFec {
     Rs,
 }

+impl From<omicron_common::api::internal::shared::PortFec> for LinkFec {
+    fn from(x: omicron_common::api::internal::shared::PortFec) -> LinkFec {
+        match x {
+            omicron_common::api::internal::shared::PortFec::Firecode => {
+                Self::Firecode
+            }
+            omicron_common::api::internal::shared::PortFec::None => Self::None,
+            omicron_common::api::internal::shared::PortFec::Rs => Self::Rs,
+        }
+    }
+}
+
 /// The speed of a link.
 #[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)]
 #[serde(rename_all = "snake_case")]
@@ -1378,6 +1390,40 @@ pub enum LinkSpeed {
     Speed400G,
 }

+impl From<omicron_common::api::internal::shared::PortSpeed> for LinkSpeed {
+    fn from(x: omicron_common::api::internal::shared::PortSpeed) -> Self {
+        match x {
+            omicron_common::api::internal::shared::PortSpeed::Speed0G => {
+                Self::Speed0G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed1G => {
+                Self::Speed1G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed10G => {
+                Self::Speed10G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed25G => {
+                Self::Speed25G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed40G => {
+                Self::Speed40G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed50G => {
+                Self::Speed50G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed100G => {
+                Self::Speed100G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed200G => {
+                Self::Speed200G
+            }
+            omicron_common::api::internal::shared::PortSpeed::Speed400G => {
+                Self::Speed400G
+            }
+        }
+    }
+}
+
 /// Switch link configuration.
 #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
 pub struct LinkConfig {
@@ -1392,6 +1438,9 @@ pub struct LinkConfig {
     /// The speed of the link.
     pub speed: LinkSpeed,
+
+    /// Whether or not to set autonegotiation
+    pub autoneg: bool,
 }

 /// The LLDP configuration associated with a port. LLDP may be either enabled or
@@ -1479,12 +1528,17 @@ pub struct BgpConfigListSelector {
     pub name_or_id: Option<NameOrId>,
 }

+#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
+pub struct BgpPeerConfig {
+    pub peers: Vec<BgpPeer>,
+}
+
 /// A BGP peer configuration for an interface. Includes the set of announcements
 /// that will be advertised to the peer identified by `addr`. The `bgp_config`
 /// parameter is a reference to global BGP parameters. The `interface_name`
 /// indicates what interface the peer should be contacted on.
 #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
-pub struct BgpPeerConfig {
+pub struct BgpPeer {
     /// The set of announcements advertised by the peer.
pub bgp_announce_set: NameOrId, diff --git a/openapi/bootstrap-agent.json b/openapi/bootstrap-agent.json index 2c7ffbc337..efd9c05fa9 100644 --- a/openapi/bootstrap-agent.json +++ b/openapi/bootstrap-agent.json @@ -510,6 +510,10 @@ "$ref": "#/components/schemas/IpNetwork" } }, + "autoneg": { + "description": "Whether or not to set autonegotiation", + "type": "boolean" + }, "bgp_peers": { "description": "BGP peers on this port", "type": "array", @@ -555,6 +559,7 @@ }, "required": [ "addresses", + "autoneg", "bgp_peers", "port", "routes", diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index c358b4109b..82c799b78d 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4240,6 +4240,10 @@ "$ref": "#/components/schemas/IpNetwork" } }, + "autoneg": { + "description": "Whether or not to set autonegotiation", + "type": "boolean" + }, "bgp_peers": { "description": "BGP peers on this port", "type": "array", @@ -4285,6 +4289,7 @@ }, "required": [ "addresses", + "autoneg", "bgp_peers", "port", "routes", @@ -4343,7 +4348,6 @@ ] }, "kind": { - "nullable": true, "description": "The kind of producer.", "allOf": [ { @@ -4356,7 +4360,8 @@ "address", "base_route", "id", - "interval" + "interval", + "kind" ] }, "ProducerKind": { diff --git a/openapi/nexus.json b/openapi/nexus.json index 08e6cd7149..15e75f93ff 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -7865,7 +7865,7 @@ "switch" ] }, - "BgpPeerConfig": { + "BgpPeer": { "description": "A BGP peer configuration for an interface. Includes the set of announcements that will be advertised to the peer identified by `addr`. The `bgp_config` parameter is a reference to global BGP parameters. The `interface_name` indicates what interface the peer should be contacted on.", "type": "object", "properties": { @@ -7937,6 +7937,20 @@ "keepalive" ] }, + "BgpPeerConfig": { + "type": "object", + "properties": { + "peers": { + "type": "array", + "items": { + "$ref": "#/components/schemas/BgpPeer" + } + } + }, + "required": [ + "peers" + ] + }, "BgpPeerState": { "description": "The current state of a BGP peer.", "oneOf": [ @@ -11938,6 +11952,10 @@ "description": "Switch link configuration.", "type": "object", "properties": { + "autoneg": { + "description": "Whether or not to set autonegotiation", + "type": "boolean" + }, "fec": { "description": "The forward error correction mode of the link.", "allOf": [ @@ -11970,6 +11988,7 @@ } }, "required": [ + "autoneg", "fec", "lldp", "mtu", diff --git a/openapi/oximeter.json b/openapi/oximeter.json index f7e534c95d..f5c78d53cd 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -212,7 +212,6 @@ ] }, "kind": { - "nullable": true, "description": "The kind of producer.", "allOf": [ { @@ -225,7 +224,8 @@ "address", "base_route", "id", - "interval" + "interval", + "kind" ] }, "ProducerEndpointResultsPage": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 098aee3cb4..6a0d692e99 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -5337,6 +5337,10 @@ "$ref": "#/components/schemas/IpNetwork" } }, + "autoneg": { + "description": "Whether or not to set autonegotiation", + "type": "boolean" + }, "bgp_peers": { "description": "BGP peers on this port", "type": "array", @@ -5382,6 +5386,7 @@ }, "required": [ "addresses", + "autoneg", "bgp_peers", "port", "routes", diff --git a/openapi/wicketd.json b/openapi/wicketd.json index 60ad9a42df..32e3b70de2 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -1545,6 +1545,10 @@ 
"$ref": "#/components/schemas/IpNetwork" } }, + "autoneg": { + "description": "Whether or not to set autonegotiation", + "type": "boolean" + }, "bgp_peers": { "description": "BGP peers on this port", "type": "array", @@ -1590,6 +1594,7 @@ }, "required": [ "addresses", + "autoneg", "bgp_peers", "port", "routes", diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index f6da172909..4135125a48 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -659,6 +659,24 @@ mod tests { use tokio::time::Instant; use uuid::Uuid; + // Interval on which oximeter collects from producers in these tests. + const COLLECTION_INTERVAL: Duration = Duration::from_secs(1); + + // Interval in calls to `tokio::time::advance`. This must be sufficiently + // small relative to `COLLECTION_INTERVAL` to ensure all ticks of internal + // timers complete as expected. + const TICK_INTERVAL: Duration = Duration::from_millis(10); + + // Total number of collection attempts. + const N_COLLECTIONS: u64 = 5; + + // Period these tests wait using `tokio::time::advance()` before checking + // their test conditions. + const TEST_WAIT_PERIOD: Duration = Duration::from_millis( + COLLECTION_INTERVAL.as_millis() as u64 * N_COLLECTIONS + + COLLECTION_INTERVAL.as_millis() as u64 / 2, + ); + // Test that we count successful collections from a target correctly. #[tokio::test] async fn test_self_stat_collection_count() { @@ -692,13 +710,12 @@ mod tests { let _task = tokio::task::spawn(server); // Register the dummy producer. - let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), - kind: Some(ProducerKind::Service), + kind: ProducerKind::Service, address, base_route: String::from("/"), - interval, + interval: COLLECTION_INTERVAL, }; collector .register_producer(endpoint) @@ -708,10 +725,8 @@ mod tests { // Step time until there has been exactly `N_COLLECTIONS` collections. tokio::time::pause(); let now = Instant::now(); - const N_COLLECTIONS: usize = 5; - let wait_for = interval * N_COLLECTIONS as u32 + interval / 2; - while now.elapsed() < wait_for { - tokio::time::advance(interval / 10).await; + while now.elapsed() < TEST_WAIT_PERIOD { + tokio::time::advance(TICK_INTERVAL).await; } // Request the statistics from the task itself. @@ -729,7 +744,7 @@ mod tests { .await .expect("failed to request statistics from task"); let stats = rx.await.expect("failed to receive statistics from task"); - assert_eq!(stats.collections.datum.value(), N_COLLECTIONS as u64); + assert_eq!(stats.collections.datum.value(), N_COLLECTIONS); assert!(stats.failed_collections.is_empty()); logctx.cleanup_successful(); } @@ -751,10 +766,9 @@ mod tests { // Register a bogus producer, which is equivalent to a producer that is // unreachable. - let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), - kind: Some(ProducerKind::Service), + kind: ProducerKind::Service, address: SocketAddr::V6(SocketAddrV6::new( Ipv6Addr::LOCALHOST, 0, @@ -762,7 +776,7 @@ mod tests { 0, )), base_route: String::from("/"), - interval, + interval: COLLECTION_INTERVAL, }; collector .register_producer(endpoint) @@ -772,10 +786,8 @@ mod tests { // Step time until there has been exactly `N_COLLECTIONS` collections. 
tokio::time::pause(); let now = Instant::now(); - const N_COLLECTIONS: usize = 5; - let wait_for = interval * N_COLLECTIONS as u32 + interval / 2; - while now.elapsed() < wait_for { - tokio::time::advance(interval / 10).await; + while now.elapsed() < TEST_WAIT_PERIOD { + tokio::time::advance(TICK_INTERVAL).await; } // Request the statistics from the task itself. @@ -801,7 +813,7 @@ mod tests { .unwrap() .datum .value(), - N_COLLECTIONS as u64 + N_COLLECTIONS, ); assert_eq!(stats.failed_collections.len(), 1); logctx.cleanup_successful(); @@ -840,13 +852,12 @@ mod tests { let _task = tokio::task::spawn(server); // Register the rather flaky producer. - let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), - kind: Some(ProducerKind::Service), + kind: ProducerKind::Service, address, base_route: String::from("/"), - interval, + interval: COLLECTION_INTERVAL, }; collector .register_producer(endpoint) @@ -856,10 +867,8 @@ mod tests { // Step time until there has been exactly `N_COLLECTIONS` collections. tokio::time::pause(); let now = Instant::now(); - const N_COLLECTIONS: usize = 5; - let wait_for = interval * N_COLLECTIONS as u32 + interval / 2; - while now.elapsed() < wait_for { - tokio::time::advance(interval / 10).await; + while now.elapsed() < TEST_WAIT_PERIOD { + tokio::time::advance(TICK_INTERVAL).await; } // Request the statistics from the task itself. @@ -885,7 +894,7 @@ mod tests { .unwrap() .datum .value(), - N_COLLECTIONS as u64 + N_COLLECTIONS, ); assert_eq!(stats.failed_collections.len(), 1); logctx.cleanup_successful(); diff --git a/oximeter/producer/examples/producer.rs b/oximeter/producer/examples/producer.rs index baa4f57bf7..8dbe0b6ad9 100644 --- a/oximeter/producer/examples/producer.rs +++ b/oximeter/producer/examples/producer.rs @@ -125,7 +125,7 @@ async fn main() -> anyhow::Result<()> { registry.register_producer(producer).unwrap(); let server_info = ProducerEndpoint { id: registry.producer_id(), - kind: Some(ProducerKind::Service), + kind: ProducerKind::Service, address: args.address, base_route: "/collect".to_string(), interval: Duration::from_secs(10), diff --git a/package-manifest.toml b/package-manifest.toml index 26c45f0ff7..3bce4aafee 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -384,10 +384,10 @@ only_for_targets.image = "standard" # 3. 
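
// Illustrative sketch of the paused-clock technique used in these tests
// (requires tokio's `test-util` feature; the constants mirror the ones
// defined above): under `tokio::time::pause()`, only `advance()` moves the
// clock, so stepping in ticks much smaller than the collection interval
// lands exactly on the intended wait period.

use std::time::Duration;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    tokio::time::pause();
    let start = tokio::time::Instant::now();
    let wait = Duration::from_millis(5500); // 5 intervals + half an interval
    while start.elapsed() < wait {
        tokio::time::advance(Duration::from_millis(10)).await; // tick
    }
    // Small ticks mean the loop cannot overshoot the wait period.
    assert_eq!(start.elapsed(), wait);
}
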
Use source.type = "manual" instead of "prebuilt" source.type = "prebuilt" source.repo = "crucible" -source.commit = "51a3121c8318fc7ac97d74f917ce1d37962e785f" +source.commit = "945f040d259ca8013d3fb26f510453da7cd7b1a6" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/crucible/image//crucible.sha256.txt -source.sha256 = "897d0fd6c0b82db42256a63a13c228152e1117434afa2681f649b291e3c6f46d" +source.sha256 = "f8c23cbf89fd0bbd928d8e3db1357bbea6e6b50560e221f873da5b56ed9d7527" output.type = "zone" [package.crucible-pantry] @@ -395,10 +395,10 @@ service_name = "crucible_pantry" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "crucible" -source.commit = "51a3121c8318fc7ac97d74f917ce1d37962e785f" +source.commit = "945f040d259ca8013d3fb26f510453da7cd7b1a6" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/crucible/image//crucible-pantry.sha256.txt -source.sha256 = "fe545de7ac4f15454d7827927149c5f0fc68ce9545b4f1ef96aac9ac8039805a" +source.sha256 = "a25b31c81798eb65564dbe259858fdd9715784d212d3508791b1ef0cf6d17da6" output.type = "zone" # Refer to @@ -409,10 +409,10 @@ service_name = "propolis-server" only_for_targets.image = "standard" source.type = "prebuilt" source.repo = "propolis" -source.commit = "54398875a2125227d13827d4236dce943c019b1c" +source.commit = "3e1d129151c3621d28ead5c6e5760693ba6e7fec" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/propolis/image//propolis-server.sha256.txt -source.sha256 = "01b8563db6626f90ee3fb6d97e7921b0a680373d843c1bea7ebf46fcea4f7b28" +source.sha256 = "cd341409eb2ffc3d8bec89fd20cad61d170f89d3adf926f6104eb01f4f4da881" output.type = "zone" [package.mg-ddm-gz] @@ -476,8 +476,8 @@ only_for_targets.image = "standard" # 2. Copy dendrite.tar.gz from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "8ff834e7d0a6adb263240edd40537f2c0768f1a4" -source.sha256 = "c00e79f55e0bdf048069b2d18a4d009ddfef46e7e5d846887cf96e843a8884bd" +source.commit = "2af6adea85c62ac37e451148b84e5eb0ef005f36" +source.sha256 = "dc93b671cce54e83ed55faaa267f81ba9e65abcd6714aa559d68a8783d73b1c1" output.type = "zone" output.intermediate_only = true @@ -501,8 +501,8 @@ only_for_targets.image = "standard" # 2. Copy the output zone image from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "8ff834e7d0a6adb263240edd40537f2c0768f1a4" -source.sha256 = "428cce1e9aa399b1b49c04e7fd0bc1cb0e3f3fae6fda96055892a42e010c9d6f" +source.commit = "2af6adea85c62ac37e451148b84e5eb0ef005f36" +source.sha256 = "c34b10d47fa3eb9f9f6b3655ea4ed8a726f93399ea177efea79f5c89f2ab5a1e" output.type = "zone" output.intermediate_only = true @@ -519,8 +519,8 @@ only_for_targets.image = "standard" # 2. 
Copy dendrite.tar.gz from dendrite/out to omicron/out/dendrite-softnpu.tar.gz source.type = "prebuilt" source.repo = "dendrite" -source.commit = "8ff834e7d0a6adb263240edd40537f2c0768f1a4" -source.sha256 = "5dd3534bec5eb4f857d0bf3994b26650288f650d409eec6aaa29860a2f481c37" +source.commit = "2af6adea85c62ac37e451148b84e5eb0ef005f36" +source.sha256 = "ce7065227c092ee82704f39a966b7441e3ae82d75eedb6eb281bd8b3e5873e32" output.type = "zone" output.intermediate_only = true diff --git a/schema/crdb/16.0.0/up01.sql b/schema/crdb/16.0.0/up01.sql new file mode 100644 index 0000000000..f9806c5917 --- /dev/null +++ b/schema/crdb/16.0.0/up01.sql @@ -0,0 +1,14 @@ +/* + * Previous commits added the optional kind of a producer. In this version, + * we're making the value required and not nullable. We'll first delete all + * records with a NULL kind -- there should not be any, since all producers both + * in an out of tree have been updated. Nonetheless, this is safe because + * currently we're updating offline, and all producers should re-register when + * they are restarted. + * + * NOTE: Full table scans are disallowed, however we don't have an index on + * producer kind (and don't currently need one). Allow full table scans for the + * context of this one statement. + */ +SET LOCAL disallow_full_table_scans = off; +DELETE FROM omicron.public.metric_producer WHERE kind IS NULL; diff --git a/schema/crdb/16.0.0/up02.sql b/schema/crdb/16.0.0/up02.sql new file mode 100644 index 0000000000..9c1ad2ea47 --- /dev/null +++ b/schema/crdb/16.0.0/up02.sql @@ -0,0 +1,4 @@ +/* + * Next, we make the field itself required in the database. + */ +ALTER TABLE IF EXISTS omicron.public.metric_producer ALTER COLUMN kind SET NOT NULL; diff --git a/schema/crdb/17.0.0/up1.sql b/schema/crdb/17.0.0/up1.sql new file mode 100644 index 0000000000..d28d5ca4b5 --- /dev/null +++ b/schema/crdb/17.0.0/up1.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS autoneg BOOL NOT NULL DEFAULT false; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 178c7af913..f4caa2a4e6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1182,7 +1182,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.metric_producer ( id UUID PRIMARY KEY, time_created TIMESTAMPTZ NOT NULL, time_modified TIMESTAMPTZ NOT NULL, - kind omicron.public.producer_kind, + kind omicron.public.producer_kind NOT NULL, ip INET NOT NULL, port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, interval FLOAT NOT NULL, @@ -3000,6 +3000,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.db_metadata ( CHECK (singleton = true) ); +ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS autoneg BOOL NOT NULL DEFAULT false; + INSERT INTO omicron.public.db_metadata ( singleton, time_created, @@ -3007,7 +3009,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '15.0.0', NULL) + ( TRUE, NOW(), NOW(), '17.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/rss-sled-plan.json b/schema/rss-sled-plan.json index 2ce8ae3bdc..5086c38a9c 100644 --- a/schema/rss-sled-plan.json +++ b/schema/rss-sled-plan.json @@ -366,6 +366,7 @@ "type": "object", "required": [ "addresses", + "autoneg", "bgp_peers", "port", "routes", @@ -381,6 +382,10 @@ "$ref": "#/definitions/IpNetwork" } }, + "autoneg": { + "description": "Whether or not to set autonegotiation", + "type": "boolean" + }, "bgp_peers": { "description": "BGP peers on this port", "type": "array", diff 
--git a/sled-agent/src/bootstrap/early_networking.rs b/sled-agent/src/bootstrap/early_networking.rs index bec309dc27..cb411a2546 100644 --- a/sled-agent/src/bootstrap/early_networking.rs +++ b/sled-agent/src/bootstrap/early_networking.rs @@ -548,23 +548,20 @@ impl<'a> EarlyNetworkSetup<'a> { let mut addrs = Vec::new(); for a in &port_config.addresses { + // TODO We're discarding the `uplink_cidr.prefix()` here and only using + // the IP address; at some point we probably need to give the full CIDR + // to dendrite? addrs.push(a.ip()); } - // TODO We're discarding the `uplink_cidr.prefix()` here and only using - // the IP address; at some point we probably need to give the full CIDR - // to dendrite? let link_settings = LinkSettings { - // TODO Allow user to configure link properties - // https://github.com/oxidecomputer/omicron/issues/3061 params: LinkCreate { - autoneg: false, - kr: false, + autoneg: port_config.autoneg, + kr: false, //NOTE: kr does not apply to user configurable links. fec: convert_fec(&port_config.uplink_port_fec), speed: convert_speed(&port_config.uplink_port_speed), lane: Some(LinkId(0)), }, - //addrs: vec![addr], addrs, }; dpd_port_settings.links.insert(link_id.to_string(), link_settings); @@ -866,6 +863,7 @@ mod tests { port: uplink.uplink_port, uplink_port_speed: uplink.uplink_port_speed, uplink_port_fec: uplink.uplink_port_fec, + autoneg: false, bgp_peers: vec![], }], bgp: vec![], diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 4b6b4be541..8038658fb1 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -622,14 +622,9 @@ impl ServiceInner { .collect(), addresses: config.addresses.clone(), switch: config.switch.into(), - uplink_port_speed: config - .uplink_port_speed - .clone() - .into(), - uplink_port_fec: config - .uplink_port_fec - .clone() - .into(), + uplink_port_speed: config.uplink_port_speed.into(), + uplink_port_fec: config.uplink_port_fec.into(), + autoneg: config.autoneg, bgp_peers: config .bgp_peers .iter() diff --git a/sled-agent/src/sim/disk.rs b/sled-agent/src/sim/disk.rs index f131fd2bff..fc388f6ce2 100644 --- a/sled-agent/src/sim/disk.rs +++ b/sled-agent/src/sim/disk.rs @@ -169,7 +169,7 @@ impl SimDisk { let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, - kind: Some(ProducerKind::SledAgent), + kind: ProducerKind::SledAgent, address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_millis(200), diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 1a369570fc..90e9706198 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -507,7 +507,7 @@ impl SledAgent { // Nexus. This should not block progress here. let endpoint = ProducerEndpoint { id: request.body.id, - kind: Some(ProducerKind::SledAgent), + kind: ProducerKind::SledAgent, address: sled_address.into(), base_route: String::from("/metrics/collect"), interval: crate::metrics::METRIC_COLLECTION_INTERVAL, diff --git a/smf/sled-agent/gimlet-standalone/config-rss.toml b/smf/sled-agent/gimlet-standalone/config-rss.toml index 29a7a79eba..f7a93260e3 100644 --- a/smf/sled-agent/gimlet-standalone/config-rss.toml +++ b/smf/sled-agent/gimlet-standalone/config-rss.toml @@ -110,6 +110,8 @@ port = "qsfp0" uplink_port_speed = "40G" # The forward error correction mode for this port. uplink_port_fec="none" +# Do not use autonegotiation +autoneg = false # Switch to use for the uplink. 
For single-rack deployments this can be # "switch0" (upper slot) or "switch1" (lower slot). For single-node softnpu # and dendrite stub environments, use "switch0" diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index fea3cfa5d8..fdc81c0f8f 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -109,7 +109,9 @@ port = "qsfp0" # The speed of this port. uplink_port_speed = "40G" # The forward error correction mode for this port. -uplink_port_fec="none" +uplink_port_fec = "none" +# Do not use autonegotiation +autoneg = false # Switch to use for the uplink. For single-rack deployments this can be # "switch0" (upper slot) or "switch1" (lower slot). For single-node softnpu # and dendrite stub environments, use "switch0" diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index 635e8fde6b..5cfad94c86 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -123,6 +123,15 @@ impl SimulatedSp for Gimlet { handler.update_state.last_rot_update_data() } + async fn last_host_phase1_update_data( + &self, + slot: u16, + ) -> Option> { + let handler = self.handler.as_ref()?; + let handler = handler.lock().await; + handler.update_state.last_host_phase1_update_data(slot) + } + async fn current_update_status(&self) -> gateway_messages::UpdateStatus { let Some(handler) = self.handler.as_ref() else { return gateway_messages::UpdateStatus::None; @@ -1188,7 +1197,7 @@ impl SpHandler for Handler { port: SpPort, component: SpComponent, ) -> Result { - warn!( + debug!( &self.log, "asked for component active slot"; "sender" => %sender, "port" => ?port, @@ -1211,7 +1220,7 @@ impl SpHandler for Handler { slot: u16, persist: bool, ) -> Result<(), SpError> { - warn!( + debug!( &self.log, "asked to set component active slot"; "sender" => %sender, "port" => ?port, @@ -1222,9 +1231,12 @@ impl SpHandler for Handler { if component == SpComponent::ROT { self.rot_active_slot = rot_slot_id_from_u16(slot)?; Ok(()) + } else if component == SpComponent::HOST_CPU_BOOT_FLASH { + self.update_state.set_active_host_slot(slot); + Ok(()) } else { // The real SP returns `RequestUnsupportedForComponent` for anything - // other than the RoT, including SP_ITSELF. + // other than the RoT and host boot flash, including SP_ITSELF. Err(SpError::RequestUnsupportedForComponent) } } diff --git a/sp-sim/src/lib.rs b/sp-sim/src/lib.rs index 0958e8a177..87643af9a8 100644 --- a/sp-sim/src/lib.rs +++ b/sp-sim/src/lib.rs @@ -68,6 +68,12 @@ pub trait SimulatedSp { /// Only returns data after a simulated reset of the RoT. async fn last_rot_update_data(&self) -> Option>; + /// Get the last completed update delivered to the host phase1 flash slot. + async fn last_host_phase1_update_data( + &self, + slot: u16, + ) -> Option>; + /// Get the current update status, just as would be returned by an MGS /// request to get the update status. 
async fn current_update_status(&self) -> gateway_messages::UpdateStatus; diff --git a/sp-sim/src/sidecar.rs b/sp-sim/src/sidecar.rs index 19e84ffc64..1bd6fe4964 100644 --- a/sp-sim/src/sidecar.rs +++ b/sp-sim/src/sidecar.rs @@ -134,6 +134,14 @@ impl SimulatedSp for Sidecar { handler.update_state.last_rot_update_data() } + async fn last_host_phase1_update_data( + &self, + _slot: u16, + ) -> Option> { + // sidecars do not have attached hosts + None + } + async fn current_update_status(&self) -> gateway_messages::UpdateStatus { let Some(handler) = self.handler.as_ref() else { return gateway_messages::UpdateStatus::None; diff --git a/sp-sim/src/update.rs b/sp-sim/src/update.rs index 9879a3ecde..0efa730a26 100644 --- a/sp-sim/src/update.rs +++ b/sp-sim/src/update.rs @@ -2,6 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use std::collections::BTreeMap; use std::io::Cursor; use std::mem; @@ -15,6 +16,8 @@ pub(crate) struct SimSpUpdate { state: UpdateState, last_sp_update_data: Option>, last_rot_update_data: Option>, + last_host_phase1_update_data: BTreeMap>, + active_host_slot: Option, } impl Default for SimSpUpdate { @@ -23,6 +26,13 @@ impl Default for SimSpUpdate { state: UpdateState::NotPrepared, last_sp_update_data: None, last_rot_update_data: None, + last_host_phase1_update_data: BTreeMap::new(), + + // In the real SP, there is always _some_ active host slot. We could + // emulate that by always defaulting to slot 0, but instead we'll + // ensure any tests that expect to read or write a particular slot + // set that slot as active first. + active_host_slot: None, } } } @@ -43,9 +53,20 @@ impl SimSpUpdate { UpdateState::NotPrepared | UpdateState::Aborted(_) | UpdateState::Completed { .. } => { + let slot = if component == SpComponent::HOST_CPU_BOOT_FLASH { + match self.active_host_slot { + Some(slot) => slot, + None => return Err(SpError::InvalidSlotForComponent), + } + } else { + // We don't manage SP or RoT slots, so just use 0 + 0 + }; + self.state = UpdateState::Prepared { component, id, + slot, data: Cursor::new(vec![0u8; total_size].into_boxed_slice()), }; Ok(()) @@ -63,7 +84,7 @@ impl SimSpUpdate { chunk_data: &[u8], ) -> Result<(), SpError> { match &mut self.state { - UpdateState::Prepared { component, id, data } => { + UpdateState::Prepared { component, id, slot, data } => { // Ensure that the update ID and target component are correct. 
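
// Illustrative sketch (hypothetical, condensed) of the per-slot bookkeeping
// this file adds: completed host phase 1 images are stored keyed by the
// host boot flash slot that was active when the update was prepared.

use std::collections::BTreeMap;

#[derive(Default)]
struct Phase1Store {
    images: BTreeMap<u16, Box<[u8]>>,
    active_slot: Option<u16>,
}

impl Phase1Store {
    fn set_active_slot(&mut self, slot: u16) {
        self.active_slot = Some(slot);
    }

    // Called once the final chunk lands; fails if no slot was ever activated.
    fn complete(&mut self, data: Box<[u8]>) -> Result<(), &'static str> {
        let slot = self.active_slot.ok_or("no active host slot set")?;
        self.images.insert(slot, data);
        Ok(())
    }

    fn last_update_data(&self, slot: u16) -> Option<Box<[u8]>> {
        self.images.get(&slot).cloned()
    }
}
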
diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version
index ba4b5a5722..c2dda4dbd0 100644
--- a/tools/dendrite_openapi_version
+++ b/tools/dendrite_openapi_version
@@ -1,2 +1,2 @@
-COMMIT="8ff834e7d0a6adb263240edd40537f2c0768f1a4"
+COMMIT="2af6adea85c62ac37e451148b84e5eb0ef005f36"
 SHA2="07d115bfa8498a8015ca2a8447efeeac32e24aeb25baf3d5e2313216e11293c0"
diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums
index 619a6bf287..77ee198fc5 100644
--- a/tools/dendrite_stub_checksums
+++ b/tools/dendrite_stub_checksums
@@ -1,3 +1,3 @@
-CIDL_SHA256_ILLUMOS="c00e79f55e0bdf048069b2d18a4d009ddfef46e7e5d846887cf96e843a8884bd"
-CIDL_SHA256_LINUX_DPD="b5d829b4628759ac374106f3c56c29074b29577fd0ff72f61c3b8289fea430fe"
-CIDL_SHA256_LINUX_SWADM="afc68828f54dc57b32dc1556fc588baeab12341c30e96cc0fadb49f401b4b48f"
+CIDL_SHA256_ILLUMOS="dc93b671cce54e83ed55faaa267f81ba9e65abcd6714aa559d68a8783d73b1c1"
+CIDL_SHA256_LINUX_DPD="b13b391a085ba6bf16fdd99774f64c9d53cd7220ad518d5839c8558fb925c40c"
+CIDL_SHA256_LINUX_SWADM="6bfa4e367eb2b0be89f1588ac458026a186314597a4feb9fee6cea60101c7ebe"
diff --git a/tools/generate-wicketd-api.sh b/tools/generate-wicketd-api.sh
new file mode 100755
index 0000000000..f1af33aecc
--- /dev/null
+++ b/tools/generate-wicketd-api.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+./target/debug/wicketd openapi > openapi/wicketd.json
diff --git a/tufaceous-lib/Cargo.toml b/tufaceous-lib/Cargo.toml
index 0df3a33f98..aa9a26e3bb 100644
--- a/tufaceous-lib/Cargo.toml
+++ b/tufaceous-lib/Cargo.toml
@@ -33,7 +33,7 @@
 tar.workspace = true
 tokio.workspace = true
 toml.workspace = true
 tough.workspace = true
-url = "2.4.1"
+url = "2.5.0"
 zip.workspace = true
 omicron-workspace-hack.workspace = true
diff --git a/wicket/src/cli/rack_setup/config_template.toml b/wicket/src/cli/rack_setup/config_template.toml
index 617b61fadc..2886fa01d7 100644
--- a/wicket/src/cli/rack_setup/config_template.toml
+++ b/wicket/src/cli/rack_setup/config_template.toml
@@ -65,6 +65,9 @@ uplink_port_speed = ""
 # `none`, `firecode`, or `rs`
 uplink_port_fec = ""
 
+# `true` or `false`
+autoneg = ""
+
 # A list of bgp peers
 # { addr = "1.7.0.1", asn = 47, port = "qsfp0" }
 bgp_peers = []
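The template intentionally leaves `autoneg` as an empty-string placeholder; the config_toml.rs change that follows fills it in with a real boolean via toml_edit. A standalone sketch of that pattern, assuming a toml_edit version (~0.19) where the mutable document type is `Document`:

    use toml_edit::{Document, Formatted, Item, Value};

    fn main() {
        // Parse a template fragment containing the placeholder value.
        let mut doc: Document = "autoneg = \"\"\n".parse().expect("valid TOML");

        // Overwrite the placeholder with a real boolean, as
        // populate_network_table does for each uplink.
        doc["autoneg"] = Item::Value(Value::Boolean(Formatted::new(false)));

        assert_eq!(doc["autoneg"].as_bool(), Some(false));
        println!("{doc}");
    }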
diff --git a/wicket/src/cli/rack_setup/config_toml.rs b/wicket/src/cli/rack_setup/config_toml.rs
index 9b1a25a50e..5a8e8a560e 100644
--- a/wicket/src/cli/rack_setup/config_toml.rs
+++ b/wicket/src/cli/rack_setup/config_toml.rs
@@ -229,6 +229,12 @@ fn populate_network_table(
             );
             _last_key = Some(property);
         }
+        uplink.insert(
+            "autoneg",
+            Item::Value(Value::Boolean(Formatted::new(
+                cfg.autoneg,
+            ))),
+        );
 
         let mut routes = Array::new();
         for r in &cfg.routes {
@@ -449,6 +455,7 @@ mod tests {
                     PortFec::None => InternalPortFec::None,
                     PortFec::Rs => InternalPortFec::Rs,
                 },
+                autoneg: config.autoneg,
                 switch: match config.switch {
                     SwitchLocation::Switch0 => {
                         InternalSwitchLocation::Switch0
@@ -529,6 +536,7 @@ mod tests {
                 }],
                 uplink_port_speed: PortSpeed::Speed400G,
                 uplink_port_fec: PortFec::Firecode,
+                autoneg: true,
                 port: "port0".into(),
                 switch: SwitchLocation::Switch0,
             }],
diff --git a/wicketd/src/preflight_check/uplink.rs b/wicketd/src/preflight_check/uplink.rs
index d94baf1995..25411f17a5 100644
--- a/wicketd/src/preflight_check/uplink.rs
+++ b/wicketd/src/preflight_check/uplink.rs
@@ -775,10 +775,8 @@ fn build_port_settings(
     LinkSettings {
         addrs,
         params: LinkCreate {
-            // TODO we should take these parameters too
-            // https://github.com/oxidecomputer/omicron/issues/3061
-            autoneg: false,
-            kr: false,
+            autoneg: uplink.autoneg,
+            kr: false, //NOTE: kr does not apply to user configurable links
             fec,
             speed,
             lane: Some(LinkId(0)),
diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs
index 0aaea427f3..f654597d81 100644
--- a/wicketd/src/rss_config.rs
+++ b/wicketd/src/rss_config.rs
@@ -548,6 +548,7 @@ fn validate_rack_network_config(
                     PortFec::None => BaPortFec::None,
                     PortFec::Rs => BaPortFec::Rs,
                 },
+                autoneg: config.autoneg,
             })
             .collect(),
         bgp: config