diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index bf29cf4308..21125cf034 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@d79dff47733726f636463323dd2d82724f6c36ba # v2 + uses: taiki-e/install-action@ada21a86dcbd8480ccdd77e11e167f51a002fb3e # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date diff --git a/Cargo.lock b/Cargo.lock index 3f9c9cdb4f..f88c27a948 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1883,9 +1883,9 @@ checksum = "a7993efb860416547839c115490d4951c6d0f8ec04a3594d9dd99d50ed7ec170" [[package]] name = "diesel" -version = "2.1.6" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff236accb9a5069572099f0b350a92e9560e8e63a9b8d546162f4a5e03026bb2" +checksum = "bf97ee7261bb708fa3402fa9c17a54b70e90e3cb98afb3dc8999d5512cb03f94" dependencies = [ "bitflags 2.6.0", "byteorder", @@ -1903,7 +1903,7 @@ dependencies = [ [[package]] name = "diesel-dtrace" version = "0.3.0" -source = "git+https://github.com/oxidecomputer/diesel-dtrace?branch=main#62ef5ca0fe243a0929791bb9efbb7ed9c32c5368" +source = "git+https://github.com/oxidecomputer/diesel-dtrace?branch=main#8fcc2bb37c635598c39711d8034b14227c210096" dependencies = [ "diesel", "serde", @@ -1914,11 +1914,12 @@ dependencies = [ [[package]] name = "diesel_derives" -version = "2.1.4" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14701062d6bed917b5c7103bdffaee1e4609279e240488ad24e7bd979ca6866c" +checksum = "d6ff2be1e7312c858b2ef974f5c7089833ae57b5311b334b30923af58e5718d8" dependencies = [ "diesel_table_macro_syntax", + "dsl_auto_type", "proc-macro2", "quote", "syn 2.0.72", @@ -1926,9 +1927,9 @@ dependencies = [ [[package]] name = "diesel_table_macro_syntax" -version = "0.1.0" +version = "0.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" +checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ "syn 2.0.72", ] @@ -2203,6 +2204,20 @@ dependencies = [ "syn 2.0.72", ] +[[package]] +name = "dsl_auto_type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5d9abe6314103864cc2d8901b7ae224e0ab1a103a0a416661b4097b0779b607" +dependencies = [ + "darling", + "either", + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.72", +] + [[package]] name = "dtrace-parser" version = "0.2.0" @@ -4505,14 +4520,13 @@ dependencies = [ [[package]] name = "mockall" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43766c2b5203b10de348ffe19f7e54564b64f3d6018ff7648d1e2d6d3a0f0a48" +checksum = "d4c28b3fb6d753d28c20e826cd46ee611fda1cf3cde03a443a974043247c065a" dependencies = [ "cfg-if", "downcast", "fragile", - "lazy_static", "mockall_derive", "predicates", "predicates-tree", @@ -4520,9 +4534,9 @@ dependencies = [ [[package]] name = "mockall_derive" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af7cbce79ec385a1d4f54baa90a76401eb15d9cab93685f62e7e9f942aa00ae2" +checksum = "341014e7f530314e9a1fdbc7400b244efea7122662c96bfa248c31da5bfb2020" dependencies = [ "cfg-if", "proc-macro2", @@ -6137,7 +6151,6 @@ dependencies = [ "crossterm", "crypto-common", "der", - "diesel", "digest", "dof", "either", @@ -6162,7 +6175,6 @@ dependencies = [ "hyper 0.14.30", "indexmap 2.3.0", "inout", - "ipnetwork", "itertools 0.10.5", "itertools 0.12.1", "lalrpop-util", @@ -7639,9 +7651,9 @@ dependencies = [ [[package]] name = "proptest" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"31b476131c3c86cb68032fdc5cb6d5a1045e3e42d96b69fa599fd77701e1f5bf" +checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" dependencies = [ "bit-set", "bit-vec", @@ -8742,9 +8754,9 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.204" +version = "1.0.205" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150" dependencies = [ "serde_derive", ] @@ -8780,9 +8792,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.204" +version = "1.0.205" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 01c9eee011..536941a72d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -314,7 +314,8 @@ db-macros = { path = "nexus/db-macros" } debug-ignore = "1.0.5" derive_more = "0.99.18" derive-where = "1.2.7" -diesel = { version = "2.1.6", features = ["postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } +# Having the i-implement-... 
feature here makes diesel go away from the workspace-hack +diesel = { version = "2.2.2", features = ["i-implement-a-third-party-backend-and-opt-into-breaking-changes", "postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } diesel-dtrace = { git = "https://github.com/oxidecomputer/diesel-dtrace", branch = "main" } dns-server = { path = "dns-server" } dns-server-api = { path = "dns-server-api" } @@ -385,7 +386,7 @@ libnvme = { git = "https://github.com/oxidecomputer/libnvme", rev = "dd5bb221d32 linear-map = "1.2.0" macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" -mockall = "0.12" +mockall = "0.13" newtype_derive = "0.1.6" mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "220dd026e83142b83bd93123f465a64dd4600201" } ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "220dd026e83142b83bd93123f465a64dd4600201" } @@ -470,7 +471,7 @@ progenitor-client = { git = "https://github.com/oxidecomputer/progenitor", branc bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "24a74d0c76b6a63961ecef76acb1516b6e66c5c9" } propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "24a74d0c76b6a63961ecef76acb1516b6e66c5c9" } propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "24a74d0c76b6a63961ecef76acb1516b6e66c5c9" } -proptest = "1.4.0" +proptest = "1.5.0" quote = "1.0" rand = "0.8.5" rand_core = "0.6.4" diff --git a/clients/gateway-client/src/lib.rs b/clients/gateway-client/src/lib.rs index ab936de079..332778ef86 100644 --- a/clients/gateway-client/src/lib.rs +++ b/clients/gateway-client/src/lib.rs @@ -64,7 +64,7 @@ progenitor::generate_api!( HostPhase2RecoveryImageId = { derives = [PartialEq, Eq, PartialOrd, Ord] }, ImageVersion = { derives = [PartialEq, Eq, PartialOrd, Ord] }, RotImageDetails = { derives = [PartialEq, Eq, PartialOrd, Ord] }, - RotImageError = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, 
Deserialize] }, + RotImageError = { derives = [ PartialEq, Eq, PartialOrd, Ord] }, RotSlot = { derives = [PartialEq, Eq, PartialOrd, Ord] }, RotState = { derives = [PartialEq, Eq, PartialOrd, Ord] }, SpIdentifier = { derives = [Copy, PartialEq, Hash, Eq] }, diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 162c3f4dbf..b7722144fe 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -122,22 +122,6 @@ impl From for omicron_common::api::internal::nexus::VmmState { } } -impl From - for types::InstanceRuntimeState -{ - fn from( - s: omicron_common::api::internal::nexus::InstanceRuntimeState, - ) -> Self { - Self { - dst_propolis_id: s.dst_propolis_id, - gen: s.gen, - migration_id: s.migration_id, - propolis_id: s.propolis_id, - time_updated: s.time_updated, - } - } -} - impl From for types::VmmRuntimeState { @@ -153,10 +137,10 @@ impl From s: omicron_common::api::internal::nexus::SledInstanceState, ) -> Self { Self { - instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - migration_state: s.migration_state.map(Into::into), + migration_in: s.migration_in.map(Into::into), + migration_out: s.migration_out.map(Into::into), } } } @@ -169,7 +153,6 @@ impl From ) -> Self { Self { migration_id: s.migration_id, - role: s.role.into(), state: s.state.into(), gen: s.gen, time_updated: s.time_updated, @@ -177,18 +160,6 @@ impl From } } -impl From - for types::MigrationRole -{ - fn from(s: omicron_common::api::internal::nexus::MigrationRole) -> Self { - use omicron_common::api::internal::nexus::MigrationRole as Input; - match s { - Input::Source => Self::Source, - Input::Target => Self::Target, - } - } -} - impl From for types::MigrationState { diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 073cb9cfeb..c53b966da6 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -5,6 +5,9 @@ //! 
Interface for making API requests to a Sled Agent use async_trait::async_trait; +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; use std::convert::TryFrom; use uuid::Uuid; @@ -38,6 +41,7 @@ progenitor::generate_api!( replace = { Baseboard = nexus_sled_agent_shared::inventory::Baseboard, ByteCount = omicron_common::api::external::ByteCount, + DatasetKind = omicron_common::api::internal::shared::DatasetKind, DiskIdentity = omicron_common::disk::DiskIdentity, DiskVariant = omicron_common::disk::DiskVariant, Generation = omicron_common::api::external::Generation, @@ -163,10 +167,10 @@ impl From { fn from(s: types::SledInstanceState) -> Self { Self { - instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - migration_state: s.migration_state.map(Into::into), + migration_in: s.migration_in.map(Into::into), + migration_out: s.migration_out.map(Into::into), } } } @@ -178,25 +182,12 @@ impl From Self { migration_id: s.migration_id, state: s.state.into(), - role: s.role.into(), gen: s.gen, time_updated: s.time_updated, } } } -impl From - for omicron_common::api::internal::nexus::MigrationRole -{ - fn from(r: types::MigrationRole) -> Self { - use omicron_common::api::internal::nexus::MigrationRole as Output; - match r { - types::MigrationRole::Source => Output::Source, - types::MigrationRole::Target => Output::Target, - } - } -} - impl From for omicron_common::api::internal::nexus::MigrationState { @@ -458,12 +449,29 @@ impl From /// are bonus endpoints, not generated in the real client. 
#[async_trait] pub trait TestInterfaces { + async fn instance_single_step(&self, id: Uuid); async fn instance_finish_transition(&self, id: Uuid); + async fn instance_simulate_migration_source( + &self, + id: Uuid, + params: SimulateMigrationSource, + ); async fn disk_finish_transition(&self, id: Uuid); } #[async_trait] impl TestInterfaces for Client { + async fn instance_single_step(&self, id: Uuid) { + let baseurl = self.baseurl(); + let client = self.client(); + let url = format!("{}/instances/{}/poke-single-step", baseurl, id); + client + .post(url) + .send() + .await + .expect("instance_single_step() failed unexpectedly"); + } + async fn instance_finish_transition(&self, id: Uuid) { let baseurl = self.baseurl(); let client = self.client(); @@ -485,4 +493,46 @@ impl TestInterfaces for Client { .await .expect("disk_finish_transition() failed unexpectedly"); } + + async fn instance_simulate_migration_source( + &self, + id: Uuid, + params: SimulateMigrationSource, + ) { + let baseurl = self.baseurl(); + let client = self.client(); + let url = format!("{baseurl}/instances/{id}/sim-migration-source"); + client + .post(url) + .json(¶ms) + .send() + .await + .expect("instance_simulate_migration_source() failed unexpectedly"); + } +} + +/// Parameters to the `/instances/{id}/sim-migration-source` test API. +/// +/// This message type is not included in the OpenAPI spec, because this API +/// exists only in test builds. +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SimulateMigrationSource { + /// The ID of the migration out of the instance's current active VMM. + pub migration_id: Uuid, + /// What migration result (success or failure) to simulate. + pub result: SimulatedMigrationResult, +} + +/// The result of a simulated migration out from an instance's current active +/// VMM. +#[derive(Serialize, Deserialize, JsonSchema)] +pub enum SimulatedMigrationResult { + /// Simulate a successful migration out. + Success, + /// Simulate a failed migration out. 
+ /// + /// # Note + /// + /// This is not currently implemented by the simulated sled-agent. + Failure, } diff --git a/common/src/address.rs b/common/src/address.rs index 44942a9854..5ed5689289 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -25,6 +25,9 @@ pub const MAX_PORT: u16 = u16::MAX; /// minimum possible value for a tcp or udp port pub const MIN_PORT: u16 = u16::MIN; +/// The amount of redundancy for boundary NTP servers. +pub const BOUNDARY_NTP_REDUNDANCY: usize = 2; + /// The amount of redundancy for Nexus services. /// /// This is used by both RSS (to distribute the initial set of services) and the diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index d4ed1773f6..7f4eb358a4 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -117,18 +117,38 @@ pub struct VmmRuntimeState { /// specific VMM and the instance it incarnates. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstanceState { - /// The sled's conception of the state of the instance. - pub instance_state: InstanceRuntimeState, - /// The ID of the VMM whose state is being reported. pub propolis_id: PropolisUuid, /// The most recent state of the sled's VMM process. pub vmm_state: VmmRuntimeState, - /// The current state of any in-progress migration for this instance, as - /// understood by this sled. - pub migration_state: Option, + /// The current state of any inbound migration to this VMM. + pub migration_in: Option, + + /// The state of any outbound migration from this VMM. 
+ pub migration_out: Option, +} + +#[derive(Copy, Clone, Debug, Default)] +pub struct Migrations<'state> { + pub migration_in: Option<&'state MigrationRuntimeState>, + pub migration_out: Option<&'state MigrationRuntimeState>, +} + +impl Migrations<'_> { + pub fn empty() -> Self { + Self { migration_in: None, migration_out: None } + } +} + +impl SledInstanceState { + pub fn migrations(&self) -> Migrations<'_> { + Migrations { + migration_in: self.migration_in.as_ref(), + migration_out: self.migration_out.as_ref(), + } + } } /// An update from a sled regarding the state of a migration, indicating the @@ -137,7 +157,6 @@ pub struct SledInstanceState { pub struct MigrationRuntimeState { pub migration_id: Uuid, pub state: MigrationState, - pub role: MigrationRole, pub gen: Generation, /// Timestamp for the migration state update. @@ -192,32 +211,6 @@ impl fmt::Display for MigrationState { } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, -)] -#[serde(rename_all = "snake_case")] -pub enum MigrationRole { - /// This update concerns the source VMM of a migration. - Source, - /// This update concerns the target VMM of a migration. - Target, -} - -impl MigrationRole { - pub fn label(&self) -> &'static str { - match self { - Self::Source => "source", - Self::Target => "target", - } - } -} - -impl fmt::Display for MigrationRole { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(self.label()) - } -} - // Oximeter producer/collector objects. /// The kind of metric producer this is. diff --git a/common/src/api/internal/shared.rs b/common/src/api/internal/shared.rs index 403e0855a3..cd11bfe92a 100644 --- a/common/src/api/internal/shared.rs +++ b/common/src/api/internal/shared.rs @@ -704,13 +704,9 @@ pub struct ResolvedVpcRouteSet { } /// Describes the purpose of the dataset. 
-#[derive( - Debug, JsonSchema, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, EnumCount, -)] -#[serde(tag = "type", rename_all = "snake_case")] +#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, EnumCount)] pub enum DatasetKind { // Durable datasets for zones - #[serde(rename = "cockroachdb")] Cockroach, Crucible, Clickhouse, @@ -720,9 +716,7 @@ pub enum DatasetKind { // Zone filesystems ZoneRoot, - Zone { - name: String, - }, + Zone { name: String }, // Other datasets Debug, @@ -747,6 +741,27 @@ impl<'de> Deserialize<'de> for DatasetKind { } } +impl JsonSchema for DatasetKind { + fn schema_name() -> String { + "DatasetKind".to_string() + } + + fn json_schema( + gen: &mut schemars::gen::SchemaGenerator, + ) -> schemars::schema::Schema { + // The schema is a bit more complicated than this -- it's either one of + // the fixed values or a string starting with "zone/" -- but this is + // good enough for now. + let mut schema = ::json_schema(gen).into_object(); + schema.metadata().description = Some( + "The kind of dataset. See the `DatasetKind` enum \ + in omicron-common for possible values." 
+ .to_owned(), + ); + schema.into() + } +} + impl DatasetKind { pub fn dataset_should_be_encrypted(&self) -> bool { match self { diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 6648c38878..92aa6cf125 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -55,6 +55,7 @@ use nexus_db_model::DnsVersion; use nexus_db_model::DnsZone; use nexus_db_model::ExternalIp; use nexus_db_model::HwBaseboardId; +use nexus_db_model::Image; use nexus_db_model::Instance; use nexus_db_model::InvCollection; use nexus_db_model::InvPhysicalDisk; @@ -91,6 +92,7 @@ use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Asset; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::model::ServiceKind; +use nexus_db_queries::db::pagination::paginated; use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; @@ -486,6 +488,27 @@ struct RegionArgs { enum RegionCommands { /// List regions that are still missing ports ListRegionsMissingPorts, + + /// List all regions + List(RegionListArgs), + + /// Find what is using a region + UsedBy(RegionUsedByArgs), + + /// Find deleted volume regions + FindDeletedVolumeRegions, +} + +#[derive(Debug, Args)] +struct RegionListArgs { + /// Print region IDs only + #[arg(short)] + id_only: bool, +} + +#[derive(Debug, Args)] +struct RegionUsedByArgs { + region_id: Vec, } #[derive(Debug, Args)] @@ -738,6 +761,29 @@ impl DbArgs { DbCommands::Region(RegionArgs { command: RegionCommands::ListRegionsMissingPorts, }) => cmd_db_region_missing_porst(&opctx, &datastore).await, + DbCommands::Region(RegionArgs { + command: RegionCommands::List(region_list_args), + }) => { + cmd_db_region_list( + &datastore, + &self.fetch_opts, + region_list_args, + ) + .await + } + DbCommands::Region(RegionArgs { + command: RegionCommands::UsedBy(region_used_by_args), + }) => { + 
cmd_db_region_used_by( + &datastore, + &self.fetch_opts, + region_used_by_args, + ) + .await + } + DbCommands::Region(RegionArgs { + command: RegionCommands::FindDeletedVolumeRegions, + }) => cmd_db_region_find_deleted(&datastore).await, DbCommands::RegionReplacement(RegionReplacementArgs { command: RegionReplacementCommands::List(args), }) => { @@ -1994,6 +2040,305 @@ async fn cmd_db_region_missing_porst( Ok(()) } +/// List all regions +async fn cmd_db_region_list( + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &RegionListArgs, +) -> Result<(), anyhow::Error> { + use db::schema::region::dsl; + + let regions: Vec = paginated( + dsl::region, + dsl::id, + &first_page::(fetch_opts.fetch_limit), + ) + .select(Region::as_select()) + .load_async(&*datastore.pool_connection_for_tests().await?) + .await?; + + check_limit(®ions, fetch_opts.fetch_limit, || { + String::from("listing regions") + }); + + if args.id_only { + for region in regions { + println!("{}", region.id()); + } + } else { + #[derive(Tabled)] + struct RegionRow { + id: Uuid, + dataset_id: Uuid, + volume_id: Uuid, + block_size: i64, + blocks_per_extent: u64, + extent_count: u64, + read_only: bool, + } + + let rows: Vec<_> = regions + .into_iter() + .map(|region: Region| RegionRow { + id: region.id(), + dataset_id: region.dataset_id(), + volume_id: region.volume_id(), + block_size: region.block_size().into(), + blocks_per_extent: region.blocks_per_extent(), + extent_count: region.extent_count(), + read_only: region.read_only(), + }) + .collect(); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::psql()) + .to_string(); + + println!("{}", table); + } + + Ok(()) +} + +/// Find what is using a region +async fn cmd_db_region_used_by( + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &RegionUsedByArgs, +) -> Result<(), anyhow::Error> { + use db::schema::region::dsl; + + let regions: Vec = paginated( + dsl::region, + dsl::id, + 
&first_page::(fetch_opts.fetch_limit), + ) + .filter(dsl::id.eq_any(args.region_id.clone())) + .select(Region::as_select()) + .load_async(&*datastore.pool_connection_for_tests().await?) + .await?; + + check_limit(®ions, fetch_opts.fetch_limit, || { + String::from("listing regions") + }); + + let volumes: Vec = regions.iter().map(|x| x.volume_id()).collect(); + + let disks_used: Vec = { + let volumes = volumes.clone(); + datastore + .pool_connection_for_tests() + .await? + .transaction_async(|conn| async move { + use db::schema::disk::dsl; + + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + + paginated( + dsl::disk, + dsl::id, + &first_page::(fetch_opts.fetch_limit), + ) + .filter(dsl::volume_id.eq_any(volumes)) + .select(Disk::as_select()) + .load_async(&conn) + .await + }) + .await? + }; + + check_limit(&disks_used, fetch_opts.fetch_limit, || { + String::from("listing disks used") + }); + + let snapshots_used: Vec = { + let volumes = volumes.clone(); + datastore + .pool_connection_for_tests() + .await? + .transaction_async(|conn| async move { + use db::schema::snapshot::dsl; + + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + + paginated( + dsl::snapshot, + dsl::id, + &first_page::(fetch_opts.fetch_limit), + ) + .filter( + dsl::volume_id + .eq_any(volumes.clone()) + .or(dsl::destination_volume_id.eq_any(volumes.clone())), + ) + .select(Snapshot::as_select()) + .load_async(&conn) + .await + }) + .await? + }; + + check_limit(&snapshots_used, fetch_opts.fetch_limit, || { + String::from("listing snapshots used") + }); + + let images_used: Vec = { + let volumes = volumes.clone(); + datastore + .pool_connection_for_tests() + .await? 
+ .transaction_async(|conn| async move { + use db::schema::image::dsl; + + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + + paginated( + dsl::image, + dsl::id, + &first_page::(fetch_opts.fetch_limit), + ) + .filter(dsl::volume_id.eq_any(volumes)) + .select(Image::as_select()) + .load_async(&conn) + .await + }) + .await? + }; + + check_limit(&images_used, fetch_opts.fetch_limit, || { + String::from("listing images used") + }); + + #[derive(Tabled)] + struct RegionRow { + id: Uuid, + volume_id: Uuid, + usage_type: String, + usage_id: String, + usage_name: String, + deleted: bool, + } + + let rows: Vec<_> = regions + .into_iter() + .map(|region: Region| { + if let Some(image) = + images_used.iter().find(|x| x.volume_id == region.volume_id()) + { + RegionRow { + id: region.id(), + volume_id: region.volume_id(), + + usage_type: String::from("image"), + usage_id: image.id().to_string(), + usage_name: image.name().to_string(), + deleted: image.time_deleted().is_some(), + } + } else if let Some(snapshot) = snapshots_used + .iter() + .find(|x| x.volume_id == region.volume_id()) + { + RegionRow { + id: region.id(), + volume_id: region.volume_id(), + + usage_type: String::from("snapshot"), + usage_id: snapshot.id().to_string(), + usage_name: snapshot.name().to_string(), + deleted: snapshot.time_deleted().is_some(), + } + } else if let Some(snapshot) = snapshots_used + .iter() + .find(|x| x.destination_volume_id == region.volume_id()) + { + RegionRow { + id: region.id(), + volume_id: region.volume_id(), + + usage_type: String::from("snapshot dest"), + usage_id: snapshot.id().to_string(), + usage_name: snapshot.name().to_string(), + deleted: snapshot.time_deleted().is_some(), + } + } else if let Some(disk) = + disks_used.iter().find(|x| x.volume_id == region.volume_id()) + { + RegionRow { + id: region.id(), + volume_id: region.volume_id(), + + usage_type: String::from("disk"), + usage_id: disk.id().to_string(), + usage_name: disk.name().to_string(), + deleted: 
disk.time_deleted().is_some(), + } + } else { + RegionRow { + id: region.id(), + volume_id: region.volume_id(), + + usage_type: String::from("unknown!"), + usage_id: String::from(""), + usage_name: String::from(""), + deleted: false, + } + } + }) + .collect(); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::psql()) + .to_string(); + + println!("{}", table); + + Ok(()) +} + +/// Find deleted volume regions +async fn cmd_db_region_find_deleted( + datastore: &DataStore, +) -> Result<(), anyhow::Error> { + let datasets_regions_volumes = + datastore.find_deleted_volume_regions().await?; + + #[derive(Tabled)] + struct Row { + dataset_id: Uuid, + region_id: Uuid, + region_snapshot_id: String, + volume_id: Uuid, + } + + let rows: Vec = datasets_regions_volumes + .into_iter() + .map(|row| { + let (dataset, region, region_snapshot, volume) = row; + + Row { + dataset_id: dataset.id(), + region_id: region.id(), + region_snapshot_id: if let Some(region_snapshot) = + region_snapshot + { + region_snapshot.snapshot_id.to_string() + } else { + String::from("") + }, + volume_id: volume.id(), + } + }) + .collect(); + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::psql()) + .to_string(); + + println!("{}", table); + + Ok(()) +} + /// List all region replacement requests async fn cmd_db_region_replacement_list( datastore: &DataStore, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 8649d15aa6..ec3e519cbc 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -929,6 +929,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { /// number of stale instance metrics that were deleted pruned_instances: usize, + /// update sagas queued due to instance updates. + update_sagas_queued: usize, + /// instance states from completed checks. 
/// /// this is a mapping of stringified instance states to the number @@ -970,6 +973,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { ), Ok(TaskSuccess { total_instances, + update_sagas_queued, pruned_instances, instance_states, failed_checks, @@ -987,7 +991,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { for (state, count) in &instance_states { println!(" -> {count} instances {state}") } - + println!(" update sagas queued: {update_sagas_queued}"); println!(" failed checks: {total_failures}"); for (failure, count) in &failed_checks { println!(" -> {count} {failure}") @@ -1239,11 +1243,6 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } else if name == "lookup_region_port" { match serde_json::from_value::(details.clone()) { - Err(error) => eprintln!( - "warning: failed to interpret task details: {:?}: {:?}", - error, details - ), - Ok(LookupRegionPortStatus { found_port_ok, errors }) => { println!(" total filled in ports: {}", found_port_ok.len()); for line in &found_port_ok { @@ -1255,6 +1254,83 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } } + + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details, + ), + } + } else if name == "instance_updater" { + #[derive(Deserialize)] + struct UpdaterStatus { + /// number of instances found with destroyed active VMMs + destroyed_active_vmms: usize, + + /// number of instances found with terminated active migrations + terminated_active_migrations: usize, + + /// number of update sagas started. + sagas_started: usize, + + /// number of sagas completed successfully + sagas_completed: usize, + + /// number of sagas which failed + sagas_failed: usize, + + /// number of sagas which could not be started + saga_start_failures: usize, + + /// the last error that occurred during execution. 
+ error: Option, + } + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(UpdaterStatus { + destroyed_active_vmms, + terminated_active_migrations, + sagas_started, + sagas_completed, + sagas_failed, + saga_start_failures, + error, + }) => { + if let Some(error) = error { + println!(" task did not complete successfully!"); + println!(" most recent error: {error}"); + } + + println!( + " total instances in need of updates: {}", + destroyed_active_vmms + terminated_active_migrations + ); + println!( + " instances with destroyed active VMMs: {}", + destroyed_active_vmms, + ); + println!( + " instances with terminated active migrations: {}", + terminated_active_migrations, + ); + println!(" update sagas started: {sagas_started}"); + println!( + " update sagas completed successfully: {}", + sagas_completed, + ); + + let total_failed = sagas_failed + saga_start_failures; + if total_failed > 0 { + println!(" unsuccessful update sagas: {total_failed}"); + println!( + " sagas which could not be started: {}", + saga_start_failures + ); + println!(" sagas failed: {sagas_failed}"); + } + } }; } else { println!( diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index a6bf4d4667..67f113a801 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -86,6 +86,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: "instance_watcher" periodically checks instance states @@ -231,6 +235,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: "instance_watcher" periodically checks instance states @@ -363,6 +371,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: 
"instance_watcher" periodically checks instance states diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index cec3fa3052..d4c07899f4 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -287,6 +287,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: "instance_watcher" periodically checks instance states @@ -482,6 +486,17 @@ task: "external_endpoints" TLS certificates: 0 +task: "instance_updater" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total instances in need of updates: 0 + instances with destroyed active VMMs: 0 + instances with terminated active migrations: 0 + update sagas started: 0 + update sagas completed successfully: 0 + task: "instance_watcher" configured period: every s currently executing: no @@ -490,6 +505,7 @@ task: "instance_watcher" total instances checked: 0 checks completed: 0 successful checks: 0 + update sagas queued: 0 failed checks: 0 checks that could not be completed: 0 stale instance metrics pruned: 0 diff --git a/dev-tools/reconfigurator-cli/src/main.rs b/dev-tools/reconfigurator-cli/src/main.rs index 983dde412d..13e4617679 100644 --- a/dev-tools/reconfigurator-cli/src/main.rs +++ b/dev-tools/reconfigurator-cli/src/main.rs @@ -34,6 +34,7 @@ use omicron_common::api::external::Generation; use omicron_common::api::external::Name; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::VnicUuid; use reedline::{Reedline, Signal}; @@ -435,6 +436,8 @@ enum BlueprintEditCommands { }, /// add a CockroachDB instance to a particular sled AddCockroach { sled_id: SledUuid }, + /// expunge a particular zone from a particular sled + ExpungeZone { 
sled_id: SledUuid, zone_id: OmicronZoneUuid }, } #[derive(Debug, Args)] @@ -747,8 +750,8 @@ fn cmd_blueprint_edit( let label = match args.edit_command { BlueprintEditCommands::AddNexus { sled_id } => { - let current = - builder.sled_num_zones_of_kind(sled_id, ZoneKind::Nexus); + let current = builder + .sled_num_running_zones_of_kind(sled_id, ZoneKind::Nexus); let added = builder .sled_ensure_zone_multiple_nexus(sled_id, current + 1) .context("failed to add Nexus zone")?; @@ -759,8 +762,8 @@ fn cmd_blueprint_edit( format!("added Nexus zone to sled {}", sled_id) } BlueprintEditCommands::AddCockroach { sled_id } => { - let current = - builder.sled_num_zones_of_kind(sled_id, ZoneKind::CockroachDb); + let current = builder + .sled_num_running_zones_of_kind(sled_id, ZoneKind::CockroachDb); let added = builder .sled_ensure_zone_multiple_cockroachdb(sled_id, current + 1) .context("failed to add CockroachDB zone")?; @@ -770,9 +773,25 @@ fn cmd_blueprint_edit( ); format!("added CockroachDB zone to sled {}", sled_id) } + BlueprintEditCommands::ExpungeZone { sled_id, zone_id } => { + builder + .sled_expunge_zone(sled_id, zone_id) + .context("failed to expunge zone")?; + format!("expunged zone {zone_id} from sled {sled_id}") + } }; - let new_blueprint = builder.build(); + let mut new_blueprint = builder.build(); + + // Normally `builder.build()` would construct the cockroach fingerprint + // based on what we read from CRDB and put into the planning input, but + // since we don't have a CRDB we had to make something up for our planning + // input's CRDB fingerprint. In the absense of a better alternative, we'll + // just copy our parent's CRDB fingerprint and carry it forward. 
+ new_blueprint + .cockroachdb_fingerprint + .clone_from(&blueprint.cockroachdb_fingerprint); + let rv = format!( "blueprint {} created from blueprint {}: {}", new_blueprint.id, blueprint_id, label diff --git a/illumos-utils/src/ipadm.rs b/illumos-utils/src/ipadm.rs index 70662b6ccd..d5e0053ba9 100644 --- a/illumos-utils/src/ipadm.rs +++ b/illumos-utils/src/ipadm.rs @@ -7,17 +7,31 @@ use crate::addrobj::{IPV6_LINK_LOCAL_ADDROBJ_NAME, IPV6_STATIC_ADDROBJ_NAME}; use crate::zone::IPADM; use crate::{execute, ExecutionError, PFEXEC}; -use std::net::Ipv6Addr; +use oxnet::IpNet; +use std::net::{IpAddr, Ipv6Addr}; /// Wraps commands for interacting with interfaces. pub struct Ipadm {} /// Expected error message contents when showing an addrobj that doesn't exist. -const ADDROBJ_NOT_FOUND_ERR: &str = "Address object not found"; +// The message changed to be consistent regardless of the state of the +// system in illumos 16677. It is now always `ERR1` below. Prior to that, it +// would most often be `ERR2` but could sometimes be blank or `ERR1`. +const ADDROBJ_NOT_FOUND_ERR1: &str = "address: Object not found"; +const ADDROBJ_NOT_FOUND_ERR2: &str = "Address object not found"; /// Expected error message when an interface already exists. const INTERFACE_ALREADY_EXISTS: &str = "Interface already exists"; +/// Expected error message when an addrobj already exists. +const ADDROBJ_ALREADY_EXISTS: &str = "Address object already exists"; + +pub enum AddrObjType { + DHCP, + AddrConf, + Static(IpAddr), +} + #[cfg_attr(any(test, feature = "testing"), mockall::automock)] impl Ipadm { /// Ensure that an IP interface exists on the provided datalink. @@ -37,6 +51,96 @@ impl Ipadm { } } + /// Create an address object with the provided parameters. If an object + /// with the requested name already exists, return success. Note that in + /// this case, the existing object is not checked to ensure it is + /// consistent with the provided parameters. 
+ pub fn ensure_ip_addrobj_exists( + addrobj: &str, + addrtype: AddrObjType, + ) -> Result<(), ExecutionError> { + let mut cmd = std::process::Command::new(PFEXEC); + let cmd = cmd.args(&[IPADM, "create-addr", "-t", "-T"]); + let cmd = match addrtype { + AddrObjType::DHCP => cmd.args(&["dhcp"]), + AddrObjType::AddrConf => cmd.args(&["addrconf"]), + AddrObjType::Static(addr) => { + cmd.args(&["static", "-a", &addr.to_string()]) + } + }; + let cmd = cmd.arg(&addrobj); + match execute(cmd) { + Ok(_) => Ok(()), + Err(ExecutionError::CommandFailure(info)) + if info.stderr.contains(ADDROBJ_ALREADY_EXISTS) => + { + Ok(()) + } + Err(e) => Err(e), + } + } + + /// Remove any scope from an IPv6 address. + /// e.g. fe80::8:20ff:fed0:8687%oxControlService1/10 -> + /// fe80::8:20ff:fed0:8687/10 + fn remove_addr_scope(input: &str) -> String { + if let Some(pos) = input.find('%') { + let (base, rest) = input.split_at(pos); + if let Some(slash_pos) = rest.find('/') { + format!("{}{}", base, &rest[slash_pos..]) + } else { + base.to_string() + } + } else { + input.to_string() + } + } + + /// Return the IP network associated with an address object, or None if + /// there is no address object with this name. + pub fn addrobj_addr( + addrobj: &str, + ) -> Result, ExecutionError> { + // Note that additional privileges are not required to list address + // objects, and so there is no `pfexec` here. + let mut cmd = std::process::Command::new(IPADM); + let cmd = cmd.args(&["show-addr", "-po", "addr", addrobj]); + match execute(cmd) { + Err(ExecutionError::CommandFailure(info)) + if [ADDROBJ_NOT_FOUND_ERR1, ADDROBJ_NOT_FOUND_ERR2] + .iter() + .any(|&ss| info.stderr.contains(ss)) => + { + // The address object does not exist. 
+ Ok(None) + } + Err(e) => Err(e), + Ok(output) => { + let out = std::str::from_utf8(&output.stdout).map_err(|e| { + let s = String::from_utf8_lossy(&output.stdout); + ExecutionError::ParseFailure(format!("{}: {}", e, s)) + })?; + let lines: Vec<_> = out.trim().lines().collect(); + if lines.is_empty() { + return Ok(None); + } + match Self::remove_addr_scope(lines[0].trim()).parse() { + Ok(ipnet) => Ok(Some(ipnet)), + Err(e) => Err(ExecutionError::ParseFailure(format!( + "{}: {}", + lines[0].trim(), + e + ))), + } + } + } + } + + /// Determine if a named address object exists + pub fn addrobj_exists(addrobj: &str) -> Result { + Ok(Self::addrobj_addr(addrobj)?.is_some()) + } + // Set MTU to 9000 on both IPv4 and IPv6 pub fn set_interface_mtu(datalink: &str) -> Result<(), ExecutionError> { let mut cmd = std::process::Command::new(PFEXEC); @@ -71,53 +175,18 @@ impl Ipadm { datalink: &str, listen_addr: &Ipv6Addr, ) -> Result<(), ExecutionError> { - // Create auto-configured address on the IP interface if it doesn't already exist + // Create auto-configured address on the IP interface if it doesn't + // already exist let addrobj = format!("{}/{}", datalink, IPV6_LINK_LOCAL_ADDROBJ_NAME); - let mut cmd = std::process::Command::new(PFEXEC); - let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]); - match execute(cmd) { - Err(ExecutionError::CommandFailure(info)) - if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) => - { - let mut cmd = std::process::Command::new(PFEXEC); - let cmd = cmd.args(&[ - IPADM, - "create-addr", - "-t", - "-T", - "addrconf", - &addrobj, - ]); - execute(cmd)?; - } - Err(other) => return Err(other), - Ok(_) => (), - }; + Self::ensure_ip_addrobj_exists(&addrobj, AddrObjType::AddrConf)?; // Create static address on the IP interface if it doesn't already exist let addrobj = format!("{}/{}", datalink, IPV6_STATIC_ADDROBJ_NAME); - let mut cmd = std::process::Command::new(PFEXEC); - let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]); - match execute(cmd) { - 
Err(ExecutionError::CommandFailure(info)) - if info.stderr.contains(ADDROBJ_NOT_FOUND_ERR) => - { - let mut cmd = std::process::Command::new(PFEXEC); - let cmd = cmd.args(&[ - IPADM, - "create-addr", - "-t", - "-T", - "static", - "-a", - &listen_addr.to_string(), - &addrobj, - ]); - execute(cmd).map(|_| ()) - } - Err(other) => Err(other), - Ok(_) => Ok(()), - } + Self::ensure_ip_addrobj_exists( + &addrobj, + AddrObjType::Static((*listen_addr).into()), + )?; + Ok(()) } // Create gateway on the IP interface if it doesn't already exist @@ -125,23 +194,7 @@ impl Ipadm { opte_iface: &String, ) -> Result<(), ExecutionError> { let addrobj = format!("{}/public", opte_iface); - let mut cmd = std::process::Command::new(PFEXEC); - let cmd = cmd.args(&[IPADM, "show-addr", &addrobj]); - match execute(cmd) { - Err(_) => { - let mut cmd = std::process::Command::new(PFEXEC); - let cmd = cmd.args(&[ - IPADM, - "create-addr", - "-t", - "-T", - "dhcp", - &addrobj, - ]); - execute(cmd)?; - } - Ok(_) => (), - }; + Self::ensure_ip_addrobj_exists(&addrobj, AddrObjType::DHCP)?; Ok(()) } } diff --git a/illumos-utils/src/lib.rs b/illumos-utils/src/lib.rs index 7140c62981..48a5767f41 100644 --- a/illumos-utils/src/lib.rs +++ b/illumos-utils/src/lib.rs @@ -64,6 +64,9 @@ pub enum ExecutionError { #[error("Failed to manipulate process contract: {err}")] ContractFailure { err: std::io::Error }, + #[error("Failed to parse command output")] + ParseFailure(String), + #[error("Zone is not running")] NotRunning, } diff --git a/internal-dns/src/config.rs b/internal-dns/src/config.rs index 43d6c96d2d..a9ff664030 100644 --- a/internal-dns/src/config.rs +++ b/internal-dns/src/config.rs @@ -60,7 +60,7 @@ //! //! This module provides types used to assemble that configuration. 
-use crate::names::{ServiceName, DNS_ZONE}; +use crate::names::{ServiceName, BOUNDARY_NTP_DNS_NAME, DNS_ZONE}; use anyhow::{anyhow, ensure}; use core::fmt; use dns_service_client::types::{DnsConfigParams, DnsConfigZone, DnsRecord}; @@ -407,6 +407,27 @@ impl DnsConfigBuilder { (name, vec![DnsRecord::Aaaa(sled_ip)]) }); + // Assemble the special boundary NTP name to support chrony on internal + // NTP zones. + // + // We leave this as `None` if there are no `BoundaryNtp` service zones, + // which omits it from the final set of records. + let boundary_ntp_records = self + .service_instances_zones + .get(&ServiceName::BoundaryNtp) + .map(|zone2port| { + let records = zone2port + .iter() + .map(|(zone, _port)| { + let zone_ip = self.zones.get(&zone).expect( + "service_backend_zone() ensures zones are defined", + ); + DnsRecord::Aaaa(*zone_ip) + }) + .collect::>(); + (BOUNDARY_NTP_DNS_NAME.to_string(), records) + }); + // Assemble the set of AAAA records for zones. let zone_records = self.zones.into_iter().map(|(zone, zone_ip)| { (zone.dns_name(), vec![DnsRecord::Aaaa(zone_ip)]) @@ -454,6 +475,7 @@ impl DnsConfigBuilder { let all_records = sled_records .chain(zone_records) + .chain(boundary_ntp_records) .chain(srv_records_sleds) .chain(srv_records_zones) .collect(); @@ -593,6 +615,11 @@ mod test { b.service_backend_zone(ServiceName::Oximeter, &zone2, 125).unwrap(); b.service_backend_zone(ServiceName::Oximeter, &zone3, 126).unwrap(); + // Add a boundary NTP service to one of the zones; this will also + // populate the special `BOUNDARY_NTP_DNS_NAME`. 
+ b.service_backend_zone(ServiceName::BoundaryNtp, &zone2, 127) + .unwrap(); + // A sharded service b.service_backend_sled( ServiceName::SledAgent(sled1_uuid), diff --git a/internal-dns/src/names.rs b/internal-dns/src/names.rs index 3017d3b3fc..f975029d69 100644 --- a/internal-dns/src/names.rs +++ b/internal-dns/src/names.rs @@ -6,6 +6,13 @@ use omicron_uuid_kinds::{OmicronZoneUuid, SledUuid}; +/// Name for the special boundary NTP DNS name +/// +/// chrony does not support SRV records. This name resolves to AAAA records for +/// each boundary NTP zone, and then we can point internal NTP chrony instances +/// at this name for it to find the boundary NTP zones. +pub const BOUNDARY_NTP_DNS_NAME: &str = "boundary-ntp"; + /// Name for the control plane DNS zone pub const DNS_ZONE: &str = "control-plane.oxide.internal"; diff --git a/internal-dns/tests/output/internal-dns-zone.txt b/internal-dns/tests/output/internal-dns-zone.txt index e8c3f01b05..d87805f677 100644 --- a/internal-dns/tests/output/internal-dns-zone.txt +++ b/internal-dns/tests/output/internal-dns-zone.txt @@ -68,6 +68,17 @@ builder: "non_trivial" "data": "::1:4" } ], + "_boundary-ntp._tcp": [ + { + "type": "SRV", + "data": { + "port": 127, + "prio": 0, + "target": "001de000-c04e-4000-8000-000000000002.host.control-plane.oxide.internal", + "weight": 0 + } + } + ], "_nexus._tcp": [ { "type": "SRV", @@ -118,5 +129,11 @@ builder: "non_trivial" "weight": 0 } } + ], + "boundary-ntp": [ + { + "type": "AAAA", + "data": "::1:2" + } ] } diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 6e9d6b0cf0..9d8bf1ac9b 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -379,6 +379,8 @@ pub struct BackgroundTaskConfig { pub region_replacement_driver: RegionReplacementDriverConfig, /// configuration for instance watcher task pub instance_watcher: InstanceWatcherConfig, + /// configuration for instance updater task + pub instance_updater: 
InstanceUpdaterConfig, /// configuration for service VPC firewall propagation task pub service_firewall_propagation: ServiceFirewallPropagationConfig, /// configuration for v2p mapping propagation task @@ -560,6 +562,23 @@ pub struct InstanceWatcherConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct InstanceUpdaterConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, + + /// disable background checks for instances in need of updates. + /// + /// This config is intended for use in testing, and should generally not be + /// enabled in real life. + /// + /// Default: Off + #[serde(default)] + pub disable: bool, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct ServiceFirewallPropagationConfig { @@ -848,6 +867,8 @@ mod test { region_replacement.period_secs = 30 region_replacement_driver.period_secs = 30 instance_watcher.period_secs = 30 + instance_updater.period_secs = 30 + instance_updater.disable = false service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 @@ -995,6 +1016,10 @@ mod test { instance_watcher: InstanceWatcherConfig { period_secs: Duration::from_secs(30), }, + instance_updater: InstanceUpdaterConfig { + period_secs: Duration::from_secs(30), + disable: false, + }, service_firewall_propagation: ServiceFirewallPropagationConfig { period_secs: Duration::from_secs(300), @@ -1081,6 +1106,7 @@ mod test { region_replacement.period_secs = 30 region_replacement_driver.period_secs = 30 instance_watcher.period_secs = 30 + instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/db-model/src/instance_state.rs b/nexus/db-model/src/instance_state.rs index 
673b06e2cd..5925e92ae0 100644 --- a/nexus/db-model/src/instance_state.rs +++ b/nexus/db-model/src/instance_state.rs @@ -59,3 +59,8 @@ impl From for omicron_common::api::external::InstanceState { } } } + +impl diesel::query_builder::QueryId for InstanceStateEnum { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index f28f886f6c..82f4b78fa8 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -72,6 +72,8 @@ mod region; mod region_replacement; mod region_replacement_step; mod region_snapshot; +mod region_snapshot_replacement; +mod region_snapshot_replacement_step; mod role_assignment; mod role_builtin; pub mod saga_types; @@ -174,6 +176,8 @@ pub use region::*; pub use region_replacement::*; pub use region_replacement_step::*; pub use region_snapshot::*; +pub use region_snapshot_replacement::*; +pub use region_snapshot_replacement_step::*; pub use role_assignment::*; pub use role_builtin::*; pub use saga_types::*; diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index 4e3ca1b35d..d7c18ae5dd 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -89,4 +89,22 @@ impl Migration { time_target_updated: None, } } + + /// Returns `true` if either side reports that the migration is in a + /// terminal state. + pub fn is_terminal(&self) -> bool { + self.source_state.is_terminal() || self.target_state.is_terminal() + } + + /// Returns `true` if either side of the migration has failed. + pub fn either_side_failed(&self) -> bool { + self.source_state == MigrationState::FAILED + || self.target_state == MigrationState::FAILED + } + + /// Returns `true` if either side of the migration has completed. 
+ pub fn either_side_completed(&self) -> bool { + self.source_state == MigrationState::COMPLETED + || self.target_state == MigrationState::COMPLETED + } } diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs index 694198eb56..e1662f2c28 100644 --- a/nexus/db-model/src/migration_state.rs +++ b/nexus/db-model/src/migration_state.rs @@ -28,6 +28,18 @@ impl_enum_wrapper!( ); impl MigrationState { + pub const COMPLETED: MigrationState = + MigrationState(nexus::MigrationState::Completed); + pub const FAILED: MigrationState = + MigrationState(nexus::MigrationState::Failed); + pub const PENDING: MigrationState = + MigrationState(nexus::MigrationState::Pending); + pub const IN_PROGRESS: MigrationState = + MigrationState(nexus::MigrationState::InProgress); + + pub const TERMINAL_STATES: &'static [MigrationState] = + &[Self::COMPLETED, Self::FAILED]; + /// Returns `true` if this migration state means that the migration is no /// longer in progress (it has either succeeded or failed). 
#[must_use] diff --git a/nexus/db-model/src/region_replacement.rs b/nexus/db-model/src/region_replacement.rs index 9ae64d6d38..51570cf7f7 100644 --- a/nexus/db-model/src/region_replacement.rs +++ b/nexus/db-model/src/region_replacement.rs @@ -29,6 +29,7 @@ impl_enum_type!( Complete => b"complete" ); +// FromStr impl required for use with clap (aka omdb) impl std::str::FromStr for RegionReplacementState { type Err = String; diff --git a/nexus/db-model/src/region_snapshot.rs b/nexus/db-model/src/region_snapshot.rs index 2ea59f99f0..1b39a5b6f4 100644 --- a/nexus/db-model/src/region_snapshot.rs +++ b/nexus/db-model/src/region_snapshot.rs @@ -40,3 +40,22 @@ pub struct RegionSnapshot { #[serde(default)] pub deleting: bool, } + +impl RegionSnapshot { + pub fn new( + dataset_id: Uuid, + region_id: Uuid, + snapshot_id: Uuid, + snapshot_addr: String, + ) -> Self { + RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + snapshot_addr, + + volume_references: 0, + deleting: false, + } + } +} diff --git a/nexus/db-model/src/region_snapshot_replacement.rs b/nexus/db-model/src/region_snapshot_replacement.rs new file mode 100644 index 0000000000..183c9034c0 --- /dev/null +++ b/nexus/db-model/src/region_snapshot_replacement.rs @@ -0,0 +1,161 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use super::impl_enum_type; +use crate::schema::region_snapshot_replacement; +use crate::RegionSnapshot; +use chrono::DateTime; +use chrono::Utc; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "region_snapshot_replacement_state", schema = "public"))] + pub struct RegionSnapshotReplacementStateEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = RegionSnapshotReplacementStateEnum)] + pub enum RegionSnapshotReplacementState; + + // Enum values + Requested => b"requested" + Allocating => b"allocating" + ReplacementDone => b"replacement_done" + DeletingOldVolume => b"deleting_old_volume" + Running => b"running" + Complete => b"complete" +); + +// FromStr impl required for use with clap (aka omdb) +impl std::str::FromStr for RegionSnapshotReplacementState { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "requested" => Ok(RegionSnapshotReplacementState::Requested), + "allocating" => Ok(RegionSnapshotReplacementState::Allocating), + "replacement_done" => { + Ok(RegionSnapshotReplacementState::ReplacementDone) + } + "deleting_old_volume" => { + Ok(RegionSnapshotReplacementState::DeletingOldVolume) + } + "running" => Ok(RegionSnapshotReplacementState::Running), + "complete" => Ok(RegionSnapshotReplacementState::Complete), + _ => Err(format!("unrecognized value {} for enum", s)), + } + } +} + +/// Database representation of a RegionSnapshot replacement request. +/// +/// This record stores the data related to the operations required for Nexus to +/// orchestrate replacing a region snapshot. 
It transitions through the +/// following states: +/// +/// ```text +/// Requested <-- --- +/// | | +/// | | | +/// v | | responsibility of region snapshot +/// | | replacement start saga +/// Allocating -- | +/// | +/// | | +/// v --- +/// --- +/// ReplacementDone <-- | +/// | | +/// | | | +/// v | | responsibility of region snapshot +/// | | replacement garbage collect saga +/// DeletingOldVolume -- | +/// | +/// | | +/// v --- +/// --- +/// Running | +/// | set in region snapshot replacement +/// | | finish background task +/// v | +/// | +/// Complete --- +/// ``` +/// +/// which are captured in the RegionSnapshotReplacementState enum. Annotated on +/// the right are which sagas are responsible for which state transitions. The +/// state transitions themselves are performed by these sagas and all involve a +/// query that: +/// +/// - checks that the starting state (and other values as required) make sense +/// - updates the state while setting a unique operating_saga_id id (and any +/// other fields as appropriate) +/// +/// As multiple background tasks will be waking up, checking to see what sagas +/// need to be triggered, and requesting that these region snapshot replacement +/// sagas run, this is meant to block multiple sagas from running at the same +/// time in an effort to cut down on interference - most will unwind at the +/// first step of performing this state transition instead of somewhere in the +/// middle. 
+/// +/// See also: RegionSnapshotReplacementStep records +#[derive( + Queryable, + Insertable, + Debug, + Clone, + Selectable, + Serialize, + Deserialize, + PartialEq, +)] +#[diesel(table_name = region_snapshot_replacement)] +pub struct RegionSnapshotReplacement { + pub id: Uuid, + + pub request_time: DateTime, + + // These are a copy of fields from the corresponding region snapshot record + pub old_dataset_id: Uuid, + pub old_region_id: Uuid, + pub old_snapshot_id: Uuid, + + /// A synthetic volume that only is used to later delete the old snapshot + pub old_snapshot_volume_id: Option, + + pub new_region_id: Option, + + pub replacement_state: RegionSnapshotReplacementState, + + pub operating_saga_id: Option, +} + +impl RegionSnapshotReplacement { + pub fn for_region_snapshot(region_snapshot: &RegionSnapshot) -> Self { + Self::new( + region_snapshot.dataset_id, + region_snapshot.region_id, + region_snapshot.snapshot_id, + ) + } + + pub fn new( + old_dataset_id: Uuid, + old_region_id: Uuid, + old_snapshot_id: Uuid, + ) -> Self { + Self { + id: Uuid::new_v4(), + request_time: Utc::now(), + old_dataset_id, + old_region_id, + old_snapshot_id, + old_snapshot_volume_id: None, + new_region_id: None, + replacement_state: RegionSnapshotReplacementState::Requested, + operating_saga_id: None, + } + } +} diff --git a/nexus/db-model/src/region_snapshot_replacement_step.rs b/nexus/db-model/src/region_snapshot_replacement_step.rs new file mode 100644 index 0000000000..3c9a60056e --- /dev/null +++ b/nexus/db-model/src/region_snapshot_replacement_step.rs @@ -0,0 +1,115 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use super::impl_enum_type; +use crate::schema::region_snapshot_replacement_step; +use chrono::DateTime; +use chrono::Utc; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel(postgres_type(name = "region_snapshot_replacement_step_state", schema = "public"))] + pub struct RegionSnapshotReplacementStepStateEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + #[diesel(sql_type = RegionSnapshotReplacementStepStateEnum)] + pub enum RegionSnapshotReplacementStepState; + + // Enum values + Requested => b"requested" + Running => b"running" + Complete => b"complete" + VolumeDeleted => b"volume_deleted" +); + +// FromStr impl required for use with clap (aka omdb) +impl std::str::FromStr for RegionSnapshotReplacementStepState { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "requested" => Ok(RegionSnapshotReplacementStepState::Requested), + "running" => Ok(RegionSnapshotReplacementStepState::Running), + "complete" => Ok(RegionSnapshotReplacementStepState::Complete), + "volume_deleted" => { + Ok(RegionSnapshotReplacementStepState::VolumeDeleted) + } + _ => Err(format!("unrecognized value {} for enum", s)), + } + } +} + +/// Database representation of a RegionSnapshot replacement update step. +/// +/// During region snapshot replacement, after the read-only target has been +/// replaced in the associate snapshot volume's construction request, Nexus +/// needs to update each running Upstairs that constructed an Upstairs using +/// that old target. Each volume that needs updating is recorded as a region +/// snapshot replacement step record. The region snapshot replacement finish +/// saga can be run when all region snapshot replacement steps are completed. 
+/// This record transitions through the following states: +/// +/// ```text +/// Requested <-- --- +/// | | +/// | | | +/// v | | responsibility of region snapshot +/// | | replacement step saga +/// Running -- | +/// | +/// | | +/// v | +/// --- +/// Complete --- +/// | +/// | | responsibility of region snapshot +/// v | replacement step garbage collect saga +/// | +/// VolumeDeleted --- +/// ``` +/// +/// See also: RegionSnapshotReplacement records +#[derive( + Queryable, + Insertable, + Debug, + Clone, + Selectable, + Serialize, + Deserialize, + PartialEq, +)] +#[diesel(table_name = region_snapshot_replacement_step)] +pub struct RegionSnapshotReplacementStep { + pub id: Uuid, + + pub request_id: Uuid, + pub request_time: DateTime, + + /// A volume that references the snapshot + pub volume_id: Uuid, + + /// A synthetic volume that only is used to later delete the old snapshot + pub old_snapshot_volume_id: Option, + + pub replacement_state: RegionSnapshotReplacementStepState, + + pub operating_saga_id: Option, +} + +impl RegionSnapshotReplacementStep { + pub fn new(request_id: Uuid, volume_id: Uuid) -> Self { + Self { + id: Uuid::new_v4(), + request_id, + request_time: Utc::now(), + volume_id, + old_snapshot_volume_id: None, + replacement_state: RegionSnapshotReplacementStepState::Requested, + operating_saga_id: None, + } + } +} diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index d76d380afb..d270d10959 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -425,6 +425,8 @@ table! { } } +joinable!(instance -> vmm (active_propolis_id)); + table! { vmm (id) { id -> Uuid, @@ -1771,6 +1773,40 @@ table! { } } +table! 
{ + region_snapshot_replacement (id) { + id -> Uuid, + request_time -> Timestamptz, + old_dataset_id -> Uuid, + old_region_id -> Uuid, + old_snapshot_id -> Uuid, + old_snapshot_volume_id -> Nullable, + new_region_id -> Nullable, + replacement_state -> crate::RegionSnapshotReplacementStateEnum, + operating_saga_id -> Nullable, + } +} + +allow_tables_to_appear_in_same_query!(zpool, region_snapshot); + +table! { + region_snapshot_replacement_step (id) { + id -> Uuid, + request_id -> Uuid, + request_time -> Timestamptz, + volume_id -> Uuid, + old_snapshot_volume_id -> Nullable, + replacement_state -> crate::RegionSnapshotReplacementStepStateEnum, + operating_saga_id -> Nullable, + } +} + +allow_tables_to_appear_in_same_query!( + region_snapshot_replacement, + region_snapshot_replacement_step, + volume +); + table! { db_metadata (singleton) { singleton -> Bool, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 921123e41f..ade8e8d2ae 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(87, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(88, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,8 +29,9 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), - KnownVersion::new(87, "inv-dataset"), - KnownVersion::new(86, "dataset-kinds-zone-and-debug"), + KnownVersion::new(88, "inv-dataset"), + KnownVersion::new(87, "dataset-kinds-zone-and-debug"), + KnownVersion::new(86, "snapshot-replacement"), KnownVersion::new(85, "add-migrations-by-time-created-index"), KnownVersion::new(84, "region-read-only"), KnownVersion::new(83, "dataset-address-optional"), diff --git a/nexus/db-model/src/tuf_repo.rs b/nexus/db-model/src/tuf_repo.rs index 4a64566a62..6f5a898a2d 100644 --- a/nexus/db-model/src/tuf_repo.rs +++ b/nexus/db-model/src/tuf_repo.rs @@ -307,7 +307,10 @@ impl FromSql for ArtifactHash { fn from_sql( bytes: diesel::pg::PgValue<'_>, ) -> diesel::deserialize::Result { - let s = String::from_sql(bytes)?; + let s = + >::from_sql( + bytes, + )?; ExternalArtifactHash::from_str(&s) .map(ArtifactHash) .map_err(|e| e.into()) diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index 121daaf7dd..7d44bbedbd 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -8,7 +8,7 @@ use serde::Serialize; use std::fmt; impl_enum_type!( - #[derive(SqlType, Debug)] + #[derive(SqlType, Debug, Clone)] #[diesel(postgres_type(name = "vmm_state", schema = "public"))] pub struct VmmStateEnum; @@ -41,6 +41,11 @@ impl VmmState { VmmState::SagaUnwound => "saga_unwound", } } + + /// States in which it is safe to deallocate a VMM's sled resources and mark + /// it as deleted. 
+ pub const DESTROYABLE_STATES: &'static [Self] = + &[Self::Destroyed, Self::SagaUnwound]; } impl fmt::Display for VmmState { @@ -119,3 +124,8 @@ impl From for omicron_common::api::external::InstanceState { } } } + +impl diesel::query_builder::QueryId for VmmStateEnum { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 617413f172..d413f9507a 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -14,6 +14,7 @@ use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::DbConnection; use crate::db::TransactionError; +use crate::transaction_retry::OptionalError; use anyhow::Context; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; @@ -46,6 +47,7 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintPhysicalDisksConfig; use nexus_types::deployment::BlueprintTarget; +use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZonesConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::external_api::views::SledState; @@ -60,6 +62,8 @@ use omicron_uuid_kinds::SledUuid; use std::collections::BTreeMap; use uuid::Uuid; +mod external_networking; + impl DataStore { /// List blueprints pub async fn blueprints_list( @@ -663,8 +667,12 @@ impl DataStore { .transaction_async(|conn| async move { // Ensure that blueprint we're about to delete is not the // current target. 
- let current_target = - self.blueprint_current_target_only(&conn).await?; + let current_target = self + .blueprint_current_target_only( + &conn, + SelectFlavor::Standard, + ) + .await?; if current_target.target_id == blueprint_id { return Err(TransactionError::CustomError( Error::conflict(format!( @@ -787,6 +795,147 @@ impl DataStore { Ok(()) } + /// Ensure all external networking IPs and service vNICs described by + /// `blueprint` are allocated (for in-service zones) or deallocated + /// (otherwise), conditional on `blueprint` being the current target + /// blueprint. + /// + /// This method may be safely executed from the blueprint executor RPW; the + /// condition on the current target blueprint ensures a Nexus attempting to + /// realize an out of date blueprint can't overwrite changes made by a Nexus + /// that realized the current target. + pub async fn blueprint_ensure_external_networking_resources( + &self, + opctx: &OpContext, + blueprint: &Blueprint, + ) -> Result<(), Error> { + self.blueprint_ensure_external_networking_resources_impl( + opctx, + blueprint, + #[cfg(test)] + None, + #[cfg(test)] + None, + ) + .await + } + + // The third and fourth arguments to this function only exist when run under + // test, and allow the calling test to control the general timing of the + // transaction executed by this method: + // + // 1. Check that `blueprint` is the current target blueprint + // 2. Set `target_check_done` to true (the test can wait on this) + // 3. Run remainder of transaction to allocate/deallocate resources + // 4. Wait until `return_on_completion` is set to true + // 5. Return + // + // If either of these arguments is `None`, its step (2 or 4) is skipped.
+ async fn blueprint_ensure_external_networking_resources_impl( + &self, + opctx: &OpContext, + blueprint: &Blueprint, + #[cfg(test)] target_check_done: Option< + std::sync::Arc, + >, + #[cfg(test)] return_on_completion: Option< + std::sync::Arc, + >, + ) -> Result<(), Error> { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "blueprint_ensure_external_networking_resources", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + #[cfg(test)] + let target_check_done = target_check_done.clone(); + #[cfg(test)] + let return_on_completion = return_on_completion.clone(); + + async move { + // Bail out if `blueprint` isn't the current target. + let current_target = self + .blueprint_current_target_only( + &conn, + SelectFlavor::ForUpdate, + ) + .await + .map_err(|e| err.bail(e))?; + if current_target.target_id != blueprint.id { + return Err(err.bail(Error::invalid_request(format!( + "blueprint {} is not the current target blueprint ({})", + blueprint.id, current_target.target_id + )))); + } + + // See the comment on this method; this lets us notify our test + // caller that we've performed our target blueprint check. + #[cfg(test)] + { + use std::sync::atomic::Ordering; + if let Some(gate) = target_check_done { + gate.store(true, Ordering::SeqCst); + } + } + + // Deallocate external networking resources for + // non-externally-reachable zones before allocating resources + // for reachable zones. This will allow allocation to succeed if + // we are swapping an external IP between two zones (e.g., + // moving a specific external IP from an old external DNS zone + // to a new one). 
+ self.ensure_zone_external_networking_deallocated_on_connection( + &conn, + &opctx.log, + blueprint + .all_omicron_zones_not_in( + BlueprintZoneFilter::ShouldBeExternallyReachable, + ) + .map(|(_sled_id, zone)| zone), + ) + .await + .map_err(|e| err.bail(e))?; + self.ensure_zone_external_networking_allocated_on_connection( + &conn, + opctx, + blueprint + .all_omicron_zones( + BlueprintZoneFilter::ShouldBeExternallyReachable, + ) + .map(|(_sled_id, zone)| zone), + ) + .await + .map_err(|e| err.bail(e))?; + + // See the comment on this method; this lets us wait until our + // test caller is ready for us to return. + #[cfg(test)] + { + use std::sync::atomic::Ordering; + use std::time::Duration; + if let Some(gate) = return_on_completion { + while !gate.load(Ordering::SeqCst) { + tokio::time::sleep(Duration::from_millis(50)).await; + } + } + } + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + /// Set the current target blueprint /// /// In order to become the target blueprint, `target`'s parent blueprint @@ -930,7 +1079,9 @@ impl DataStore { opctx.authorize(authz::Action::Read, &authz::BLUEPRINT_CONFIG).await?; let conn = self.pool_connection_authorized(opctx).await?; - let target = self.blueprint_current_target_only(&conn).await?; + let target = self + .blueprint_current_target_only(&conn, SelectFlavor::Standard) + .await?; // The blueprint for the current target cannot be deleted while it is // the current target, but it's possible someone else (a) made a new @@ -951,7 +1102,7 @@ impl DataStore { ) -> Result { opctx.authorize(authz::Action::Read, &authz::BLUEPRINT_CONFIG).await?; let conn = self.pool_connection_authorized(opctx).await?; - self.blueprint_current_target_only(&conn).await + self.blueprint_current_target_only(&conn, SelectFlavor::Standard).await } // Helper to fetch the current blueprint target (without fetching the entire @@ -961,13 
+1112,26 @@ impl DataStore { async fn blueprint_current_target_only( &self, conn: &async_bb8_diesel::Connection, + select_flavor: SelectFlavor, ) -> Result { use db::schema::bp_target::dsl; - let current_target = dsl::bp_target - .order_by(dsl::version.desc()) - .first_async::(conn) - .await + let query_result = match select_flavor { + SelectFlavor::ForUpdate => { + dsl::bp_target + .order_by(dsl::version.desc()) + .for_update() + .first_async::(conn) + .await + } + SelectFlavor::Standard => { + dsl::bp_target + .order_by(dsl::version.desc()) + .first_async::(conn) + .await + } + }; + let current_target = query_result .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -984,6 +1148,14 @@ impl DataStore { } } +#[derive(Debug, Clone, Copy)] +enum SelectFlavor { + /// A normal `SELECT`. + Standard, + /// Acquire a database-level write lock via `SELECT ... FOR UPDATE`. + ForUpdate, +} + // Helper to create an `authz::Blueprint` for a specific blueprint ID fn authz_blueprint_from_id(blueprint_id: Uuid) -> authz::Blueprint { authz::Blueprint::new( @@ -1361,6 +1533,8 @@ mod tests { use omicron_common::address::Ipv6Subnet; use omicron_common::disk::DiskIdentity; use omicron_test_utils::dev; + use omicron_test_utils::dev::poll::wait_for_condition; + use omicron_test_utils::dev::poll::CondCheckError; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; @@ -1371,6 +1545,10 @@ mod tests { use slog::Logger; use std::mem; use std::net::Ipv6Addr; + use std::sync::atomic::AtomicBool; + use std::sync::atomic::Ordering; + use std::sync::Arc; + use std::time::Duration; static EMPTY_PLANNING_INPUT: Lazy = Lazy::new(|| PlanningInputBuilder::empty_input()); @@ -2061,6 +2239,199 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn test_ensure_external_networking_bails_on_bad_target() { + // Setup + let logctx = dev::test_setup_log( + 
"test_ensure_external_networking_bails_on_bad_target", + ); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create an initial blueprint and a child. + let blueprint1 = BlueprintBuilder::build_empty_with_sleds( + std::iter::empty(), + "test1", + ); + let blueprint2 = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint1, + &EMPTY_PLANNING_INPUT, + "test2", + ) + .expect("failed to create builder") + .build(); + + // Insert both into the blueprint table. + datastore.blueprint_insert(&opctx, &blueprint1).await.unwrap(); + datastore.blueprint_insert(&opctx, &blueprint2).await.unwrap(); + + let bp1_target = BlueprintTarget { + target_id: blueprint1.id, + enabled: true, + time_made_target: now_db_precision(), + }; + let bp2_target = BlueprintTarget { + target_id: blueprint2.id, + enabled: true, + time_made_target: now_db_precision(), + }; + + // Set bp1_target as the current target. + datastore + .blueprint_target_set_current(&opctx, bp1_target) + .await + .unwrap(); + + // Attempting to ensure the (empty) resources for bp1 should succeed. + datastore + .blueprint_ensure_external_networking_resources(&opctx, &blueprint1) + .await + .expect("ensured networking resources for empty blueprint 1"); + + // Attempting to ensure the (empty) resources for bp2 should fail, + // because it isn't the target blueprint. + let err = datastore + .blueprint_ensure_external_networking_resources(&opctx, &blueprint2) + .await + .expect_err("failed because blueprint 2 isn't the target"); + assert!( + err.to_string().contains("is not the current target blueprint"), + "unexpected error: {err}" + ); + + // Create flags to control method execution. + let target_check_done = Arc::new(AtomicBool::new(false)); + let return_on_completion = Arc::new(AtomicBool::new(false)); + + // Spawn a task to execute our method. 
+ let mut ensure_resources_task = tokio::spawn({ + let datastore = datastore.clone(); + let opctx = + OpContext::for_tests(logctx.log.clone(), datastore.clone()); + let target_check_done = target_check_done.clone(); + let return_on_completion = return_on_completion.clone(); + async move { + datastore + .blueprint_ensure_external_networking_resources_impl( + &opctx, + &blueprint1, + Some(target_check_done), + Some(return_on_completion), + ) + .await + } + }); + + // Wait until `task` has proceeded past the point at which it's checked + // the target blueprint. + wait_for_condition( + || async { + if target_check_done.load(Ordering::SeqCst) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(50), + &Duration::from_secs(10), + ) + .await + .expect("`target_check_done` not set to true"); + + // Spawn another task that tries to read the current target. This should + // block at the database level due to the `SELECT ... FOR UPDATE` inside + // `blueprint_ensure_external_networking_resources`. + let mut current_target_task = tokio::spawn({ + let datastore = datastore.clone(); + let opctx = + OpContext::for_tests(logctx.log.clone(), datastore.clone()); + async move { + datastore + .blueprint_target_get_current(&opctx) + .await + .expect("read current target") + } + }); + + // Spawn another task that tries to set the current target. This should + // block at the database level due to the `SELECT ... FOR UPDATE` inside + // `blueprint_ensure_external_networking_resources`. 
+ let mut update_target_task = tokio::spawn({ + let datastore = datastore.clone(); + let opctx = + OpContext::for_tests(logctx.log.clone(), datastore.clone()); + async move { + datastore.blueprint_target_set_current(&opctx, bp2_target).await + } + }); + + // None of our spawned tasks should be able to make progress: + // `ensure_resources_task` is waiting for us to set + // `return_on_completion` to true, and the other two should be + // queued by Cockroach, because + // `blueprint_ensure_external_networking_resources` should have + // performed a `SELECT ... FOR UPDATE` on the current target, forcing + // the query that wants to change it to wait until the transaction + // completes. + // + // We'll somewhat haphazardly test this by trying to wait for any + // task to finish, and succeeding on a timeout of a few seconds. This + // could spuriously succeed if we're executing on a very overloaded + // system where we hit the timeout even though one of the tasks is + // actually making progress, but hopefully will fail often enough if + // we've gotten this wrong. + tokio::select! { + result = &mut ensure_resources_task => { + panic!( + "unexpected completion of \ + `blueprint_ensure_external_networking_resources`: \ + {result:?}", + ); + } + result = &mut update_target_task => { + panic!( + "unexpected completion of \ + `blueprint_target_set_current`: {result:?}", + ); + } + result = &mut current_target_task => { + panic!( + "unexpected completion of \ + `blueprint_target_get_current`: {result:?}", + ); + } + _ = tokio::time::sleep(Duration::from_secs(5)) => (), + } + + // Release `ensure_resources_task` to finish. 
+ return_on_completion.store(true, Ordering::SeqCst); + + tokio::time::timeout(Duration::from_secs(10), ensure_resources_task) + .await + .expect( + "time out waiting for \ + `blueprint_ensure_external_networking_resources`", + ) + .expect("panic in `blueprint_ensure_external_networking_resources") + .expect("ensured networking resources for empty blueprint 2"); + + // Our other tasks should now also complete. + tokio::time::timeout(Duration::from_secs(10), update_target_task) + .await + .expect("time out waiting for `blueprint_target_set_current`") + .expect("panic in `blueprint_target_set_current") + .expect("updated target to blueprint 2"); + tokio::time::timeout(Duration::from_secs(10), current_target_task) + .await + .expect("time out waiting for `blueprint_target_get_current`") + .expect("panic in `blueprint_target_get_current"); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint .all_omicron_zones(BlueprintZoneFilter::All) diff --git a/nexus/reconfigurator/execution/src/external_networking.rs b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs similarity index 60% rename from nexus/reconfigurator/execution/src/external_networking.rs rename to nexus/db-queries/src/db/datastore/deployment/external_networking.rs index 3e98aa4ff0..b6ced8e2c5 100644 --- a/nexus/reconfigurator/execution/src/external_networking.rs +++ b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs @@ -5,17 +5,18 @@ //! Manages allocation and deallocation of external networking resources //! 
required for blueprint realization -use anyhow::bail; -use anyhow::Context; +use crate::context::OpContext; +use crate::db::fixed_data::vpc_subnet::DNS_VPC_SUBNET; +use crate::db::fixed_data::vpc_subnet::NEXUS_VPC_SUBNET; +use crate::db::fixed_data::vpc_subnet::NTP_VPC_SUBNET; +use crate::db::DataStore; +use crate::db::DbConnection; use nexus_db_model::IncompleteNetworkInterface; -use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::fixed_data::vpc_subnet::DNS_VPC_SUBNET; -use nexus_db_queries::db::fixed_data::vpc_subnet::NEXUS_VPC_SUBNET; -use nexus_db_queries::db::fixed_data::vpc_subnet::NTP_VPC_SUBNET; -use nexus_db_queries::db::DataStore; +use nexus_db_model::IpPool; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::OmicronZoneExternalIp; +use omicron_common::api::external::Error; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::NetworkInterfaceKind; @@ -28,415 +29,394 @@ use slog::warn; use slog::Logger; use slog_error_chain::InlineErrorChain; -pub(crate) async fn ensure_zone_external_networking_allocated( - opctx: &OpContext, - datastore: &DataStore, - zones_to_allocate: impl Iterator, -) -> anyhow::Result<()> { - for z in zones_to_allocate { - let Some((external_ip, nic)) = z.zone_type.external_networking() else { - continue; - }; +impl DataStore { + pub(super) async fn ensure_zone_external_networking_allocated_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + opctx: &OpContext, + zones_to_allocate: impl Iterator, + ) -> Result<(), Error> { + // Looking up the service pool ID requires an opctx; we'll do this once + // up front and reuse the pool ID (which never changes) in the loop + // below. 
+ let (_, pool) = self.ip_pools_service_lookup(opctx).await?; + + for z in zones_to_allocate { + let Some((external_ip, nic)) = z.zone_type.external_networking() + else { + continue; + }; - let log = opctx.log.new(slog::o!( - "action" => "allocate-external-networking", - "zone_kind" => z.zone_type.kind().report_str(), - "zone_id" => z.id.to_string(), - "ip" => format!("{external_ip:?}"), - "nic" => format!("{nic:?}"), - )); - - let kind = z.zone_type.kind(); - ensure_external_service_ip( - opctx, - datastore, - kind, - z.id, - external_ip, - &log, - ) - .await?; - ensure_service_nic(opctx, datastore, kind, z.id, nic, &log).await?; + let log = opctx.log.new(slog::o!( + "action" => "allocate-external-networking", + "zone_kind" => z.zone_type.kind().report_str(), + "zone_id" => z.id.to_string(), + "ip" => format!("{external_ip:?}"), + "nic" => format!("{nic:?}"), + )); + + let kind = z.zone_type.kind(); + self.ensure_external_service_ip( + conn, + &pool, + kind, + z.id, + external_ip, + &log, + ) + .await?; + self.ensure_service_nic(conn, kind, z.id, nic, &log).await?; + } + + Ok(()) } - Ok(()) -} + pub(super) async fn ensure_zone_external_networking_deallocated_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + log: &Logger, + zones_to_deallocate: impl Iterator, + ) -> Result<(), Error> { + for z in zones_to_deallocate { + let Some((external_ip, nic)) = z.zone_type.external_networking() + else { + continue; + }; -pub(crate) async fn ensure_zone_external_networking_deallocated( - opctx: &OpContext, - datastore: &DataStore, - zones_to_deallocate: impl Iterator, -) -> anyhow::Result<()> { - for z in zones_to_deallocate { - let Some((external_ip, nic)) = z.zone_type.external_networking() else { - continue; - }; + let kind = z.zone_type.kind(); + let log = log.new(slog::o!( + "action" => "deallocate-external-networking", + "zone_kind" => kind.report_str(), + "zone_id" => z.id.to_string(), + "ip" => format!("{external_ip:?}"), + "nic" => format!("{nic:?}"), + 
)); - let kind = z.zone_type.kind(); - let log = opctx.log.new(slog::o!( - "action" => "deallocate-external-networking", - "zone_kind" => kind.report_str(), - "zone_id" => z.id.to_string(), - "ip" => format!("{external_ip:?}"), - "nic" => format!("{nic:?}"), - )); - - let deleted_ip = datastore - .deallocate_external_ip(opctx, external_ip.id().into_untyped_uuid()) - .await - .with_context(|| { - format!( - "failed to delete external IP {external_ip:?} \ - for {} zone {}", - kind.report_str(), - z.id + let deleted_ip = self + .deallocate_external_ip_on_connection( + conn, + external_ip.id().into_untyped_uuid(), ) - })?; - if deleted_ip { - info!(log, "successfully deleted Omicron zone external IP"); - } else { - debug!(log, "Omicron zone external IP already deleted"); - } + .await?; + if deleted_ip { + info!(log, "successfully deleted Omicron zone external IP"); + } else { + debug!(log, "Omicron zone external IP already deleted"); + } - let deleted_nic = datastore - .service_delete_network_interface( - opctx, - z.id.into_untyped_uuid(), - nic.id, - ) - .await - .with_context(|| { - format!( - "failed to delete service VNIC {nic:?} for {} zone {}", - kind.report_str(), - z.id + let deleted_nic = self + .service_delete_network_interface_on_connection( + conn, + z.id.into_untyped_uuid(), + nic.id, ) - })?; - if deleted_nic { - info!(log, "successfully deleted Omicron zone vNIC"); - } else { - debug!(log, "Omicron zone vNIC already deleted"); + .await + .map_err(|err| err.into_external())?; + if deleted_nic { + info!(log, "successfully deleted Omicron zone vNIC"); + } else { + debug!(log, "Omicron zone vNIC already deleted"); + } } - } - Ok(()) -} - -// Helper function to determine whether a given external IP address is -// already allocated to a specific service zone. 
-async fn is_external_ip_already_allocated( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - zone_id: OmicronZoneUuid, - external_ip: OmicronZoneExternalIp, - log: &Logger, -) -> anyhow::Result { - // localhost is used by many components in the test suite. We can't use - // the normal path because normally a given external IP must only be - // used once. Just treat localhost in the test suite as though it's - // already allocated. We do the same in is_nic_already_allocated(). - if cfg!(test) && external_ip.ip().is_loopback() { - return Ok(true); + Ok(()) } - let allocated_ips = datastore - .external_ip_list_service(opctx, zone_id.into_untyped_uuid()) - .await - .with_context(|| { - format!( - "failed to look up external IPs for {} {zone_id}", - zone_kind.report_str() + // Helper function to determine whether a given external IP address is + // already allocated to a specific service zone. + async fn is_external_ip_already_allocated( + &self, + conn: &async_bb8_diesel::Connection, + zone_id: OmicronZoneUuid, + external_ip: OmicronZoneExternalIp, + log: &Logger, + ) -> Result { + // localhost is used by many components in the test suite. We can't use + // the normal path because normally a given external IP must only be + // used once. Just treat localhost in the test suite as though it's + // already allocated. We do the same in is_nic_already_allocated(). + if cfg!(any(test, feature = "testing")) + && external_ip.ip().is_loopback() + { + return Ok(true); + } + + let allocated_ips = self + .external_ip_list_service_on_connection( + conn, + zone_id.into_untyped_uuid(), ) - })?; + .await?; - // We expect to find either 0 or exactly 1 IP for any given zone. If 0, - // we know the IP isn't allocated; if 1, we'll check that it matches - // below. - let existing_ip = match allocated_ips.as_slice() { - [] => { - info!(log, "external IP allocation required for zone"); + // We expect to find either 0 or exactly 1 IP for any given zone. 
If 0, + // we know the IP isn't allocated; if 1, we'll check that it matches + // below. + let existing_ip = match allocated_ips.as_slice() { + [] => { + info!(log, "external IP allocation required for zone"); - return Ok(false); - } - [ip] => ip, - _ => { + return Ok(false); + } + [ip] => ip, + _ => { + warn!( + log, "zone has multiple IPs allocated"; + "allocated_ips" => ?allocated_ips, + ); + return Err(Error::invalid_request(format!( + "zone {zone_id} already has {} IPs allocated (expected 1)", + allocated_ips.len() + ))); + } + }; + + // We expect this to always succeed; a failure here means we've stored + // an Omicron zone IP in the database that can't be converted back to an + // Omicron zone IP! + let existing_ip = match OmicronZoneExternalIp::try_from(existing_ip) { + Ok(existing_ip) => existing_ip, + Err(err) => { + error!(log, "invalid IP in database for zone"; &err); + return Err(Error::invalid_request(format!( + "zone {zone_id} has invalid IP database record: {}", + InlineErrorChain::new(&err) + ))); + } + }; + + if existing_ip == external_ip { + info!(log, "found already-allocated external IP"); + Ok(true) + } else { warn!( - log, "zone has multiple IPs allocated"; - "allocated_ips" => ?allocated_ips, - ); - bail!( - "zone {zone_id} already has {} IPs allocated (expected 1)", - allocated_ips.len() - ); - } - }; - - // We expect this to always succeed; a failure here means we've stored - // an Omicron zone IP in the database that can't be converted back to an - // Omicron zone IP! 
- let existing_ip = match OmicronZoneExternalIp::try_from(existing_ip) { - Ok(existing_ip) => existing_ip, - Err(err) => { - error!(log, "invalid IP in database for zone"; &err); - bail!( - "zone {zone_id} has invalid IP database record: {}", - InlineErrorChain::new(&err) + log, "zone has unexpected IP allocated"; + "allocated_ip" => ?existing_ip, ); + return Err(Error::invalid_request(format!( + "zone {zone_id} has a different IP allocated ({existing_ip:?})", + ))); } - }; - - if existing_ip == external_ip { - info!(log, "found already-allocated external IP"); - Ok(true) - } else { - warn!( - log, "zone has unexpected IP allocated"; - "allocated_ip" => ?existing_ip, - ); - bail!("zone {zone_id} has a different IP allocated ({existing_ip:?})",); } -} -// Helper function to determine whether a given NIC is already allocated to -// a specific service zone. -async fn is_nic_already_allocated( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - zone_id: OmicronZoneUuid, - nic: &NetworkInterface, - log: &Logger, -) -> anyhow::Result { - // See the comment in is_external_ip_already_allocated(). - if cfg!(test) && nic.ip.is_loopback() { - return Ok(true); - } + // Helper function to determine whether a given NIC is already allocated to + // a specific service zone. + async fn is_nic_already_allocated( + &self, + conn: &async_bb8_diesel::Connection, + zone_id: OmicronZoneUuid, + nic: &NetworkInterface, + log: &Logger, + ) -> Result { + // See the comment in is_external_ip_already_allocated(). 
+ if cfg!(any(test, feature = "testing")) && nic.ip.is_loopback() { + return Ok(true); + } - let allocated_nics = datastore - .service_list_network_interfaces(opctx, zone_id.into_untyped_uuid()) - .await - .with_context(|| { - format!( - "failed to look up NICs for {} {zone_id}", - zone_kind.report_str() + let allocated_nics = self + .service_list_network_interfaces_on_connection( + conn, + zone_id.into_untyped_uuid(), ) - })?; - - if !allocated_nics.is_empty() { - // All the service zones that want NICs only expect to have a single - // one. Bail out here if this zone already has one or more allocated - // NICs but not the one we think it needs. - // - // This doesn't check the allocated NIC's subnet against our NICs, - // because that would require an extra DB lookup. We'll assume if - // these main properties are correct, the subnet is too. - for allocated_nic in &allocated_nics { - if allocated_nic.ip.ip() == nic.ip - && *allocated_nic.mac == nic.mac - && *allocated_nic.slot == nic.slot - && allocated_nic.primary == nic.primary - { - info!(log, "found already-allocated NIC"); - return Ok(true); + .await?; + + if !allocated_nics.is_empty() { + // All the service zones that want NICs only expect to have a single + // one. Bail out here if this zone already has one or more allocated + // NICs but not the one we think it needs. + // + // This doesn't check the allocated NIC's subnet against our NICs, + // because that would require an extra DB lookup. We'll assume if + // these main properties are correct, the subnet is too. 
+ for allocated_nic in &allocated_nics { + if allocated_nic.ip.ip() == nic.ip + && *allocated_nic.mac == nic.mac + && *allocated_nic.slot == nic.slot + && allocated_nic.primary == nic.primary + { + info!(log, "found already-allocated NIC"); + return Ok(true); + } } - } - - warn!( - log, "zone has unexpected NICs allocated"; - "allocated_nics" => ?allocated_nics, - ); - bail!( - "zone {zone_id} already has {} non-matching NIC(s) allocated", - allocated_nics.len() - ); - } + warn!( + log, "zone has unexpected NICs allocated"; + "allocated_nics" => ?allocated_nics, + ); - info!(log, "NIC allocation required for zone"); + return Err(Error::invalid_request(format!( + "zone {zone_id} already has {} non-matching NIC(s) allocated", + allocated_nics.len() + ))); + } - Ok(false) -} + info!(log, "NIC allocation required for zone"); -async fn ensure_external_service_ip( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - zone_id: OmicronZoneUuid, - external_ip: OmicronZoneExternalIp, - log: &Logger, -) -> anyhow::Result<()> { - // Only attempt to allocate `external_ip` if it isn't already assigned - // to this zone. - // - // Checking for the existing of the external IP and then creating it - // if not found inserts a classic TOCTOU race: what if another Nexus - // is running concurrently, we both check and see that the IP is not - // allocated, then both attempt to create it? We believe this is - // okay: the loser of the race (i.e., the one whose create tries to - // commit second) will fail to allocate the IP, which will bubble - // out and prevent realization of the current blueprint. That's - // exactly what we want if two Nexuses try to realize the same - // blueprint at the same time. - if is_external_ip_already_allocated( - opctx, - datastore, - zone_kind, - zone_id, - external_ip, - log, - ) - .await? 
- { - return Ok(()); + Ok(false) } - datastore - .external_ip_allocate_omicron_zone( - opctx, + + async fn ensure_external_service_ip( + &self, + conn: &async_bb8_diesel::Connection, + pool: &IpPool, + zone_kind: ZoneKind, + zone_id: OmicronZoneUuid, + external_ip: OmicronZoneExternalIp, + log: &Logger, + ) -> Result<(), Error> { + // Only attempt to allocate `external_ip` if it isn't already assigned + // to this zone. + // + // Checking for the existence of the external IP and then creating it + // if not found inserts a classic TOCTOU race: what if another Nexus + // is running concurrently, we both check and see that the IP is not + // allocated, then both attempt to create it? We believe this is + // okay: the loser of the race (i.e., the one whose create tries to + // commit second) will fail to allocate the IP, which will bubble + // out and prevent realization of the current blueprint. That's + // exactly what we want if two Nexuses try to realize the same + // blueprint at the same time. + if self + .is_external_ip_already_allocated(conn, zone_id, external_ip, log) + .await? + { + return Ok(()); + } + self.external_ip_allocate_omicron_zone_on_connection( + conn, + pool, zone_id, zone_kind, external_ip, ) - .await - .with_context(|| { - format!( - "failed to allocate IP to {} {zone_id}: {external_ip:?}", - zone_kind.report_str() - ) - })?; + .await?; - info!(log, "successfully allocated external IP"); + info!(log, "successfully allocated external IP"); - Ok(()) -} + Ok(()) + } -// All service zones with external connectivity get service vNICs. -async fn ensure_service_nic( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - service_id: OmicronZoneUuid, - nic: &NetworkInterface, - log: &Logger, -) -> anyhow::Result<()> { - // We don't pass `nic.kind` into the database below, but instead - // explicitly call `service_create_network_interface`. Ensure this is - // indeed a service NIC. - match &nic.kind { - NetworkInterfaceKind::Instance { ..
} => { - bail!("invalid NIC kind (expected service, got instance)") - } - NetworkInterfaceKind::Probe { .. } => { - bail!("invalid NIC kind (expected service, got probe)") + // All service zones with external connectivity get service vNICs. + async fn ensure_service_nic( + &self, + conn: &async_bb8_diesel::Connection, + zone_kind: ZoneKind, + service_id: OmicronZoneUuid, + nic: &NetworkInterface, + log: &Logger, + ) -> Result<(), Error> { + // We don't pass `nic.kind` into the database below, but instead + // explicitly call `service_create_network_interface`. Ensure this is + // indeed a service NIC. + match &nic.kind { + NetworkInterfaceKind::Instance { .. } => { + return Err(Error::invalid_request( + "invalid NIC kind (expected service, got instance)", + )); + } + NetworkInterfaceKind::Probe { .. } => { + return Err(Error::invalid_request( + "invalid NIC kind (expected service, got probe)", + )); + } + NetworkInterfaceKind::Service { .. } => (), } - NetworkInterfaceKind::Service { .. 
} => (), - } - let nic_subnet = match zone_kind { - ZoneKind::BoundaryNtp => &*NTP_VPC_SUBNET, - ZoneKind::ExternalDns => &*DNS_VPC_SUBNET, - ZoneKind::Nexus => &*NEXUS_VPC_SUBNET, - ZoneKind::Clickhouse - | ZoneKind::ClickhouseKeeper - | ZoneKind::CockroachDb - | ZoneKind::Crucible - | ZoneKind::CruciblePantry - | ZoneKind::InternalDns - | ZoneKind::InternalNtp - | ZoneKind::Oximeter => { - bail!("no VPC subnet available for {} zone", zone_kind.report_str()) + let nic_subnet = match zone_kind { + ZoneKind::BoundaryNtp => &*NTP_VPC_SUBNET, + ZoneKind::ExternalDns => &*DNS_VPC_SUBNET, + ZoneKind::Nexus => &*NEXUS_VPC_SUBNET, + ZoneKind::Clickhouse + | ZoneKind::ClickhouseKeeper + | ZoneKind::CockroachDb + | ZoneKind::Crucible + | ZoneKind::CruciblePantry + | ZoneKind::InternalDns + | ZoneKind::InternalNtp + | ZoneKind::Oximeter => { + return Err(Error::invalid_request(format!( + "no VPC subnet available for {} zone", + zone_kind.report_str() + ))); + } + }; + + // Only attempt to allocate `nic` if it isn't already assigned to this + // zone. + // + // This is subject to the same kind of TOCTOU race as described for IP + // allocation in `ensure_external_service_ip`, and we believe it's okay + // for the same reasons as described there. + if self.is_nic_already_allocated(conn, service_id, nic, log).await? { + return Ok(()); } - }; - - // Only attempt to allocate `nic` if it isn't already assigned to this - // zone. - // - // This is subject to the same kind of TOCTOU race as described for IP - // allocation in `ensure_external_service_ip`, and we believe it's okay - // for the same reasons as described there. - if is_nic_already_allocated( - opctx, datastore, zone_kind, service_id, nic, log, - ) - .await? 
- { - return Ok(()); - } - let nic_arg = IncompleteNetworkInterface::new_service( - nic.id, - service_id.into_untyped_uuid(), - nic_subnet.clone(), - IdentityMetadataCreateParams { - name: nic.name.clone(), - description: format!("{} service vNIC", zone_kind.report_str()), - }, - nic.ip, - nic.mac, - nic.slot, - ) - .with_context(|| { - format!( - "failed to convert NIC into IncompleteNetworkInterface: {nic:?}" - ) - })?; - let created_nic = datastore - .service_create_network_interface(opctx, nic_arg) - .await - .map_err(|err| err.into_external()) - .with_context(|| { - format!( - "failed to allocate NIC to {} {service_id}: {nic:?}", - zone_kind.report_str() - ) - })?; - - // We don't pass all the properties of `nic` into the create request - // above. Double-check that the properties the DB assigned match - // what we expect. - // - // We do not check `nic.vni`, because it's not stored in the - // database. (All services are given the constant vni - // `Vni::SERVICES_VNI`.) - if created_nic.primary != nic.primary || *created_nic.slot != nic.slot { - warn!( - log, "unexpected property on allocated NIC"; - "allocated_primary" => created_nic.primary, - "allocated_slot" => *created_nic.slot, - ); - - // Now what? We've allocated a NIC in the database but it's - // incorrect. Should we try to delete it? That would be best - // effort (we could fail to delete, or we could crash between - // creation and deletion). + let nic_arg = IncompleteNetworkInterface::new_service( + nic.id, + service_id.into_untyped_uuid(), + nic_subnet.clone(), + IdentityMetadataCreateParams { + name: nic.name.clone(), + description: format!("{} service vNIC", zone_kind.report_str()), + }, + nic.ip, + nic.mac, + nic.slot, + )?; + let created_nic = self + .create_network_interface_raw_conn(conn, nic_arg) + .await + .map_err(|err| err.into_external())?; + + // We don't pass all the properties of `nic` into the create request + // above. 
Double-check that the properties the DB assigned match + // what we expect. // - // We only expect services to have one NIC, so the only way it - // should be possible to get a different primary/slot value is - // if somehow this same service got a _different_ NIC allocated - // to it in the TOCTOU race window above. That should be - // impossible with the way we generate blueprints, so we'll just - // return a scary error here and expect to never see it. - bail!( - "database cleanup required: unexpected NIC ({created_nic:?}) \ - allocated for {} {service_id}", - zone_kind.report_str(), - ); - } + // We do not check `nic.vni`, because it's not stored in the + // database. (All services are given the constant vni + // `Vni::SERVICES_VNI`.) + if created_nic.primary != nic.primary || *created_nic.slot != nic.slot { + warn!( + log, "unexpected property on allocated NIC"; + "allocated_primary" => created_nic.primary, + "allocated_slot" => *created_nic.slot, + ); - info!(log, "successfully allocated service vNIC"); + // Now what? We've allocated a NIC in the database but it's + // incorrect. Should we try to delete it? That would be best + // effort (we could fail to delete, or we could crash between + // creation and deletion). + // + // We only expect services to have one NIC, so the only way it + // should be possible to get a different primary/slot value is + // if somehow this same service got a _different_ NIC allocated + // to it in the TOCTOU race window above. That should be + // impossible with the way we generate blueprints, so we'll just + // return a scary error here and expect to never see it. 
+ return Err(Error::invalid_request(format!( + "database cleanup required: unexpected NIC ({created_nic:?}) \ + allocated for {} {service_id}", + zone_kind.report_str(), + ))); + } + + info!(log, "successfully allocated service vNIC"); - Ok(()) + Ok(()) + } } #[cfg(test)] mod tests { use super::*; + use crate::db::datastore::test_utils::datastore_test; + use crate::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; + use anyhow::Context as _; use async_bb8_diesel::AsyncSimpleConnection; use chrono::DateTime; use chrono::Utc; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; use nexus_db_model::SqlU16; - use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; - use nexus_test_utils_macros::nexus_test; + use nexus_test_utils::db::test_setup_database; use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; @@ -455,6 +435,7 @@ mod tests { use omicron_common::api::external::MacAddr; use omicron_common::api::external::Vni; use omicron_common::zpool_name::ZpoolName; + use omicron_test_utils::dev; use omicron_uuid_kinds::ExternalIpUuid; use omicron_uuid_kinds::ZpoolUuid; use oxnet::IpNet; @@ -463,9 +444,6 @@ mod tests { use std::net::SocketAddr; use uuid::Uuid; - type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext; - struct Harness { external_ips_range: IpRange, external_ips: IpRangeIter, @@ -658,14 +636,11 @@ mod tests { ] } - async fn assert_ips_exist_in_datastore( - &self, - opctx: &OpContext, - datastore: &DataStore, - ) { + async fn assert_ips_exist_in_datastore(&self, datastore: &DataStore) { + let conn = datastore.pool_connection_for_tests().await.unwrap(); let db_nexus_ips = datastore - .external_ip_list_service( - &opctx, + .external_ip_list_service_on_connection( + &conn, self.nexus_id.into_untyped_uuid(), ) .await @@ -685,8 +660,8 @@ mod tests { assert_eq!(db_nexus_ips[0].last_port, 
SqlU16(65535)); let db_dns_ips = datastore - .external_ip_list_service( - &opctx, + .external_ip_list_service_on_connection( + &conn, self.dns_id.into_untyped_uuid(), ) .await @@ -709,8 +684,8 @@ mod tests { assert_eq!(db_dns_ips[0].last_port, SqlU16(65535)); let db_ntp_ips = datastore - .external_ip_list_service( - &opctx, + .external_ip_list_service_on_connection( + &conn, self.ntp_id.into_untyped_uuid(), ) .await @@ -735,14 +710,11 @@ mod tests { ); } - async fn assert_nics_exist_in_datastore( - &self, - opctx: &OpContext, - datastore: &DataStore, - ) { + async fn assert_nics_exist_in_datastore(&self, datastore: &DataStore) { + let conn = datastore.pool_connection_for_tests().await.unwrap(); let db_nexus_nics = datastore - .service_list_network_interfaces( - &opctx, + .service_list_network_interfaces_on_connection( + &conn, self.nexus_id.into_untyped_uuid(), ) .await @@ -761,8 +733,8 @@ mod tests { assert_eq!(db_nexus_nics[0].primary, self.nexus_nic.primary); let db_dns_nics = datastore - .service_list_network_interfaces( - &opctx, + .service_list_network_interfaces_on_connection( + &conn, self.dns_id.into_untyped_uuid(), ) .await @@ -781,8 +753,8 @@ mod tests { assert_eq!(db_dns_nics[0].primary, self.dns_nic.primary); let db_ntp_nics = datastore - .service_list_network_interfaces( - &opctx, + .service_list_network_interfaces_on_connection( + &conn, self.ntp_id.into_untyped_uuid(), ) .await @@ -898,21 +870,17 @@ mod tests { } } - #[nexus_test] - async fn test_allocate_external_networking( - cptestctx: &ControlPlaneTestContext, - ) { + #[tokio::test] + async fn test_allocate_external_networking() { // Set up. 
- let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); + usdt::register_probes().unwrap(); + let logctx = dev::test_setup_log("test_service_ip_list"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; // Generate the test values we care about. let mut harness = Harness::new(); - harness.set_up_service_ip_pool(&opctx, datastore).await; + harness.set_up_service_ip_pool(&opctx, &datastore).await; // Build the `zones` map needed by `ensure_zone_resources_allocated`, // with an arbitrary sled_id. @@ -920,31 +888,33 @@ mod tests { // Initialize resource allocation: this should succeed and create all // the relevant db records. - ensure_zone_external_networking_allocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); // Check that the external IP and NIC records were created. - harness.assert_ips_exist_in_datastore(&opctx, datastore).await; - harness.assert_nics_exist_in_datastore(&opctx, datastore).await; + harness.assert_ips_exist_in_datastore(&datastore).await; + harness.assert_nics_exist_in_datastore(&datastore).await; // We should be able to run the function again with the same inputs, and // it should succeed without inserting any new records. 
- ensure_zone_external_networking_allocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); - harness.assert_ips_exist_in_datastore(&opctx, datastore).await; - harness.assert_nics_exist_in_datastore(&opctx, datastore).await; + datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); + harness.assert_ips_exist_in_datastore(&datastore).await; + harness.assert_nics_exist_in_datastore(&datastore).await; // Now that we've tested the happy path, try some requests that ought to // fail because the request includes an external IP that doesn't match @@ -1027,13 +997,14 @@ mod tests { }; // and check that we get the error we expect. - let err = ensure_zone_external_networking_allocated( - &opctx, - datastore, - mutated_zones.iter(), - ) - .await - .expect_err("unexpected success"); + let err = datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + mutated_zones.iter(), + ) + .await + .expect_err("unexpected success"); assert!( err.to_string().contains(&expected_error), "expected {expected_error:?}, got {err:#}" @@ -1085,9 +1056,9 @@ mod tests { { let expected_error = mutate_nic_fn(zone.id, nic); - let err = ensure_zone_external_networking_allocated( + let err = datastore.ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), &opctx, - datastore, mutated_zones.iter(), ) .await @@ -1111,9 +1082,9 @@ mod tests { { let expected_error = mutate_nic_fn(zone.id, nic); - let err = ensure_zone_external_networking_allocated( + let err = datastore.ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), &opctx, - datastore, mutated_zones.iter(), ) .await @@ -1137,9 
+1108,9 @@ mod tests { { let expected_error = mutate_nic_fn(zone.id, nic); - let err = ensure_zone_external_networking_allocated( + let err = datastore.ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), &opctx, - datastore, mutated_zones.iter(), ) .await @@ -1154,23 +1125,23 @@ mod tests { } } } + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); } - #[nexus_test] - async fn test_deallocate_external_networking( - cptestctx: &ControlPlaneTestContext, - ) { + #[tokio::test] + async fn test_deallocate_external_networking() { // Set up. - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); + usdt::register_probes().unwrap(); + let logctx = dev::test_setup_log("test_service_ip_list"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; // Generate the test values we care about. let harness = Harness::new(); - harness.set_up_service_ip_pool(&opctx, datastore).await; + harness.set_up_service_ip_pool(&opctx, &datastore).await; // Build the `zones` map needed by `ensure_zone_resources_allocated`, // with an arbitrary sled_id. @@ -1178,45 +1149,52 @@ mod tests { // Initialize resource allocation: this should succeed and create all // the relevant db records. - ensure_zone_external_networking_allocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); // Check that the external IP and NIC records were created. 
- harness.assert_ips_exist_in_datastore(&opctx, datastore).await; - harness.assert_nics_exist_in_datastore(&opctx, datastore).await; + harness.assert_ips_exist_in_datastore(&datastore).await; + harness.assert_nics_exist_in_datastore(&datastore).await; // Deallocate resources: this should succeed and mark all relevant db // records deleted. - ensure_zone_external_networking_deallocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_deallocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &logctx.log, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); - harness.assert_ips_are_deleted_in_datastore(datastore).await; - harness.assert_nics_are_deleted_in_datastore(datastore).await; + harness.assert_ips_are_deleted_in_datastore(&datastore).await; + harness.assert_nics_are_deleted_in_datastore(&datastore).await; // This operation should be idempotent: we can run it again, and the // records remain deleted. - ensure_zone_external_networking_deallocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_deallocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &logctx.log, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); + + harness.assert_ips_are_deleted_in_datastore(&datastore).await; + harness.assert_nics_are_deleted_in_datastore(&datastore).await; - harness.assert_ips_are_deleted_in_datastore(datastore).await; - harness.assert_nics_are_deleted_in_datastore(datastore).await; + // Clean up. 
+ db.cleanup().await.unwrap(); + logctx.cleanup_successful(); } } diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 9a3928dd58..4b7f4a3825 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -23,6 +23,7 @@ use crate::db::model::ExternalIp; use crate::db::model::FloatingIp; use crate::db::model::IncompleteExternalIp; use crate::db::model::IpKind; +use crate::db::model::IpPool; use crate::db::model::Name; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; @@ -169,9 +170,9 @@ impl DataStore { } /// Fetch all external IP addresses of any kind for the provided service. - pub async fn external_ip_list_service( + pub async fn external_ip_list_service_on_connection( &self, - opctx: &OpContext, + conn: &async_bb8_diesel::Connection, service_id: Uuid, ) -> LookupResult> { use db::schema::external_ip::dsl; @@ -180,7 +181,7 @@ impl DataStore { .filter(dsl::parent_id.eq(service_id)) .filter(dsl::time_deleted.is_null()) .select(ExternalIp::as_select()) - .get_results_async(&*self.pool_connection_authorized(opctx).await?) + .get_results_async(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -329,6 +330,25 @@ impl DataStore { self.allocate_external_ip(opctx, data).await } + /// Variant of [Self::external_ip_allocate_omicron_zone] which may be called + /// from a transaction context. 
+ pub(crate) async fn external_ip_allocate_omicron_zone_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + service_pool: &IpPool, + zone_id: OmicronZoneUuid, + zone_kind: ZoneKind, + external_ip: OmicronZoneExternalIp, + ) -> Result> { + let data = IncompleteExternalIp::for_omicron_zone( + service_pool.id(), + external_ip, + zone_id, + zone_kind, + ); + Self::allocate_external_ip_on_connection(conn, data).await + } + /// List one page of all external IPs allocated to internal services pub async fn external_ip_list_service_all( &self, @@ -636,6 +656,17 @@ impl DataStore { &self, opctx: &OpContext, ip_id: Uuid, + ) -> Result { + let conn = self.pool_connection_authorized(opctx).await?; + self.deallocate_external_ip_on_connection(&conn, ip_id).await + } + + /// Variant of [Self::deallocate_external_ip] which may be called from a + /// transaction context. + pub(crate) async fn deallocate_external_ip_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + ip_id: Uuid, ) -> Result { use db::schema::external_ip::dsl; let now = Utc::now(); @@ -644,7 +675,7 @@ impl DataStore { .filter(dsl::id.eq(ip_id)) .set(dsl::time_deleted.eq(now)) .check_if_exists::(ip_id) - .execute_and_check(&*self.pool_connection_authorized(opctx).await?) 
+ .execute_and_check(conn) .await .map(|r| match r.status { UpdateStatus::Updated => true, diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 9fb94f043e..455aa62192 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -22,10 +22,12 @@ use crate::db::model::Generation; use crate::db::model::Instance; use crate::db::model::InstanceRuntimeState; use crate::db::model::Migration; +use crate::db::model::MigrationState; use crate::db::model::Name; use crate::db::model::Project; use crate::db::model::Sled; use crate::db::model::Vmm; +use crate::db::model::VmmState; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateAndQueryResult; @@ -35,9 +37,9 @@ use chrono::Utc; use diesel::prelude::*; use nexus_db_model::ApplySledFilterExt; use nexus_db_model::Disk; -use nexus_db_model::VmmRuntimeState; use nexus_types::deployment::SledFilter; use omicron_common::api; +use omicron_common::api::external; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -46,8 +48,8 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; +use omicron_common::api::external::MessagePair; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::MigrationRuntimeState; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -59,8 +61,8 @@ use uuid::Uuid; /// Wraps a record of an `Instance` along with its active `Vmm`, if it has one. 
#[derive(Clone, Debug)] pub struct InstanceAndActiveVmm { - instance: Instance, - vmm: Option, + pub instance: Instance, + pub vmm: Option, } impl InstanceAndActiveVmm { @@ -76,13 +78,98 @@ impl InstanceAndActiveVmm { self.vmm.as_ref().map(|v| SledUuid::from_untyped_uuid(v.sled_id)) } - pub fn effective_state( - &self, - ) -> omicron_common::api::external::InstanceState { - if let Some(vmm) = &self.vmm { - vmm.runtime.state.into() - } else { - self.instance.runtime().nexus_state.into() + /// Returns the operator-visible [external API + /// `InstanceState`](external::InstanceState) for this instance and its + /// active VMM. + pub fn effective_state(&self) -> external::InstanceState { + Self::determine_effective_state(&self.instance, self.vmm.as_ref()) + } + + /// Returns the operator-visible [external API + /// `InstanceState`](external::InstanceState) for the provided [`Instance`] + /// and its active [`Vmm`], if one exists. + /// + /// # Arguments + /// + /// - `instance`: the instance + /// - `active_vmm`: the instance's active VMM, if one exists. + /// + /// # Notes + /// + /// Generally, the value of `active_vmm` should be + /// the VMM pointed to by `instance.runtime_state.propolis_id`. However, + /// this is not enforced by this function, as the `instance_migrate` saga + /// must in some cases determine an effective instance state from the + /// instance and *target* VMM states. + pub fn determine_effective_state( + instance: &Instance, + active_vmm: Option<&Vmm>, + ) -> external::InstanceState { + use crate::db::model::InstanceState; + use crate::db::model::VmmState; + + let instance_state = instance.runtime_state.nexus_state; + let vmm_state = active_vmm.map(|vmm| vmm.runtime.state); + + // We want to only report that an instance is `Stopped` when a new + // `instance-start` saga is able to proceed. 
That means that: + match (instance_state, vmm_state) { + // - If there's an active migration ID for the instance, *always* + // treat its state as "migration" regardless of the VMM's state. + // + // This avoids an issue where an instance whose previous active + // VMM has been destroyed as a result of a successful migration + // out will appear to be "stopping" for the time between when that + // VMM was reported destroyed and when the instance record was + // updated to reflect the migration's completion. + // + // Instead, we'll continue to report the instance's state as + // "migrating" until an instance-update saga has resolved the + // outcome of the migration, since only the instance-update saga + // can complete the migration and update the instance record to + // point at its new active VMM. No new instance-migrate, + // instance-stop, or instance-delete saga can be started + // until this occurs. + // + // If the instance actually *has* stopped or failed before a + // successful migration out, this is fine, because an + // instance-update saga will come along and remove the active VMM + // and migration IDs. + // + (InstanceState::Vmm, Some(_)) + if instance.runtime_state.migration_id.is_some() => + { + external::InstanceState::Migrating + } + // - An instance with a "stopped" or "destroyed" VMM needs to be + // recast as a "stopping" instance, as the virtual provisioning + // resources for that instance have not been deallocated until the + // active VMM ID has been unlinked by an update saga. + ( + InstanceState::Vmm, + Some(VmmState::Stopped | VmmState::Destroyed), + ) => external::InstanceState::Stopping, + // - An instance with a "saga unwound" VMM, on the other hand, can + // be treated as "stopped", since --- unlike "destroyed" --- a new + // start saga can run at any time by just clearing out the old VMM + // ID. 
+ (InstanceState::Vmm, Some(VmmState::SagaUnwound)) => { + external::InstanceState::Stopped + } + // - An instance with no VMM is always "stopped" (as long as it's + // not "starting" etc.) + (InstanceState::NoVmm, _vmm_state) => { + debug_assert_eq!(_vmm_state, None); + external::InstanceState::Stopped + } + // If there's a VMM state, and none of the above rules apply, use + // that. + (_instance_state, Some(vmm_state)) => { + debug_assert_eq!(_instance_state, InstanceState::Vmm); + vmm_state.into() + } + // If there's no VMM state, use the instance's state. + (instance_state, None) => instance_state.into(), } } } @@ -93,18 +180,13 @@ impl From<(Instance, Option)> for InstanceAndActiveVmm { } } -impl From for omicron_common::api::external::Instance { +impl From for external::Instance { fn from(value: InstanceAndActiveVmm) -> Self { - let run_state: omicron_common::api::external::InstanceState; - let time_run_state_updated: chrono::DateTime; - (run_state, time_run_state_updated) = if let Some(vmm) = value.vmm { - (vmm.runtime.state.into(), vmm.runtime.time_state_updated) - } else { - ( - value.instance.runtime_state.nexus_state.into(), - value.instance.runtime_state.time_updated, - ) - }; + let time_run_state_updated = value + .vmm + .as_ref() + .map(|vmm| vmm.runtime.time_state_updated) + .unwrap_or(value.instance.runtime_state.time_updated); Self { identity: value.instance.identity(), @@ -116,21 +198,21 @@ impl From for omicron_common::api::external::Instance { .hostname .parse() .expect("found invalid hostname in the database"), - runtime: omicron_common::api::external::InstanceRuntimeState { - run_state, + runtime: external::InstanceRuntimeState { + run_state: value.effective_state(), time_run_state_updated, }, } } } -/// A complete snapshot of the database records describing the current state of +/// The totality of database records describing the current state of /// an instance: the [`Instance`] record itself, along with its active [`Vmm`], /// target 
[`Vmm`], and current [`Migration`], if they exist. /// /// This is returned by [`DataStore::instance_fetch_all`]. #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] -pub struct InstanceSnapshot { +pub struct InstanceGestalt { /// The instance record. pub instance: Instance, /// The [`Vmm`] record pointed to by the instance's `active_propolis_id`, if @@ -152,12 +234,14 @@ pub struct InstanceSnapshot { /// when the lock is released. #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct UpdaterLock { - saga_lock_id: Uuid, + pub updater_id: Uuid, locked_gen: Generation, } /// Errors returned by [`DataStore::instance_updater_lock`]. -#[derive(Debug, thiserror::Error, PartialEq)] +#[derive( + Debug, thiserror::Error, PartialEq, serde::Serialize, serde::Deserialize, +)] pub enum UpdaterLockError { /// The instance was already locked by another saga. #[error("instance already locked by another saga")] @@ -167,25 +251,6 @@ pub enum UpdaterLockError { Query(#[from] Error), } -/// The result of an [`DataStore::instance_and_vmm_update_runtime`] call, -/// indicating which records were updated. -#[derive(Copy, Clone, Debug)] -pub struct InstanceUpdateResult { - /// `true` if the instance record was updated, `false` otherwise. - pub instance_updated: bool, - /// `true` if the VMM record was updated, `false` otherwise. - pub vmm_updated: bool, - /// Indicates whether a migration record for this instance was updated, if a - /// [`MigrationRuntimeState`] was provided to - /// [`DataStore::instance_and_vmm_update_runtime`]. 
- /// - /// - `Some(true)` if a migration record was updated - /// - `Some(false)` if a [`MigrationRuntimeState`] was provided, but the - /// migration record was not updated - /// - `None` if no [`MigrationRuntimeState`] was provided - pub migration_updated: Option, -} - impl DataStore { /// Idempotently insert a database record for an Instance /// @@ -295,6 +360,74 @@ impl DataStore { .collect()) } + /// List all instances with active VMMs in the `Destroyed` state that don't + /// have currently-running instance-updater sagas. + /// + /// This is used by the `instance_updater` background task to ensure that + /// update sagas are scheduled for these instances. + pub async fn find_instances_with_destroyed_active_vmms( + &self, + opctx: &OpContext, + ) -> ListResultVec { + use db::model::VmmState; + use db::schema::instance::dsl; + use db::schema::vmm::dsl as vmm_dsl; + + vmm_dsl::vmm + .filter(vmm_dsl::state.eq(VmmState::Destroyed)) + // If the VMM record has already been deleted, we don't need to do + // anything about it --- someone already has. + .filter(vmm_dsl::time_deleted.is_null()) + .inner_join( + dsl::instance.on(dsl::active_propolis_id + .eq(vmm_dsl::id.nullable()) + .and(dsl::time_deleted.is_null()) + .and(dsl::updater_id.is_null())), + ) + .select(Instance::as_select()) + .load_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// List all instances with active migrations that have terminated (either + /// completed or failed) and don't have currently-running instance-updater + /// sagas. + /// + /// This is used by the `instance_updater` background task to ensure that + /// update sagas are scheduled for these instances. 
+ pub async fn find_instances_with_terminated_active_migrations( + &self, + opctx: &OpContext, + ) -> ListResultVec { + use db::model::MigrationState; + use db::schema::instance::dsl; + use db::schema::migration::dsl as migration_dsl; + + dsl::instance + .filter(dsl::time_deleted.is_null()) + .filter(dsl::migration_id.is_not_null()) + .filter(dsl::updater_id.is_null()) + .inner_join( + migration_dsl::migration.on(dsl::migration_id + .eq(migration_dsl::id.nullable()) + .and( + migration_dsl::target_state + .eq_any(MigrationState::TERMINAL_STATES) + .or(migration_dsl::source_state + .eq_any(MigrationState::TERMINAL_STATES)), + )), + ) + .select(Instance::as_select()) + .load_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Fetches information about an Instance that the caller has previously /// fetched /// @@ -359,7 +492,7 @@ impl DataStore { /// instance in a single atomic query. /// /// If an instance with the provided UUID exists, this method returns an - /// [`InstanceSnapshot`], which contains the following: + /// [`InstanceGestalt`], which contains the following: /// /// - The [`Instance`] record itself, /// - The instance's active [`Vmm`] record, if the `active_propolis_id` @@ -372,7 +505,7 @@ impl DataStore { &self, opctx: &OpContext, authz_instance: &authz::Instance, - ) -> LookupResult { + ) -> LookupResult { opctx.authorize(authz::Action::Read, authz_instance).await?; use db::schema::instance::dsl as instance_dsl; @@ -438,7 +571,7 @@ impl DataStore { ) })?; - Ok(InstanceSnapshot { instance, migration, active_vmm, target_vmm }) + Ok(InstanceGestalt { instance, migration, active_vmm, target_vmm }) } // TODO-design It's tempting to return the updated state of the Instance @@ -484,83 +617,180 @@ impl DataStore { Ok(updated) } - /// Updates an instance record and a VMM record with a single database - /// command. 
+ /// Updates an instance record by setting the instance's migration ID to the + /// provided `migration_id` and the target VMM ID to the provided + /// `target_propolis_id`, if the instance does not currently have an active + /// migration, and the active VMM is in the [`VmmState::Running`] or + /// [`VmmState::Rebooting`] states. /// - /// This is intended to be used to apply updates from sled agent that - /// may change a VMM's runtime state (e.g. moving an instance from Running - /// to Stopped) and its corresponding instance's state (e.g. changing the - /// active Propolis ID to reflect a completed migration) in a single - /// transaction. The caller is responsible for ensuring the instance and - /// VMM states are consistent with each other before calling this routine. - /// - /// # Arguments - /// - /// - instance_id: The ID of the instance to update. - /// - new_instance: The new instance runtime state to try to write. - /// - vmm_id: The ID of the VMM to update. - /// - new_vmm: The new VMM runtime state to try to write. - /// - /// # Return value - /// - /// - `Ok(`[`InstanceUpdateResult`]`)` if the query was issued - /// successfully. The returned [`InstanceUpdateResult`] indicates which - /// database record(s) were updated. Note that an update can fail because - /// it was inapplicable (i.e. the database has state with a newer - /// generation already) or because the relevant record was not found. - /// - `Err` if another error occurred while accessing the database. - pub async fn instance_and_vmm_update_runtime( + /// Note that a non-NULL `target_propolis_id` will be overwritten, if (and + /// only if) the target VMM record is in [`VmmState::SagaUnwound`], + /// indicating that it was left behind by a failed `instance-migrate` saga + /// unwinding. 
+ pub async fn instance_set_migration_ids( &self, - instance_id: &InstanceUuid, - new_instance: &InstanceRuntimeState, - vmm_id: &PropolisUuid, - new_vmm: &VmmRuntimeState, - migration: &Option, - ) -> Result { - let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( - *instance_id, - new_instance.clone(), - *vmm_id, - new_vmm.clone(), - migration.clone(), - ); + opctx: &OpContext, + instance_id: InstanceUuid, + src_propolis_id: PropolisUuid, + migration_id: Uuid, + target_propolis_id: PropolisUuid, + ) -> Result { + use db::schema::instance::dsl; + use db::schema::migration::dsl as migration_dsl; + use db::schema::vmm::dsl as vmm_dsl; - // The InstanceAndVmmUpdate query handles and indicates failure to find - // either the instance or the VMM, so a query failure here indicates - // some kind of internal error and not a failed lookup. - let result = query - .execute_and_check(&*self.pool_connection_unauthorized().await?) + // Only allow migrating out if the active VMM is running or rebooting. + const ALLOWED_ACTIVE_VMM_STATES: &[VmmState] = + &[VmmState::Running, VmmState::Rebooting]; + + let instance_id = instance_id.into_untyped_uuid(); + let target_propolis_id = target_propolis_id.into_untyped_uuid(); + let src_propolis_id = src_propolis_id.into_untyped_uuid(); + + // Subquery for determining whether the active VMM is in a state where + // it can be migrated out of. This returns the VMM row's instance ID, so + // that we can use it in a `filter` on the update query. + let vmm_ok = vmm_dsl::vmm + .filter(vmm_dsl::id.eq(src_propolis_id)) + .filter(vmm_dsl::time_deleted.is_null()) + .filter(vmm_dsl::state.eq_any(ALLOWED_ACTIVE_VMM_STATES)) + .select(vmm_dsl::instance_id); + // Subquery for checking if a present target VMM ID points at a VMM + // that's in the saga-unwound state (in which it would be okay to clear + // out that VMM). 
+ let target_vmm_unwound = vmm_dsl::vmm + .filter(vmm_dsl::id.nullable().eq(dsl::target_propolis_id)) + // Don't filter out target VMMs with `time_deleted` set here --- we + // *shouldn't* have deleted the VMM without unlinking it from the + // instance record, but if something did, we should still allow the + // ID to be clobbered. + .filter(vmm_dsl::state.eq(VmmState::SagaUnwound)) + .select(vmm_dsl::instance_id); + // Subquery for checking if an already present migration ID points at a + // migration where both the source- and target-sides are marked as + // failed. If both are failed, *and* the target VMM is `SagaUnwound` as + // determined by the query above, then it's okay to clobber that + // migration, as it was left behind by a previous migrate saga unwinding. + let current_migration_failed = migration_dsl::migration + .filter(migration_dsl::id.nullable().eq(dsl::migration_id)) + .filter(migration_dsl::target_state.eq(MigrationState::FAILED)) + .filter(migration_dsl::source_state.eq(MigrationState::FAILED)) + .select(migration_dsl::instance_id); + + diesel::update(dsl::instance) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(instance_id)) + .filter( + // Update the row if and only if one of the following is true: + // + // - The migration and target VMM IDs are not present + (dsl::migration_id + .is_null() + .and(dsl::target_propolis_id.is_null())) + // - The migration and target VMM IDs are set to the values + // we are trying to set. + // + // This way, we can use a `RETURNING` clause to fetch the + // current state after the update, rather than + // `check_if_exists` which returns the prior state, and still + // fail to update the record if another migration/target VMM + // ID is already there. 
+ .or(dsl::migration_id + .eq(Some(migration_id)) + .and(dsl::target_propolis_id.eq(Some(target_propolis_id)))) + // - The migration and target VMM IDs are set to another + // migration, but the target VMM state is `SagaUnwound` and + // the migration is `Failed` on both sides. + // + // This would indicate that the migration/VMM IDs are left + // behind by another migrate saga failing, and are okay to get + // rid of. + .or( + // Note that both of these queries return the instance ID + // from the VMM and migration records, so we check if one was + // found by comparing it to the actual instance ID. + dsl::id + .eq_any(target_vmm_unwound) + .and(dsl::id.eq_any(current_migration_failed)), + ), + ) + .filter(dsl::active_propolis_id.eq(src_propolis_id)) + .filter(dsl::id.eq_any(vmm_ok)) + .set(( + dsl::migration_id.eq(Some(migration_id)), + dsl::target_propolis_id.eq(Some(target_propolis_id)), + // advance the generation + dsl::state_generation.eq(dsl::state_generation + 1), + dsl::time_state_updated.eq(Utc::now()), + )) + .returning(Instance::as_returning()) + .get_result_async::( + &*self.pool_connection_authorized(opctx).await?, + ) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - let instance_updated = match result.instance_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }; + .map_err(|error| Error::Conflict { + message: MessagePair::new_full( + "another migration is already in progress".to_string(), + format!( + "cannot set migration ID {migration_id} for instance \ + {instance_id} (perhaps another migration ID is \ + already present): {error:#}" + ), + ), + }) + } - let vmm_updated = match result.vmm_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }; + /// Unsets the migration IDs set by + /// [`DataStore::instance_set_migration_ids`]. 
+ /// + /// This method will only unset the instance's migration IDs if they match + /// the provided ones. + /// # Returns + /// + /// - `Ok(true)` if the migration IDs were unset, + /// - `Ok(false)` if the migration IDs have *already* been unset (this method + /// is idempotent) + /// - `Err` if the database query returned an error. + pub async fn instance_unset_migration_ids( + &self, + opctx: &OpContext, + instance_id: InstanceUuid, + migration_id: Uuid, + target_propolis_id: PropolisUuid, + ) -> Result { + use db::schema::instance::dsl; - let migration_updated = if migration.is_some() { - Some(match result.migration_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, + let instance_id = instance_id.into_untyped_uuid(); + let target_propolis_id = target_propolis_id.into_untyped_uuid(); + let updated = diesel::update(dsl::instance) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(instance_id)) + .filter(dsl::migration_id.eq(migration_id)) + .filter(dsl::target_propolis_id.eq(target_propolis_id)) + .set(( + dsl::migration_id.eq(None::), + dsl::target_propolis_id.eq(None::), + // advance the generation + dsl::state_generation.eq(dsl::state_generation + 1), + dsl::time_state_updated.eq(Utc::now()), + )) + .check_if_exists::(instance_id.into_untyped_uuid()) + .execute_and_check(&*self.pool_connection_authorized(&opctx).await?) 
+ .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, }) - } else { - debug_assert_eq!(result.migration_status, None); - None - }; - - Ok(InstanceUpdateResult { - instance_updated, - vmm_updated, - migration_updated, - }) + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Instance, + LookupType::ById(instance_id), + ), + ) + })?; + Ok(updated) } /// Lists all instances on in-service sleds with active Propolis VMM @@ -706,23 +936,28 @@ impl DataStore { } /// Attempts to lock an instance's record to apply state updates in an - /// instance-update saga, returning the state of the instance when the lock - /// was acquired. + /// instance-update saga, returning an [`UpdaterLock`] if the lock is + /// successfully acquired. /// /// # Notes /// /// This method MUST only be called from the context of a saga! The /// calling saga must ensure that the reverse action for the action that /// acquires the lock must call [`DataStore::instance_updater_unlock`] to - /// ensure that the lock is always released if the saga unwinds. + /// ensure that the lock is always released if the saga unwinds. If the saga + /// locking the instance completes successfully, it must release the lock + /// using [`DataStore::instance_updater_unlock`], or use + /// [`DataStore::instance_commit_update`] to release the lock and write back + /// a new [`InstanceRuntimeState`] in a single atomic query. /// /// This method is idempotent: if the instance is already locked by the same /// saga, it will succeed, as though the lock was acquired. /// /// # Arguments /// - /// - `authz_instance`: the instance to attempt to lock to lock - /// - `saga_lock_id`: the UUID of the saga that's attempting to lock this + /// - `opctx`: the [`OpContext`] for this operation. + /// - `authz_instance`: the instance to attempt to lock. 
+ /// - `updater_id`: the UUID of the saga that's attempting to lock this /// instance. /// /// # Returns @@ -737,7 +972,7 @@ impl DataStore { &self, opctx: &OpContext, authz_instance: &authz::Instance, - saga_lock_id: Uuid, + updater_id: Uuid, ) -> Result { use db::schema::instance::dsl; @@ -758,22 +993,21 @@ impl DataStore { // *same* instance at the same time. So, idempotency is probably more // important than handling that extremely unlikely edge case. let mut did_lock = false; + let mut locked_gen = instance.updater_gen; loop { match instance.updater_id { // If the `updater_id` field is not null and the ID equals this // saga's ID, we already have the lock. We're done here! - Some(lock_id) if lock_id == saga_lock_id => { - slog::info!( + Some(lock_id) if lock_id == updater_id => { + slog::debug!( &opctx.log, "instance updater lock acquired!"; "instance_id" => %instance_id, - "saga_id" => %saga_lock_id, + "updater_id" => %updater_id, + "locked_gen" => ?locked_gen, "already_locked" => !did_lock, ); - return Ok(UpdaterLock { - saga_lock_id, - locked_gen: instance.updater_gen, - }); + return Ok(UpdaterLock { updater_id, locked_gen }); } // The `updater_id` field is set, but it's not our ID. The instance // is locked by a different saga, so give up. @@ -783,7 +1017,7 @@ impl DataStore { "instance is locked by another saga"; "instance_id" => %instance_id, "locked_by" => %lock_id, - "saga_id" => %saga_lock_id, + "updater_id" => %updater_id, ); return Err(UpdaterLockError::AlreadyLocked); } @@ -794,11 +1028,12 @@ impl DataStore { // Okay, now attempt to acquire the lock let current_gen = instance.updater_gen; + locked_gen = Generation(current_gen.0.next()); slog::debug!( &opctx.log, "attempting to acquire instance updater lock"; "instance_id" => %instance_id, - "saga_id" => %saga_lock_id, + "updater_id" => %updater_id, "current_gen" => ?current_gen, ); @@ -816,8 +1051,8 @@ impl DataStore { // of a non-distributed, single-process mutex. 
.filter(dsl::updater_gen.eq(current_gen)) .set(( - dsl::updater_gen.eq(dsl::updater_gen + 1), - dsl::updater_id.eq(Some(saga_lock_id)), + dsl::updater_gen.eq(locked_gen), + dsl::updater_id.eq(Some(updater_id)), )) .check_if_exists::(instance_id) .execute_and_check( @@ -846,11 +1081,290 @@ impl DataStore { } } - /// Release the instance-updater lock acquired by - /// [`DataStore::instance_updater_lock`]. + /// Attempts to "inherit" the lock acquired by + /// [`DataStore::instance_updater_lock`] by setting a new `child_lock_id` as + /// the current updater, if (and only if) the lock is held by the provided + /// `parent_lock`. + /// + /// This essentially performs the equivalent of a [compare-exchange] + /// operation on the instance record's lock ID field, which succeeds if the + /// current lock ID matches the parent. Using this method ensures that, if a + /// parent saga starts multiple child sagas, only one of them can + /// successfully acquire the lock. + /// + /// # Notes + /// + /// This method MUST only be called from the context of a saga! The + /// calling saga must ensure that the reverse action for the action that + /// acquires the lock must call [`DataStore::instance_updater_unlock`] to + /// ensure that the lock is always released if the saga unwinds. If the saga + /// locking the instance completes successfully, it must release the lock + /// using [`DataStore::instance_updater_unlock`], or use + /// [`DataStore::instance_commit_update`] to release the lock and write back + /// a new [`InstanceRuntimeState`] in a single atomic query. + + /// + /// This method is idempotent: if the instance is already locked by the same + /// saga, it will succeed, as though the lock was acquired. + /// + /// # Arguments + /// + /// - `opctx`: the [`OpContext`] for this operation. + /// - `authz_instance`: the instance to attempt to inherit the lock on. + /// - `parent_lock`: the [`UpdaterLock`] to attempt to inherit the lock + /// from. 
If the current updater UUID and generation matches this, the + /// lock can be inherited by `child_id`. + /// - `child_lock_id`: the UUID of the saga that's attempting to lock this + /// instance. + /// + /// # Returns + /// + /// - [`Ok`]`(`[`UpdaterLock`]`)` if the lock was successfully inherited. + /// - [`Err`]`([`UpdaterLockError::AlreadyLocked`])` if the instance was + /// locked by a different saga, other than the provided `parent_lock`. + /// - [`Err`]`([`UpdaterLockError::Query`]`(...))` if the query to fetch + /// the instance or lock it returned another error (such as if the + /// instance no longer exists, or if the database connection failed). + pub async fn instance_updater_inherit_lock( + &self, + opctx: &OpContext, + authz_instance: &authz::Instance, + parent_lock: UpdaterLock, + child_lock_id: Uuid, + ) -> Result { + use db::schema::instance::dsl; + let UpdaterLock { updater_id: parent_id, locked_gen } = parent_lock; + let instance_id = authz_instance.id(); + let new_gen = Generation(locked_gen.0.next()); + + let result = diesel::update(dsl::instance) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(instance_id)) + .filter(dsl::updater_gen.eq(locked_gen)) + .filter(dsl::updater_id.eq(parent_id)) + .set(( + dsl::updater_gen.eq(new_gen), + dsl::updater_id.eq(Some(child_lock_id)), + )) + .check_if_exists::(instance_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Instance, + LookupType::ById(instance_id), + ), + ) + })?; + + match result { + // If we updated the record, the lock has been successfully + // inherited! Return the new `UpdaterLock` to indicate that we have + // acquired the lock successfully. + UpdateAndQueryResult { status: UpdateStatus::Updated, .. 
} => { + slog::debug!( + &opctx.log, + "inherited lock from {parent_id} to {child_lock_id}"; + "instance_id" => %instance_id, + "updater_id" => %child_lock_id, + "locked_gen" => ?new_gen, + "parent_id" => %parent_id, + "parent_gen" => ?locked_gen, + ); + Ok(UpdaterLock { + updater_id: child_lock_id, + locked_gen: new_gen, + }) + } + // The generation has advanced past the generation at which the + // lock was held. This means that we have already inherited the + // lock. Return the same `UpdaterLock` here for idempotency. + UpdateAndQueryResult { + status: UpdateStatus::NotUpdatedButExists, + ref found, + } if found.updater_id == Some(child_lock_id) => { + slog::debug!( + &opctx.log, + "previously inherited lock from {parent_id} to \ + {child_lock_id}"; + "instance_id" => %instance_id, + "updater_id" => %child_lock_id, + "locked_gen" => ?found.updater_gen, + "parent_id" => %parent_id, + "parent_gen" => ?locked_gen, + ); + debug_assert_eq!(found.updater_gen, new_gen); + Ok(UpdaterLock { + updater_id: child_lock_id, + locked_gen: new_gen, + }) + } + // The instance exists, but it's locked by a different saga than the + // parent we were trying to inherit the lock from. We cannot acquire + // the lock at this time. + UpdateAndQueryResult { ref found, .. } => { + slog::debug!( + &opctx.log, + "cannot inherit instance-updater lock from {parent_id} to \ + {child_lock_id}: this instance is not locked by the \ + expected parent saga"; + "instance_id" => %instance_id, + "updater_id" => %child_lock_id, + "parent_id" => %parent_id, + "actual_lock_id" => ?found.updater_id, + ); + Err(UpdaterLockError::AlreadyLocked) + } + } + } + + /// Release the instance-updater lock on this instance, if (and only if) the + /// lock is currently held by the saga represented by the provided + /// [`UpdaterLock`] token. 
+ pub async fn instance_updater_unlock( + &self, + opctx: &OpContext, + authz_instance: &authz::Instance, + lock: &UpdaterLock, + ) -> Result { + use db::schema::instance::dsl; + + let instance_id = authz_instance.id(); + let UpdaterLock { updater_id, locked_gen } = *lock; + + let result = diesel::update(dsl::instance) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(instance_id)) + // Only unlock the instance if: + // - the provided updater ID matches that of the saga that has + // currently locked this instance. + .filter(dsl::updater_id.eq(Some(updater_id))) + // - the provided updater generation matches the current updater + // generation. + .filter(dsl::updater_gen.eq(locked_gen)) + .set(( + dsl::updater_gen.eq(Generation(locked_gen.0.next())), + dsl::updater_id.eq(None::), + )) + .check_if_exists::(instance_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Instance, + LookupType::ById(instance_id), + ), + ) + })?; + + match result { + // If we updated the record, the lock has been released! Return + // `Ok(true)` to indicate that we released the lock successfully. + UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { + return Ok(true); + } + + // The instance exists, but we didn't unlock it. In almost all + // cases, that's actually *fine*, since this suggests we didn't + // actually have the lock to release, so we don't need to worry + // about unlocking the instance. However, depending on the + // particular reason we didn't actually unlock the instance, this + // may be more or less likely to indicate a bug. Remember that saga + // actions --- even unwind actions --- must be idempotent, so we + // *may* just be trying to unlock an instance we already + // successfully unlocked, which is fine. + UpdateAndQueryResult { ref found, .. 
} + if found.time_deleted().is_some() => + { + debug!( + &opctx.log, + "attempted to unlock an instance that has been deleted"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "time_deleted" => ?found.time_deleted(), + ); + return Ok(false); + } + + // If the instance is no longer locked by this saga, that's probably fine. + // We don't need to unlock it. + UpdateAndQueryResult { ref found, .. } + if found.updater_id != Some(updater_id) => + { + if found.updater_gen > locked_gen { + // The generation has advanced past the generation where we + // acquired the lock. That's totally fine: a previous + // execution of the same saga action must have unlocked it, + // and now it is either unlocked, or locked by a different + // saga. + debug!( + &opctx.log, + "attempted to unlock an instance that is no longer \ + locked by this saga"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => ?found.updater_id.as_ref(), + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + } else { + // On the other hand, if the generation is less than or + // equal to the generation at which we locked the instance, + // that seems kinda suspicious --- perhaps we believed we + // held the lock, but didn't actually, which could be + // programmer error. + // + // However, this *could* conceivably happen: the same saga + // node could have executed previously and released the + // lock, and then the generation counter advanced enough + // times to wrap around, and then the same action tried to + // release its lock again. 64-bit generation counters + // overflowing in an instance's lifetime seems unlikely, but + // nothing is impossible... + warn!( + &opctx.log, + "attempted to release a lock held by another saga \ + at the same generation! 
this seems suspicious..."; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => ?found.updater_id.as_ref(), + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + } + + Ok(false) + } + + // If we *are* still holding the lock, we must be trying to + // release it at the wrong generation. That seems quite + // suspicious. + UpdateAndQueryResult { ref found, .. } => { + warn!( + &opctx.log, + "attempted to release a lock at the wrong generation"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + Err(Error::internal_error( + "instance is locked by this saga, but at a different \ + generation", + )) + } + } + } + + /// Write the provided `new_runtime_state` for this instance, and release + /// the provided `lock`. /// /// This method will unlock the instance if (and only if) the lock is - /// currently held by the provided `saga_lock_id`. If the lock is held by a + /// currently held by the provided `updater_id`. If the lock is held by a /// different saga UUID, the instance will remain locked. If the instance /// has already been unlocked, this method will return `false`. /// @@ -859,15 +1373,20 @@ impl DataStore { /// - `authz_instance`: the instance to attempt to unlock /// - `updater_lock`: an [`UpdaterLock`] token representing the acquired /// lock to release. - pub async fn instance_updater_unlock( + /// - `new_runtime`: an [`InstanceRuntimeState`] to write + /// back to the database when the lock is released. If this is [`None`], + /// the instance's runtime state will not be modified. 
+ pub async fn instance_commit_update( &self, opctx: &OpContext, authz_instance: &authz::Instance, - UpdaterLock { saga_lock_id, locked_gen }: UpdaterLock, + lock: &UpdaterLock, + new_runtime: &InstanceRuntimeState, ) -> Result { use db::schema::instance::dsl; let instance_id = authz_instance.id(); + let UpdaterLock { updater_id, locked_gen } = *lock; let result = diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) @@ -875,13 +1394,15 @@ impl DataStore { // Only unlock the instance if: // - the provided updater ID matches that of the saga that has // currently locked this instance. - .filter(dsl::updater_id.eq(Some(saga_lock_id))) + .filter(dsl::updater_id.eq(Some(updater_id))) // - the provided updater generation matches the current updater // generation. .filter(dsl::updater_gen.eq(locked_gen)) + .filter(dsl::state_generation.lt(new_runtime.r#gen)) .set(( dsl::updater_gen.eq(Generation(locked_gen.0.next())), dsl::updater_id.eq(None::), + new_runtime.clone(), )) .check_if_exists::(instance_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) @@ -896,49 +1417,127 @@ impl DataStore { ) })?; + // The expected state generation number of the instance record *before* + // applying the update. + let prev_state_gen = u64::from(new_runtime.r#gen.0).saturating_sub(1); match result { // If we updated the record, the lock has been released! Return // `Ok(true)` to indicate that we released the lock successfully. UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { Ok(true) } - // The generation has advanced past the generation at which the - // lock was held. This means that we have already released the - // lock. Return `Ok(false)` here for idempotency. - UpdateAndQueryResult { - status: UpdateStatus::NotUpdatedButExists, - ref found, - } if found.updater_gen > locked_gen => Ok(false), - // The instance exists, but the lock ID doesn't match our lock ID. 
- // This means we were trying to release a lock we never held, whcih - // is almost certainly a programmer error. - UpdateAndQueryResult { ref found, .. } => { - match found.updater_id { - Some(lock_holder) => { - debug_assert_ne!(lock_holder, saga_lock_id); - Err(Error::internal_error( - "attempted to release a lock held by another saga! this is a bug!", - )) - }, - None => Err(Error::internal_error( - "attempted to release a lock on an instance that is not locked! this is a bug!", - )), - } + + // The instance has been marked as deleted, so no updates were + // committed! + UpdateAndQueryResult { ref found, .. } + if found.time_deleted().is_some() => + { + warn!( + &opctx.log, + "cannot commit instance update, as the instance no longer \ + exists"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "time_deleted" => ?found.time_deleted() + ); + + Err(LookupType::ById(instance_id) + .into_not_found(ResourceType::Instance)) } - } - } -} -#[cfg(test)] -mod tests { - use super::*; - use crate::db::datastore::test_utils::datastore_test; - use crate::db::lookup::LookupPath; + // The instance exists, but both the lock generation *and* the state + // generation no longer matches ours. That's fine --- presumably, + // another execution of the same saga action has already updated the + // instance record. + UpdateAndQueryResult { ref found, .. 
} + if u64::from(found.runtime().r#gen.0) != prev_state_gen + && found.updater_gen != locked_gen => + { + debug_assert_ne!(found.updater_id, Some(updater_id)); + debug!( + &opctx.log, + "cannot commit instance updates, as the state generation \ + and lock generation have advanced: the required updates \ + have probably already been committed."; + "instance_id" => %instance_id, + "expected_state_gen" => ?new_runtime.r#gen, + "actual_state_gen" => ?found.runtime().r#gen, + "updater_id" => %updater_id, + "updater_gen" => ?locked_gen, + "actual_updater_gen" => ?found.updater_gen, + ); + Ok(false) + } + + // The state generation has advanced, but the instance is *still* + // locked by this saga. That's bad --- this update saga may no + // longer update the instance, as its state has changed, potentially + // invalidating the updates. We need to unwind. + UpdateAndQueryResult { ref found, .. } + if u64::from(found.runtime().r#gen.0) != prev_state_gen + && found.updater_gen == locked_gen + && found.updater_id == Some(updater_id) => + { + info!( + &opctx.log, + "cannot commit instance update, as the state generation \ + has advanced, potentially invalidating the update"; + "instance_id" => %instance_id, + "expected_state_gen" => ?new_runtime.r#gen, + "actual_state_gen" => ?found.runtime().r#gen, + ); + Err(Error::conflict("instance state has changed")) + } + + // The instance exists, but we could not update it because the lock + // did not match. + UpdateAndQueryResult { ref found, .. 
} => match found.updater_id { + Some(actual_id) => { + const MSG: &'static str = + "cannot commit instance updates: the instance is \ + locked by another saga!"; + error!( + &opctx.log, + "{MSG}"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => %actual_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + Err(Error::internal_error(MSG)) + } + None => { + const MSG: &'static str = + "cannot commit instance updates: the instance is \ + not locked"; + error!( + &opctx.log, + "{MSG}"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + Err(Error::internal_error(MSG)) + } + }, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::datastore::test_utils::datastore_test; + use crate::db::lookup::LookupPath; use nexus_db_model::InstanceState; use nexus_db_model::Project; + use nexus_db_model::VmmRuntimeState; use nexus_db_model::VmmState; use nexus_test_utils::db::test_setup_database; use nexus_types::external_api::params; + use omicron_common::api::external; use omicron_common::api::external::ByteCount; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_test_utils::dev; @@ -1025,7 +1624,7 @@ mod tests { stringify!($id) )); assert_eq!( - lock.saga_lock_id, + lock.updater_id, $id, "instance's `updater_id` must be set to {}", stringify!($id), @@ -1055,7 +1654,7 @@ mod tests { // unlock the instance from saga 1 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1) + .instance_updater_unlock(&opctx, &authz_instance, &lock1) .await .expect("instance must be unlocked by saga 1"); assert!(unlocked, "instance must actually be unlocked"); @@ -1068,7 +1667,7 @@ mod tests { // unlock the instance from saga 2 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, lock2) + .instance_updater_unlock(&opctx, &authz_instance, &lock2) .await 
.expect("instance must be unlocked by saga 2"); assert!(unlocked, "instance must actually be unlocked"); @@ -1095,7 +1694,7 @@ mod tests { .await ) .expect("instance should be locked"); - assert_eq!(lock1.saga_lock_id, saga1); + assert_eq!(lock1.updater_id, saga1); // doing it again should be fine. let lock2 = dbg!( @@ -1106,7 +1705,7 @@ mod tests { .expect( "instance_updater_lock should succeed again with the same saga ID", ); - assert_eq!(lock2.saga_lock_id, saga1); + assert_eq!(lock2.updater_id, saga1); // the generation should not have changed as a result of the second // update. assert_eq!(lock1.locked_gen, lock2.locked_gen); @@ -1114,7 +1713,7 @@ mod tests { // now, unlock the instance. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1) + .instance_updater_unlock(&opctx, &authz_instance, &lock1) .await ) .expect("instance should unlock"); @@ -1123,7 +1722,7 @@ mod tests { // unlocking it again should also succeed... let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock2) + .instance_updater_unlock(&opctx, &authz_instance, &lock2,) .await ) .expect("instance should unlock again"); @@ -1136,10 +1735,10 @@ mod tests { } #[tokio::test] - async fn test_instance_updater_unlocking_someone_elses_instance_errors() { + async fn test_instance_updater_cant_unlock_someone_elses_instance_() { // Setup let logctx = dev::test_setup_log( - "test_instance_updater_unlocking_someone_elses_instance_errors", + "test_instance_updater_cant_unlock_someone_elses_instance_", ); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; @@ -1155,8 +1754,8 @@ mod tests { ) .expect("instance should be locked"); - // attempting to unlock with a different saga ID should be an error. - let err = dbg!( + // attempting to unlock with a different saga ID shouldn't do anything. 
+ let unlocked = dbg!( datastore .instance_updater_unlock( &opctx, @@ -1166,37 +1765,42 @@ mod tests { // what we're doing here. But this simulates a case where // an incorrect one is constructed, or a raw database query // attempts an invalid unlock operation. - UpdaterLock { - saga_lock_id: saga2, + &UpdaterLock { + updater_id: saga2, locked_gen: lock1.locked_gen, }, ) .await ) - .expect_err( - "unlocking the instance with someone else's ID should fail", - ); - assert_eq!( - err, - Error::internal_error( - "attempted to release a lock held by another saga! \ - this is a bug!", - ), - ); + .unwrap(); + assert!(!unlocked); + + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.updater_id, Some(saga1)); + assert_eq!(instance.updater_gen, lock1.locked_gen); + let next_gen = Generation(lock1.locked_gen.0.next()); // unlocking with the correct ID should succeed. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1) + .instance_updater_unlock(&opctx, &authz_instance, &lock1) .await ) .expect("instance should unlock"); assert!(unlocked, "instance should have unlocked"); + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.updater_id, None); + assert_eq!(instance.updater_gen, next_gen); + // unlocking with the lock holder's ID *again* at a new generation - // (where the lock is no longer held) should fail. - let err = dbg!( + // (where the lock is no longer held) shouldn't do anything + let unlocked = dbg!( datastore .instance_updater_unlock( &opctx, @@ -1204,20 +1808,234 @@ mod tests { // Again, these fields are private specifically to prevent // you from doing this exact thing. But, we should still // test that we handle it gracefully. 
- UpdaterLock { saga_lock_id: saga1, locked_gen: next_gen }, + &UpdaterLock { updater_id: saga1, locked_gen: next_gen }, + ) + .await + ) + .unwrap(); + assert!(!unlocked); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_unlocking_a_deleted_instance_is_okay() { + // Setup + let logctx = + dev::test_setup_log("test_unlocking_a_deleted_instance_is_okay"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + let saga1 = Uuid::new_v4(); + + // put the instance in a state where it will be okay to delete later... + datastore + .instance_update_runtime( + &InstanceUuid::from_untyped_uuid(authz_instance.id()), + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, + ) + .await + .expect("should update state successfully"); + + // lock the instance once. + let lock = dbg!( + datastore + .instance_updater_lock(&opctx, &authz_instance, saga1) + .await + ) + .expect("instance should be locked"); + + // mark the instance as deleted + dbg!(datastore.project_delete_instance(&opctx, &authz_instance).await) + .expect("instance should be deleted"); + + // unlocking should still succeed. + dbg!( + datastore + .instance_updater_unlock(&opctx, &authz_instance, &lock) + .await + ) + .expect("instance should unlock"); + + // Clean up. 
+ db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_instance_commit_update_is_idempotent() { + // Setup + let logctx = + dev::test_setup_log("test_instance_commit_update_is_idempotent"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + let saga1 = Uuid::new_v4(); + + // lock the instance once. + let lock = dbg!( + datastore + .instance_updater_lock(&opctx, &authz_instance, saga1) + .await + ) + .expect("instance should be locked"); + let new_runtime = &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: Some(Uuid::new_v4()), + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::Vmm, + }; + + let updated = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &new_runtime + ) + .await + ) + .expect("instance_commit_update should succeed"); + assert!(updated, "it should be updated"); + + // okay, let's do it again at the same generation. + let updated = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &new_runtime + ) + .await + ) + .expect("instance_commit_update should succeed"); + assert!(!updated, "it was already updated"); + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.runtime().propolis_id, new_runtime.propolis_id); + assert_eq!(instance.runtime().r#gen, new_runtime.r#gen); + + // Doing it again at the same generation with a *different* state + // shouldn't change the instance at all. 
+ let updated = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &InstanceRuntimeState { + propolis_id: Some(Uuid::new_v4()), + migration_id: Some(Uuid::new_v4()), + dst_propolis_id: Some(Uuid::new_v4()), + ..new_runtime.clone() + } + ) + .await + ) + .expect("instance_commit_update should succeed"); + assert!(!updated, "it was already updated"); + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.runtime().propolis_id, new_runtime.propolis_id); + assert_eq!(instance.runtime().dst_propolis_id, None); + assert_eq!(instance.runtime().migration_id, None); + assert_eq!(instance.runtime().r#gen, new_runtime.r#gen); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_instance_update_invalidated_while_locked() { + // Setup + let logctx = dev::test_setup_log( + "test_instance_update_invalidated_while_locked", + ); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + let saga1 = Uuid::new_v4(); + + // Lock the instance + let lock = dbg!( + datastore + .instance_updater_lock(&opctx, &authz_instance, saga1) + .await + ) + .expect("instance should be locked"); + + // Mutate the instance state, invalidating the state when the lock was + // acquired. 
+ let new_runtime = &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: Some(Uuid::new_v4()), + dst_propolis_id: Some(Uuid::new_v4()), + migration_id: Some(Uuid::new_v4()), + nexus_state: InstanceState::Vmm, + }; + let updated = dbg!( + datastore + .instance_update_runtime( + &InstanceUuid::from_untyped_uuid(authz_instance.id()), + &new_runtime + ) + .await + ) + .expect("instance_update_runtime should succeed"); + assert!(updated, "it should be updated"); + + // Okay, now try to commit the result of an update saga. This must fail, + // because the state generation has changed while we had locked the + // instance. + let _err = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, ) .await ) .expect_err( - "unlocking the instance with someone else's ID should fail", + "instance_commit_update should fail if the state generation is \ + stale", ); + + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.runtime().propolis_id, new_runtime.propolis_id); assert_eq!( - err, - Error::internal_error( - "attempted to release a lock on an instance \ - that is not locked! this is a bug!" - ), + instance.runtime().dst_propolis_id, + new_runtime.dst_propolis_id ); + assert_eq!(instance.runtime().migration_id, new_runtime.migration_id); + assert_eq!(instance.runtime().nexus_state, new_runtime.nexus_state); // Clean up. 
db.cleanup().await.unwrap(); @@ -1395,4 +2213,264 @@ mod tests { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_instance_set_migration_ids() { + // Setup + let logctx = dev::test_setup_log("test_instance_set_migration_ids"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + + // Create the first VMM in a state where `set_migration_ids` should + // *fail* (Stopped). We will assert that we cannot set the migration + // IDs, and then advance it to Running, when we can start the migration. + let vmm1 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: authz_instance.id(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.32".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Stopped, + }, + }, + ) + .await + .expect("active VMM should be inserted successfully!"); + + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + let instance = datastore + .instance_refetch(&opctx, &authz_instance) + .await + .expect("instance should be there"); + datastore + .instance_update_runtime( + &instance_id, + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(instance.runtime_state.gen.0.next()), + nexus_state: InstanceState::Vmm, + propolis_id: Some(vmm1.id), + ..instance.runtime_state.clone() + }, + ) + .await + .expect("instance update should work"); + + let vmm2 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: authz_instance.id(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.42".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), 
+ r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("second VMM should insert"); + + // make a migration... + let migration = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm1.id, vmm2.id), + ) + .await + .expect("migration should be inserted successfully!"); + + // Our first attempt to set migration IDs should fail, because the + // active VMM is Stopped. + let res = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration.id, + PropolisUuid::from_untyped_uuid(vmm2.id), + ) + .await + ); + assert!(res.is_err()); + + // Okay, now, advance the active VMM to Running, and try again. + let updated = dbg!( + datastore + .vmm_update_runtime( + &PropolisUuid::from_untyped_uuid(vmm1.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm2.runtime.r#gen.0.next()), + state: VmmState::Running, + }, + ) + .await + ) + .expect("updating VMM state should be fine"); + assert!(updated); + + // Now, it should work! + let instance = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration.id, + PropolisUuid::from_untyped_uuid(vmm2.id), + ) + .await + ) + .expect("setting migration IDs should succeed"); + assert_eq!(instance.runtime().dst_propolis_id, Some(vmm2.id)); + assert_eq!(instance.runtime().migration_id, Some(migration.id)); + + // Doing it again should be idempotent, and the instance record + // shouldn't change. 
+ let instance2 = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration.id, + PropolisUuid::from_untyped_uuid(vmm2.id), + ) + .await + ) + .expect("setting the same migration IDs a second time should succeed"); + assert_eq!( + instance.runtime().dst_propolis_id, + instance2.runtime().dst_propolis_id + ); + assert_eq!( + instance.runtime().migration_id, + instance2.runtime().migration_id + ); + + // Trying to set a new migration should fail, as long as the prior stuff + // is still in place. + let vmm3 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: authz_instance.id(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.42".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("third VMM should insert"); + let migration2 = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm1.id, vmm3.id), + ) + .await + .expect("migration should be inserted successfully!"); + dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration2.id, + PropolisUuid::from_untyped_uuid(vmm3.id), + ) + .await + ) + .expect_err( + "trying to set migration IDs should fail when a previous \ + migration and VMM are still there", + ); + + // Pretend the previous migration saga has unwound the VMM + let updated = dbg!( + datastore + .vmm_update_runtime( + &PropolisUuid::from_untyped_uuid(vmm2.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm2.runtime.r#gen.0.next().next()), + state: VmmState::SagaUnwound, + }, + ) + .await + ) + .expect("updating VMM state should be fine"); + assert!(updated); + + // It should still fail, since the migration is still in progress. 
+ dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration2.id, + PropolisUuid::from_untyped_uuid(vmm3.id), + ) + .await + ) + .expect_err( + "trying to set migration IDs should fail when a previous \ + migration ID is present and not marked as failed", + ); + + // Now, mark the previous migration as Failed. + let updated = dbg!(datastore + .migration_mark_failed(&opctx, migration.id) + .await + .expect( + "we should be able to mark the previous migration as failed" + )); + assert!(updated); + + // If the current migration is failed on both sides *and* the current + // VMM is SagaUnwound, we should be able to clobber them with new IDs. + let instance = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration2.id, + PropolisUuid::from_untyped_uuid(vmm3.id), + ) + .await + ) + .expect("replacing SagaUnwound VMM should work"); + assert_eq!(instance.runtime().migration_id, Some(migration2.id)); + assert_eq!(instance.runtime().dst_propolis_id, Some(vmm3.id)); + + // Clean up. 
+ db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs index 5efe88e83f..128239503c 100644 --- a/nexus/db-queries/src/db/datastore/migration.rs +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -6,12 +6,16 @@ use super::DataStore; use crate::context::OpContext; +use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::model::{Migration, MigrationState}; +use crate::db::model::Generation; +use crate::db::model::Migration; +use crate::db::model::MigrationState; use crate::db::pagination::paginated; use crate::db::schema::migration::dsl; use crate::db::update_and_check::UpdateAndCheck; +use crate::db::update_and_check::UpdateAndQueryResult; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -23,6 +27,7 @@ use omicron_common::api::external::UpdateResult; use omicron_common::api::internal::nexus; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; use uuid::Uuid; impl DataStore { @@ -76,24 +81,24 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// Marks a migration record as deleted if and only if both sides of the - /// migration are in a terminal state. - pub async fn migration_terminate( + /// Marks a migration record as failed. 
+ pub async fn migration_mark_failed( &self, opctx: &OpContext, migration_id: Uuid, ) -> UpdateResult { - const TERMINAL_STATES: &[MigrationState] = &[ - MigrationState(nexus::MigrationState::Completed), - MigrationState(nexus::MigrationState::Failed), - ]; - + let failed = MigrationState(nexus::MigrationState::Failed); diesel::update(dsl::migration) .filter(dsl::id.eq(migration_id)) .filter(dsl::time_deleted.is_null()) - .filter(dsl::source_state.eq_any(TERMINAL_STATES)) - .filter(dsl::target_state.eq_any(TERMINAL_STATES)) - .set(dsl::time_deleted.eq(Utc::now())) + .set(( + dsl::source_state.eq(failed), + dsl::source_gen.eq(dsl::source_gen + 1), + dsl::time_source_updated.eq(Utc::now()), + dsl::target_state.eq(failed), + dsl::target_gen.eq(dsl::target_gen + 1), + dsl::time_target_updated.eq(Utc::now()), + )) .check_if_exists::(migration_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) .await @@ -105,10 +110,6 @@ impl DataStore { } /// Unconditionally mark a migration record as deleted. - /// - /// This is distinct from [`DataStore::migration_terminate`], as it will - /// mark a migration as deleted regardless of the states of the source and - /// target VMMs. 
pub async fn migration_mark_deleted( &self, opctx: &OpContext, @@ -127,6 +128,50 @@ impl DataStore { }) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + pub(crate) async fn migration_update_source_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + vmm_id: &PropolisUuid, + migration: &nexus::MigrationRuntimeState, + ) -> Result, diesel::result::Error> { + let generation = Generation(migration.r#gen); + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration.migration_id)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::source_gen.lt(generation)) + .filter(dsl::source_propolis_id.eq(vmm_id.into_untyped_uuid())) + .set(( + dsl::source_state.eq(MigrationState(migration.state)), + dsl::source_gen.eq(generation), + dsl::time_source_updated.eq(migration.time_updated), + )) + .check_if_exists::(migration.migration_id) + .execute_and_check(conn) + .await + } + + pub(crate) async fn migration_update_target_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + vmm_id: &PropolisUuid, + migration: &nexus::MigrationRuntimeState, + ) -> Result, diesel::result::Error> { + let generation = Generation(migration.r#gen); + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration.migration_id)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::target_gen.lt(generation)) + .filter(dsl::target_propolis_id.eq(vmm_id.into_untyped_uuid())) + .set(( + dsl::target_state.eq(MigrationState(migration.state)), + dsl::target_gen.eq(generation), + dsl::time_target_updated.eq(migration.time_updated), + )) + .check_if_exists::(migration.migration_id) + .execute_and_check(conn) + .await + } } #[cfg(test)] diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 881f0d4aa5..acf80829cf 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -83,6 +83,7 @@ mod rack; mod region; mod region_replacement; mod region_snapshot; +mod 
region_snapshot_replacement; mod role; mod saga; mod silo; @@ -110,7 +111,7 @@ mod zpool; pub use address_lot::AddressLotCreateResult; pub use dns::DataStoreDnsTest; pub use dns::DnsVersionUpdateBuilder; -pub use instance::InstanceAndActiveVmm; +pub use instance::{InstanceAndActiveVmm, InstanceGestalt}; pub use inventory::DataStoreInventoryTest; use nexus_db_model::AllSchemaVersions; pub use rack::RackInit; @@ -122,6 +123,7 @@ pub use sled::SledTransition; pub use sled::TransitionError; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; +pub use vmm::VmmStateUpdateResult; pub use volume::read_only_resources_associated_with_volume; pub use volume::CrucibleResources; pub use volume::CrucibleTargets; diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index c5a8992cd2..1b1ff8a75b 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -162,30 +162,17 @@ impl DataStore { } /// List network interfaces associated with a given service. - pub async fn service_list_network_interfaces( + pub async fn service_list_network_interfaces_on_connection( &self, - opctx: &OpContext, + conn: &async_bb8_diesel::Connection, service_id: Uuid, ) -> ListResultVec { - // See the comment in `service_create_network_interface`. There's no - // obvious parent for a service network interface (as opposed to - // instance network interfaces, which require ListChildren on the - // instance to list). As a logical proxy, we check for listing children - // of the service IP pool. 
- let (authz_service_ip_pool, _) = - self.ip_pools_service_lookup(opctx).await?; - opctx - .authorize(authz::Action::ListChildren, &authz_service_ip_pool) - .await?; - use db::schema::service_network_interface::dsl; dsl::service_network_interface .filter(dsl::time_deleted.is_null()) .filter(dsl::service_id.eq(service_id)) .select(ServiceNetworkInterface::as_select()) - .get_results_async::( - &*self.pool_connection_authorized(opctx).await?, - ) + .get_results_async::(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -450,6 +437,26 @@ impl DataStore { .await .map_err(network_interface::DeleteError::External)?; + let conn = self + .pool_connection_authorized(opctx) + .await + .map_err(network_interface::DeleteError::External)?; + self.service_delete_network_interface_on_connection( + &conn, + service_id, + network_interface_id, + ) + .await + } + + /// Variant of [Self::service_delete_network_interface] which may be called + /// from a transaction context. + pub async fn service_delete_network_interface_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + service_id: Uuid, + network_interface_id: Uuid, + ) -> Result { let query = network_interface::DeleteQuery::new( NetworkInterfaceKind::Service, service_id, @@ -457,12 +464,7 @@ impl DataStore { ); query .clone() - .execute_and_check( - &*self - .pool_connection_authorized(opctx) - .await - .map_err(network_interface::DeleteError::External)?, - ) + .execute_and_check(conn) .await .map_err(|e| network_interface::DeleteError::from_diesel(e, &query)) } diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 3d328a6206..242560a415 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -5,9 +5,11 @@ //! [`DataStore`] methods on [`RegionSnapshot`]s. 
use super::DataStore; +use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; +use crate::db::model::PhysicalDiskPolicy; use crate::db::model::RegionSnapshot; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -71,4 +73,38 @@ impl DataStore { .map(|_rows_deleted| ()) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// Find region snapshots on expunged disks + pub async fn find_region_snapshots_on_expunged_physical_disks( + &self, + opctx: &OpContext, + ) -> LookupResult> { + let conn = self.pool_connection_authorized(opctx).await?; + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::physical_disk::dsl as physical_disk_dsl; + use db::schema::region_snapshot::dsl as region_snapshot_dsl; + use db::schema::zpool::dsl as zpool_dsl; + + region_snapshot_dsl::region_snapshot + .filter(region_snapshot_dsl::dataset_id.eq_any( + dataset_dsl::dataset + .filter(dataset_dsl::time_deleted.is_null()) + .filter(dataset_dsl::pool_id.eq_any( + zpool_dsl::zpool + .filter(zpool_dsl::time_deleted.is_null()) + .filter(zpool_dsl::physical_disk_id.eq_any( + physical_disk_dsl::physical_disk + .filter(physical_disk_dsl::disk_policy.eq(PhysicalDiskPolicy::Expunged)) + .select(physical_disk_dsl::id) + )) + .select(zpool_dsl::id) + )) + .select(dataset_dsl::id) + )) + .select(RegionSnapshot::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs new file mode 100644 index 0000000000..5f99129ecd --- /dev/null +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -0,0 +1,1488 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on [`RegionSnapshotReplacement`] and +//! [`RegionSnapshotReplacementStep`] objects. + +use super::DataStore; +use crate::context::OpContext; +use crate::db; +use crate::db::datastore::SQL_BATCH_SIZE; +use crate::db::error::public_error_from_diesel; +use crate::db::error::ErrorHandler; +use crate::db::lookup::LookupPath; +use crate::db::model::RegionSnapshot; +use crate::db::model::RegionSnapshotReplacement; +use crate::db::model::RegionSnapshotReplacementState; +use crate::db::model::RegionSnapshotReplacementStep; +use crate::db::model::RegionSnapshotReplacementStepState; +use crate::db::model::VolumeRepair; +use crate::db::pagination::paginated; +use crate::db::pagination::Paginator; +use crate::db::update_and_check::UpdateAndCheck; +use crate::db::update_and_check::UpdateStatus; +use crate::db::TransactionError; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncConnection; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use omicron_common::api::external::Error; +use uuid::Uuid; + +impl DataStore { + /// Create and insert a region snapshot replacement request for a + /// RegionSnapshot, returning the ID of the request. + pub async fn create_region_snapshot_replacement_request( + &self, + opctx: &OpContext, + region_snapshot: &RegionSnapshot, + ) -> Result { + let request = + RegionSnapshotReplacement::for_region_snapshot(region_snapshot); + let request_id = request.id; + + self.insert_region_snapshot_replacement_request(opctx, request).await?; + + Ok(request_id) + } + + /// Insert a region snapshot replacement request into the DB, also creating + /// the VolumeRepair record. 
+ pub async fn insert_region_snapshot_replacement_request( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacement, + ) -> Result<(), Error> { + let (.., db_snapshot) = LookupPath::new(opctx, &self) + .snapshot_id(request.old_snapshot_id) + .fetch() + .await?; + + self.insert_region_snapshot_replacement_request_with_volume_id( + opctx, + request, + db_snapshot.volume_id, + ) + .await + } + + /// Insert a region snapshot replacement request into the DB, also creating + /// the VolumeRepair record. + pub async fn insert_region_snapshot_replacement_request_with_volume_id( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacement, + volume_id: Uuid, + ) -> Result<(), Error> { + self.pool_connection_authorized(opctx) + .await? + .transaction_async(|conn| async move { + use db::schema::region_snapshot_replacement::dsl; + use db::schema::volume_repair::dsl as volume_repair_dsl; + + // An associated volume repair record isn't _strictly_ needed: + // snapshot volumes should never be directly constructed, and + // therefore won't ever have an associated Upstairs that + // receives a volume replacement request. However it's being + // done in an attempt to be overly cautious. 
+ + diesel::insert_into(volume_repair_dsl::volume_repair) + .values(VolumeRepair { volume_id, repair_id: request.id }) + .execute_async(&conn) + .await?; + + diesel::insert_into(dsl::region_snapshot_replacement) + .values(request) + .execute_async(&conn) + .await?; + + Ok(()) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + pub async fn get_region_snapshot_replacement_request_by_id( + &self, + opctx: &OpContext, + id: Uuid, + ) -> Result { + use db::schema::region_snapshot_replacement::dsl; + + dsl::region_snapshot_replacement + .filter(dsl::id.eq(id)) + .get_result_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Find a region snapshot replacement request by region snapshot + pub async fn lookup_region_snapshot_replacement_request( + &self, + opctx: &OpContext, + region_snapshot: &RegionSnapshot, + ) -> Result, Error> { + use db::schema::region_snapshot_replacement::dsl; + + dsl::region_snapshot_replacement + .filter(dsl::old_dataset_id.eq(region_snapshot.dataset_id)) + .filter(dsl::old_region_id.eq(region_snapshot.region_id)) + .filter(dsl::old_snapshot_id.eq(region_snapshot.snapshot_id)) + .get_result_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Return region snapshot replacement records in state `Requested` with no + /// currently operating saga. 
+ pub async fn get_requested_region_snapshot_replacements( + &self, + opctx: &OpContext, + ) -> Result, Error> { + use db::schema::region_snapshot_replacement::dsl; + + dsl::region_snapshot_replacement + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Requested), + ) + .filter(dsl::operating_saga_id.is_null()) + .get_results_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Return region snapshot replacement requests that are in state `Running` + /// with no currently operating saga. + pub async fn get_running_region_snapshot_replacements( + &self, + opctx: &OpContext, + ) -> Result, Error> { + use db::schema::region_snapshot_replacement::dsl; + + dsl::region_snapshot_replacement + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + ) + .filter(dsl::operating_saga_id.is_null()) + .get_results_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Return region snapshot replacement requests that are in state + /// `ReplacementDone` with no currently operating saga. + pub async fn get_replacement_done_region_snapshot_replacements( + &self, + opctx: &OpContext, + ) -> Result, Error> { + use db::schema::region_snapshot_replacement::dsl; + + dsl::region_snapshot_replacement + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::ReplacementDone), + ) + .filter(dsl::operating_saga_id.is_null()) + .get_results_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Transition a RegionSnapshotReplacement record from Requested to + /// Allocating, setting a unique id at the same time. 
+ pub async fn set_region_snapshot_replacement_allocating( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Requested), + ) + .filter(dsl::operating_saga_id.is_null()) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Allocating), + dsl::operating_saga_id.eq(operating_saga_id), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == Some(operating_saga_id) + && record.replacement_state + == RegionSnapshotReplacementState::Allocating + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionSnapshotReplacement record from Allocating to + /// Requested, clearing the operating saga id. 
+    ///
+    /// The update only applies while the record is `Allocating` under
+    /// `operating_saga_id`. If nothing was updated but the record is already
+    /// `Requested` with no operating saga id, this returns `Ok(())`; any other
+    /// existing state is reported as a conflict error.
+    pub async fn undo_set_region_snapshot_replacement_allocating(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_id: Uuid,
+        operating_saga_id: Uuid,
+    ) -> Result<(), Error> {
+        use db::schema::region_snapshot_replacement::dsl;
+        let updated = diesel::update(dsl::region_snapshot_replacement)
+            .filter(dsl::id.eq(region_snapshot_replacement_id))
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::Allocating),
+            )
+            .filter(dsl::operating_saga_id.eq(operating_saga_id))
+            .set((
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::Requested),
+                dsl::operating_saga_id.eq(Option::<Uuid>::None),
+            ))
+            .check_if_exists::<RegionSnapshotReplacement>(
+                region_snapshot_replacement_id,
+            )
+            .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
+            .await;
+
+        match updated {
+            Ok(result) => match result.status {
+                UpdateStatus::Updated => Ok(()),
+                UpdateStatus::NotUpdatedButExists => {
+                    let record = result.found;
+
+                    // The guarded UPDATE matched nothing. Accept that only if
+                    // the row is already back in the undone state.
+                    if record.operating_saga_id.is_none()
+                        && record.replacement_state
+                            == RegionSnapshotReplacementState::Requested
+                    {
+                        Ok(())
+                    } else {
+                        Err(Error::conflict(format!(
+                            "region snapshot replacement {} set to {:?} \
+                            (operating saga id {:?})",
+                            region_snapshot_replacement_id,
+                            record.replacement_state,
+                            record.operating_saga_id,
+                        )))
+                    }
+                }
+            },
+
+            Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)),
+        }
+    }
+
+    /// Transition from Allocating to ReplacementDone, and clear the operating
+    /// saga id.
+    ///
+    /// Also records `new_region_id` and `old_snapshot_volume_id` on the
+    /// record. The update only applies while the record is `Allocating` under
+    /// `operating_saga_id`. If nothing was updated but the record is already
+    /// `ReplacementDone` with no operating saga id and the same two ids, this
+    /// returns `Ok(())`; any other existing state is a conflict error.
+    pub async fn set_region_snapshot_replacement_replacement_done(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_id: Uuid,
+        operating_saga_id: Uuid,
+        new_region_id: Uuid,
+        old_snapshot_volume_id: Uuid,
+    ) -> Result<(), Error> {
+        use db::schema::region_snapshot_replacement::dsl;
+        let updated = diesel::update(dsl::region_snapshot_replacement)
+            .filter(dsl::id.eq(region_snapshot_replacement_id))
+            .filter(dsl::operating_saga_id.eq(operating_saga_id))
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::Allocating),
+            )
+            .set((
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::ReplacementDone),
+                dsl::old_snapshot_volume_id.eq(Some(old_snapshot_volume_id)),
+                dsl::new_region_id.eq(Some(new_region_id)),
+                dsl::operating_saga_id.eq(Option::<Uuid>::None),
+            ))
+            .check_if_exists::<RegionSnapshotReplacement>(
+                region_snapshot_replacement_id,
+            )
+            .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
+            .await;
+
+        match updated {
+            Ok(result) => match result.status {
+                UpdateStatus::Updated => Ok(()),
+                UpdateStatus::NotUpdatedButExists => {
+                    let record = result.found;
+
+                    // The guarded UPDATE matched nothing. Accept that only if
+                    // the row already holds exactly the values this call
+                    // would have written, including both recorded ids.
+                    if record.operating_saga_id.is_none()
+                        && record.replacement_state
+                            == RegionSnapshotReplacementState::ReplacementDone
+                        && record.new_region_id == Some(new_region_id)
+                        && record.old_snapshot_volume_id
+                            == Some(old_snapshot_volume_id)
+                    {
+                        Ok(())
+                    } else {
+                        Err(Error::conflict(format!(
+                            "region snapshot replacement {} set to {:?} \
+                            (operating saga id {:?})",
+                            region_snapshot_replacement_id,
+                            record.replacement_state,
+                            record.operating_saga_id,
+                        )))
+                    }
+                }
+            },
+
+            Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)),
+        }
+    }
+
+    /// Transition a RegionSnapshotReplacement record from ReplacementDone to
+    /// DeletingOldVolume, setting a unique id at the same time.
+    ///
+    /// The update only applies while the record is `ReplacementDone` with no
+    /// operating saga id. If nothing was updated but the record is already
+    /// `DeletingOldVolume` under `operating_saga_id`, this returns `Ok(())`;
+    /// any other existing state is reported as a conflict error.
+    pub async fn set_region_snapshot_replacement_deleting_old_volume(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_id: Uuid,
+        operating_saga_id: Uuid,
+    ) -> Result<(), Error> {
+        use db::schema::region_snapshot_replacement::dsl;
+        let updated = diesel::update(dsl::region_snapshot_replacement)
+            .filter(dsl::id.eq(region_snapshot_replacement_id))
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::ReplacementDone),
+            )
+            .filter(dsl::operating_saga_id.is_null())
+            .set((
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::DeletingOldVolume),
+                dsl::operating_saga_id.eq(operating_saga_id),
+            ))
+            .check_if_exists::<RegionSnapshotReplacement>(
+                region_snapshot_replacement_id,
+            )
+            .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
+            .await;
+
+        match updated {
+            Ok(result) => match result.status {
+                UpdateStatus::Updated => Ok(()),
+                UpdateStatus::NotUpdatedButExists => {
+                    let record = result.found;
+
+                    // The guarded UPDATE matched nothing. Accept that only if
+                    // the row already holds exactly the values this call
+                    // would have written.
+                    if record.operating_saga_id == Some(operating_saga_id)
+                        && record.replacement_state
+                            == RegionSnapshotReplacementState::DeletingOldVolume
+                    {
+                        Ok(())
+                    } else {
+                        Err(Error::conflict(format!(
+                            "region snapshot replacement {} set to {:?} \
+                            (operating saga id {:?})",
+                            region_snapshot_replacement_id,
+                            record.replacement_state,
+                            record.operating_saga_id,
+                        )))
+                    }
+                }
+            },
+
+            Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)),
+        }
+    }
+
+    /// Transition a RegionSnapshotReplacement record from DeletingOldVolume to
+    /// ReplacementDone, clearing the operating saga id.
+    ///
+    /// The update only applies while the record is `DeletingOldVolume` under
+    /// `operating_saga_id`. If nothing was updated but the record is already
+    /// `ReplacementDone` with no operating saga id, this returns `Ok(())`; any
+    /// other existing state is reported as a conflict error.
+    pub async fn undo_set_region_snapshot_replacement_deleting_old_volume(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_id: Uuid,
+        operating_saga_id: Uuid,
+    ) -> Result<(), Error> {
+        use db::schema::region_snapshot_replacement::dsl;
+        let updated = diesel::update(dsl::region_snapshot_replacement)
+            .filter(dsl::id.eq(region_snapshot_replacement_id))
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::DeletingOldVolume),
+            )
+            .filter(dsl::operating_saga_id.eq(operating_saga_id))
+            .set((
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::ReplacementDone),
+                dsl::operating_saga_id.eq(Option::<Uuid>::None),
+            ))
+            .check_if_exists::<RegionSnapshotReplacement>(
+                region_snapshot_replacement_id,
+            )
+            .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
+            .await;
+
+        match updated {
+            Ok(result) => match result.status {
+                UpdateStatus::Updated => Ok(()),
+                UpdateStatus::NotUpdatedButExists => {
+                    let record = result.found;
+
+                    // The guarded UPDATE matched nothing. Accept that only if
+                    // the row is already back in the undone state.
+                    if record.operating_saga_id.is_none()
+                        && record.replacement_state
+                            == RegionSnapshotReplacementState::ReplacementDone
+                    {
+                        Ok(())
+                    } else {
+                        Err(Error::conflict(format!(
+                            "region snapshot replacement {} set to {:?} \
+                            (operating saga id {:?})",
+                            region_snapshot_replacement_id,
+                            record.replacement_state,
+                            record.operating_saga_id,
+                        )))
+                    }
+                }
+            },
+
+            Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)),
+        }
+    }
+
+    /// Transition from DeletingOldVolume to Running, and clear the operating
+    /// saga id.
+    ///
+    /// The update only applies while the record is `DeletingOldVolume` under
+    /// `operating_saga_id`. If nothing was updated but the record is already
+    /// `Running` with no operating saga id, this returns `Ok(())`; any other
+    /// existing state is reported as a conflict error.
+    pub async fn set_region_snapshot_replacement_running(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_id: Uuid,
+        operating_saga_id: Uuid,
+    ) -> Result<(), Error> {
+        use db::schema::region_snapshot_replacement::dsl;
+        let updated = diesel::update(dsl::region_snapshot_replacement)
+            .filter(dsl::id.eq(region_snapshot_replacement_id))
+            .filter(dsl::operating_saga_id.eq(operating_saga_id))
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::DeletingOldVolume),
+            )
+            .set((
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementState::Running),
+                dsl::operating_saga_id.eq(Option::<Uuid>::None),
+            ))
+            .check_if_exists::<RegionSnapshotReplacement>(
+                region_snapshot_replacement_id,
+            )
+            .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
+            .await;
+
+        match updated {
+            Ok(result) => match result.status {
+                UpdateStatus::Updated => Ok(()),
+                UpdateStatus::NotUpdatedButExists => {
+                    let record = result.found;
+
+                    // The guarded UPDATE matched nothing. Accept that only if
+                    // the row already holds exactly the values this call
+                    // would have written.
+                    if record.operating_saga_id.is_none()
+                        && record.replacement_state
+                            == RegionSnapshotReplacementState::Running
+                    {
+                        Ok(())
+                    } else {
+                        Err(Error::conflict(format!(
+                            "region snapshot replacement {} set to {:?} \
+                            (operating saga id {:?})",
+                            region_snapshot_replacement_id,
+                            record.replacement_state,
+                            record.operating_saga_id,
+                        )))
+                    }
+                }
+            },
+
+            Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)),
+        }
+    }
+
+    /// Transition a RegionSnapshotReplacement record from Running to Complete.
+    /// Also removes the `volume_repair` record that is taking a "lock" on the
+    /// Volume. Note this doesn't occur from a saga context, and therefore 1)
+    /// doesn't accept an operating saga id parameter, and 2) checks that
+    /// operating_saga_id is null for the corresponding record.
+    ///
+    /// Both statements run on the connection handed to the transaction
+    /// closure. If nothing was updated but the record is already `Complete`,
+    /// this returns `Ok(())`; any other existing state is a conflict error.
+    pub async fn set_region_snapshot_replacement_complete(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_id: Uuid,
+    ) -> Result<(), Error> {
+        type TxnError = TransactionError<Error>;
+
+        self.pool_connection_authorized(opctx)
+            .await?
+            .transaction_async(|conn| async move {
+                use db::schema::volume_repair::dsl as volume_repair_dsl;
+
+                diesel::delete(
+                    volume_repair_dsl::volume_repair.filter(
+                        volume_repair_dsl::repair_id
+                            .eq(region_snapshot_replacement_id),
+                    ),
+                )
+                .execute_async(&conn)
+                .await?;
+
+                use db::schema::region_snapshot_replacement::dsl;
+
+                let result = diesel::update(dsl::region_snapshot_replacement)
+                    .filter(dsl::id.eq(region_snapshot_replacement_id))
+                    .filter(
+                        dsl::replacement_state
+                            .eq(RegionSnapshotReplacementState::Running),
+                    )
+                    .filter(dsl::operating_saga_id.is_null())
+                    .set((dsl::replacement_state
+                        .eq(RegionSnapshotReplacementState::Complete),))
+                    .check_if_exists::<RegionSnapshotReplacement>(
+                        region_snapshot_replacement_id,
+                    )
+                    .execute_and_check(&conn)
+                    .await?;
+
+                match result.status {
+                    UpdateStatus::Updated => Ok(()),
+                    UpdateStatus::NotUpdatedButExists => {
+                        let record = result.found;
+
+                        if record.replacement_state
+                            == RegionSnapshotReplacementState::Complete
+                        {
+                            Ok(())
+                        } else {
+                            Err(TxnError::CustomError(Error::conflict(
+                                format!(
+                                "region snapshot replacement {} set to {:?} \
+                                (operating saga id {:?})",
+                                region_snapshot_replacement_id,
+                                record.replacement_state,
+                                record.operating_saga_id,
+                            ),
+                            )))
+                        }
+                    }
+                }
+            })
+            .await
+            .map_err(|e| match e {
+                TxnError::CustomError(error) => error,
+
+                TxnError::Database(error) => {
+                    public_error_from_diesel(error, ErrorHandler::Server)
+                }
+            })
+    }
+
+    /// Build a new RegionSnapshotReplacementStep for `request_id` and
+    /// `volume_id`, insert it via
+    /// `insert_region_snapshot_replacement_step`, and return the id of the
+    /// newly created step.
+    pub async fn create_region_snapshot_replacement_step(
+        &self,
+        opctx: &OpContext,
+        request_id: Uuid,
+        volume_id: Uuid,
+    ) -> Result<Uuid, Error> {
+        let request = RegionSnapshotReplacementStep::new(request_id, volume_id);
+        let request_id = request.id;
+
+        self.insert_region_snapshot_replacement_step(opctx, request).await?;
+
+        Ok(request_id)
+    }
+
+    /// Insert a region snapshot replacement step, together with a
+    /// `volume_repair` record for the step's volume, inside one retryable
+    /// transaction.
+    ///
+    /// Returns a conflict error if an existing step already lists this
+    /// step's volume as its `old_snapshot_volume_id`. Other database errors
+    /// are mapped to a public server error.
+    pub async fn insert_region_snapshot_replacement_step(
+        &self,
+        opctx: &OpContext,
+        request: RegionSnapshotReplacementStep,
+    ) -> Result<(), Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        let err = OptionalError::new();
+        self.transaction_retry_wrapper(
+            "insert_region_snapshot_replacement_step",
+        )
+        .transaction(&conn, |conn| {
+            let err = err.clone();
+            let request = request.clone();
+
+            async move {
+                use db::schema::region_snapshot_replacement_step::dsl;
+                use db::schema::volume_repair::dsl as volume_repair_dsl;
+
+                // Skip inserting this new record if we found another region
+                // snapshot replacement step with this volume in the step's
+                // `old_snapshot_volume_id`, as that means we're duplicating
+                // the replacement work: that volume will be garbage
+                // collected later. There's a unique index that will prevent
+                // the same step being inserted with the same volume id.
+
+                let maybe_record = dsl::region_snapshot_replacement_step
+                    .filter(dsl::old_snapshot_volume_id.eq(request.volume_id))
+                    .get_result_async::<RegionSnapshotReplacementStep>(&conn)
+                    .await
+                    .optional()?;
+
+                if let Some(found_record) = maybe_record {
+                    return Err(err.bail(Error::conflict(format!(
+                        "{:?} already referenced in old snapshot volume for \
+                        request {:?}",
+                        request.volume_id, found_record.id,
+                    ))));
+                }
+
+                // The region snapshot replacement step saga could invoke a
+                // volume replacement: create an associated volume repair
+                // record.
+
+                diesel::insert_into(volume_repair_dsl::volume_repair)
+                    .values(VolumeRepair {
+                        volume_id: request.volume_id,
+                        repair_id: request.id,
+                    })
+                    .execute_async(&conn)
+                    .await?;
+
+                diesel::insert_into(dsl::region_snapshot_replacement_step)
+                    .values(request)
+                    .execute_async(&conn)
+                    .await?;
+
+                Ok(())
+            }
+        })
+        .await
+        .map_err(|e| {
+            // Prefer the error bailed out of the closure, if one was stored.
+            if let Some(err) = err.take() {
+                return err;
+            }
+
+            public_error_from_diesel(e, ErrorHandler::Server)
+        })
+    }
+
+    /// Fetch the region snapshot replacement step with the given id.
+    ///
+    /// Database errors are mapped to a public server error.
+    pub async fn get_region_snapshot_replacement_step_by_id(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_step_id: Uuid,
+    ) -> Result<RegionSnapshotReplacementStep, Error> {
+        use db::schema::region_snapshot_replacement_step::dsl;
+
+        dsl::region_snapshot_replacement_step
+            .filter(dsl::id.eq(region_snapshot_replacement_step_id))
+            .get_result_async::<RegionSnapshotReplacementStep>(
+                &*self.pool_connection_authorized(opctx).await?,
+            )
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    /// Return every region snapshot replacement step in state `Requested`.
+    ///
+    /// Requires that complex operations are allowed for `opctx`. Records are
+    /// fetched in batches of `SQL_BATCH_SIZE`, paginated by step id, and
+    /// accumulated into one vector.
+    pub async fn get_requested_region_snapshot_replacement_steps(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<Vec<RegionSnapshotReplacementStep>, Error> {
+        opctx.check_complex_operations_allowed()?;
+
+        let mut records = Vec::new();
+        let mut paginator = Paginator::new(SQL_BATCH_SIZE);
+        let conn = self.pool_connection_authorized(opctx).await?;
+
+        while let Some(p) = paginator.next() {
+            use db::schema::region_snapshot_replacement_step::dsl;
+
+            let batch = paginated(
+                dsl::region_snapshot_replacement_step,
+                dsl::id,
+                &p.current_pagparams(),
+            )
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementStepState::Requested),
+            )
+            .get_results_async::<RegionSnapshotReplacementStep>(&*conn)
+            .await
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
+
+            paginator = p.found_batch(&batch, &|r| r.id);
+            records.extend(batch);
+        }
+
+        Ok(records)
+    }
+
+    /// Transition a RegionSnapshotReplacementStep record from Requested to
+    /// Running, recording `operating_saga_id` on the record.
+    ///
+    /// If nothing was updated but the record is already `Running` under
+    /// `operating_saga_id`, this returns `Ok(())`; any other existing state is
+    /// reported as a conflict error.
+    pub async fn set_region_snapshot_replacement_step_running(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_step_id: Uuid,
+        operating_saga_id: Uuid,
+    ) -> Result<(), Error> {
+        use db::schema::region_snapshot_replacement_step::dsl;
+        // NOTE(review): unlike the request-level transitions, this update
+        // does not filter on `operating_saga_id IS NULL` -- confirm that is
+        // intentional.
+        let updated = diesel::update(dsl::region_snapshot_replacement_step)
+            .filter(dsl::id.eq(region_snapshot_replacement_step_id))
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementStepState::Requested),
+            )
+            .set((
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementStepState::Running),
+                dsl::operating_saga_id.eq(operating_saga_id),
+            ))
+            .check_if_exists::<RegionSnapshotReplacementStep>(
+                region_snapshot_replacement_step_id,
+            )
+            .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
+            .await;
+
+        match updated {
+            Ok(result) => match result.status {
+                UpdateStatus::Updated => Ok(()),
+                UpdateStatus::NotUpdatedButExists => {
+                    let record = result.found;
+
+                    if record.operating_saga_id == Some(operating_saga_id)
+                        && record.replacement_state
+                            == RegionSnapshotReplacementStepState::Running
+                    {
+                        Ok(())
+                    } else {
+                        Err(Error::conflict(format!(
+                            "region snapshot replacement step {} set to {:?} \
+                            (operating saga id {:?})",
+                            region_snapshot_replacement_step_id,
+                            record.replacement_state,
+                            record.operating_saga_id,
+                        )))
+                    }
+                }
+            },
+
+            Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)),
+        }
+    }
+
+    /// Transition a RegionSnapshotReplacementStep record from Running to
+    /// Requested, clearing the operating saga id.
+    ///
+    /// The update only applies while the step is `Running` under
+    /// `operating_saga_id`. If nothing was updated but the step is already
+    /// `Requested` with no operating saga id, this returns `Ok(())`; any other
+    /// existing state is reported as a conflict error.
+    pub async fn undo_set_region_snapshot_replacement_step_running(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_step_id: Uuid,
+        operating_saga_id: Uuid,
+    ) -> Result<(), Error> {
+        use db::schema::region_snapshot_replacement_step::dsl;
+        let updated = diesel::update(dsl::region_snapshot_replacement_step)
+            .filter(dsl::id.eq(region_snapshot_replacement_step_id))
+            .filter(
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementStepState::Running),
+            )
+            .filter(dsl::operating_saga_id.eq(operating_saga_id))
+            .set((
+                dsl::replacement_state
+                    .eq(RegionSnapshotReplacementStepState::Requested),
+                dsl::operating_saga_id.eq(Option::<Uuid>::None),
+            ))
+            .check_if_exists::<RegionSnapshotReplacementStep>(
+                region_snapshot_replacement_step_id,
+            )
+            .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
+            .await;
+
+        match updated {
+            Ok(result) => match result.status {
+                UpdateStatus::Updated => Ok(()),
+                UpdateStatus::NotUpdatedButExists => {
+                    let record = result.found;
+
+                    // The guarded UPDATE matched nothing. Accept that only if
+                    // the row is already back in the undone state.
+                    if record.operating_saga_id.is_none()
+                        && record.replacement_state
+                            == RegionSnapshotReplacementStepState::Requested
+                    {
+                        Ok(())
+                    } else {
+                        Err(Error::conflict(format!(
+                            "region snapshot replacement step {} set to {:?} \
+                            (operating saga id {:?})",
+                            region_snapshot_replacement_step_id,
+                            record.replacement_state,
+                            record.operating_saga_id,
+                        )))
+                    }
+                }
+            },
+
+            Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)),
+        }
+    }
+
+    /// Transition from Running to Complete, clearing the operating saga id and
+    /// removing the associated `volume_repair` record.
+    ///
+    /// Also records `old_snapshot_volume_id` on the step. The update only
+    /// applies while the step is `Running` under `operating_saga_id` with no
+    /// old snapshot volume id set. If nothing was updated but the step is
+    /// already `Complete` with no operating saga id, this returns `Ok(())`;
+    /// any other existing state is reported as a conflict error.
+    pub async fn set_region_snapshot_replacement_step_complete(
+        &self,
+        opctx: &OpContext,
+        region_snapshot_replacement_step_id: Uuid,
+        operating_saga_id: Uuid,
+        old_snapshot_volume_id: Uuid,
+    ) -> Result<(), Error> {
+        type TxnError = TransactionError<Error>;
+
+        self.pool_connection_authorized(opctx)
+            .await?
+            .transaction_async(|conn| async move {
+                use db::schema::volume_repair::dsl as volume_repair_dsl;
+
+                diesel::delete(
+                    volume_repair_dsl::volume_repair.filter(
+                        volume_repair_dsl::repair_id
+                            .eq(region_snapshot_replacement_step_id),
+                    ),
+                )
+                .execute_async(&conn)
+                .await?;
+
+                use db::schema::region_snapshot_replacement_step::dsl;
+                let result =
+                    diesel::update(dsl::region_snapshot_replacement_step)
+                        .filter(dsl::id.eq(region_snapshot_replacement_step_id))
+                        .filter(dsl::operating_saga_id.eq(operating_saga_id))
+                        .filter(dsl::old_snapshot_volume_id.is_null())
+                        .filter(
+                            dsl::replacement_state.eq(
+                                RegionSnapshotReplacementStepState::Running,
+                            ),
+                        )
+                        .set((
+                            dsl::replacement_state.eq(
+                                RegionSnapshotReplacementStepState::Complete,
+                            ),
+                            dsl::operating_saga_id.eq(Option::<Uuid>::None),
+                            dsl::old_snapshot_volume_id
+                                .eq(old_snapshot_volume_id),
+                        ))
+                        .check_if_exists::<RegionSnapshotReplacementStep>(
+                            region_snapshot_replacement_step_id,
+                        )
+                        // Run the update on the transaction's own connection.
+                        // Checking out a second pool connection here would
+                        // execute it outside the transaction, so the
+                        // `volume_repair` delete above and this state change
+                        // would not commit or roll back together.
+                        .execute_and_check(&conn)
+                        .await?;
+
+                match result.status {
+                    UpdateStatus::Updated => Ok(()),
+                    UpdateStatus::NotUpdatedButExists => {
+                        let record = result.found;
+
+                        if record.operating_saga_id.is_none()
+                            && record.replacement_state
+                                == RegionSnapshotReplacementStepState::Complete
+                        {
+                            Ok(())
+                        } else {
+                            Err(TxnError::CustomError(Error::conflict(
+                                format!(
+                                    "region snapshot replacement step {} set \
+                                    to {:?} (operating saga id {:?})",
+                                    region_snapshot_replacement_step_id,
+                                    record.replacement_state,
+                                    record.operating_saga_id,
+                                ),
+                            )))
+                        }
+                    }
+                }
+            })
+            .await
+            .map_err(|e| match e {
+                TxnError::CustomError(error) => error,
+
+                TxnError::Database(error) => {
+                    public_error_from_diesel(error, ErrorHandler::Server)
+                }
+            })
+    }
+
+    /// Count all in-progress region snapshot replacement steps for a particular
+    /// region snapshot replacement id.
+ pub async fn in_progress_region_snapshot_replacement_steps( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + ) -> Result { + use db::schema::region_snapshot_replacement_step::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + let records = dsl::region_snapshot_replacement_step + .filter(dsl::request_id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .ne(RegionSnapshotReplacementStepState::VolumeDeleted), + ) + .count() + .get_result_async::(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(records) + } + + /// Return all region snapshot replacement steps that are Complete + pub async fn region_snapshot_replacement_steps_requiring_garbage_collection( + &self, + opctx: &OpContext, + ) -> Result, Error> { + use db::schema::region_snapshot_replacement_step; + + let conn = self.pool_connection_authorized(opctx).await?; + + region_snapshot_replacement_step::table + .filter( + region_snapshot_replacement_step::replacement_state + .eq(RegionSnapshotReplacementStepState::Complete), + ) + .select(RegionSnapshotReplacementStep::as_select()) + .get_results_async::(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Set a region snapshot replacement step's state to VolumeDeleted + pub async fn set_region_snapshot_replacement_step_volume_deleted( + &self, + opctx: &OpContext, + region_snapshot_replacement_step_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement_step::dsl; + + let conn = self.pool_connection_authorized(opctx).await?; + + let updated = diesel::update(dsl::region_snapshot_replacement_step) + .filter(dsl::id.eq(region_snapshot_replacement_step_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementStepState::Complete), + ) + .set( + dsl::replacement_state + .eq(RegionSnapshotReplacementStepState::VolumeDeleted), + ) + .check_if_exists::( + 
region_snapshot_replacement_step_id, + ) + .execute_and_check(&conn) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementStepState::VolumeDeleted + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement step {} set to {:?}", + region_snapshot_replacement_step_id, + record.replacement_state, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + use crate::db::datastore::test_utils::datastore_test; + use crate::db::model::RegionReplacement; + use nexus_test_utils::db::test_setup_database; + use omicron_test_utils::dev; + + #[tokio::test] + async fn test_one_replacement_per_volume() { + let logctx = dev::test_setup_log("test_one_replacement_per_volume"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let dataset_1_id = Uuid::new_v4(); + let region_1_id = Uuid::new_v4(); + let snapshot_1_id = Uuid::new_v4(); + + let dataset_2_id = Uuid::new_v4(); + let region_2_id = Uuid::new_v4(); + let snapshot_2_id = Uuid::new_v4(); + + let volume_id = Uuid::new_v4(); + + let request_1 = RegionSnapshotReplacement::new( + dataset_1_id, + region_1_id, + snapshot_1_id, + ); + + let request_2 = RegionSnapshotReplacement::new( + dataset_2_id, + region_2_id, + snapshot_2_id, + ); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request_1, volume_id, + ) + .await + .unwrap(); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request_2, volume_id, + ) + .await + .unwrap_err(); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_one_replacement_per_volume_conflict_with_region() { + let logctx = 
dev::test_setup_log( + "test_one_replacement_per_volume_conflict_with_region", + ); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let dataset_1_id = Uuid::new_v4(); + let region_1_id = Uuid::new_v4(); + let snapshot_1_id = Uuid::new_v4(); + + let region_2_id = Uuid::new_v4(); + + let volume_id = Uuid::new_v4(); + + let request_1 = RegionSnapshotReplacement::new( + dataset_1_id, + region_1_id, + snapshot_1_id, + ); + + let request_2 = RegionReplacement::new(region_2_id, volume_id); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request_1, volume_id, + ) + .await + .unwrap(); + + datastore + .insert_region_replacement_request(&opctx, request_2) + .await + .unwrap_err(); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn count_replacement_steps() { + let logctx = dev::test_setup_log("count_replacement_steps"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let dataset_id = Uuid::new_v4(); + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let volume_id = Uuid::new_v4(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request, volume_id, + ) + .await + .unwrap(); + + // Make sure counts start at 0 + + assert_eq!( + datastore + .in_progress_region_snapshot_replacement_steps( + &opctx, request_id + ) + .await + .unwrap(), + 0, + ); + + assert!(datastore + .get_requested_region_snapshot_replacement_steps(&opctx) + .await + .unwrap() + .is_empty()); + + // Insert some replacement steps, and make sure counting works + + { + let step = RegionSnapshotReplacementStep::new( + request_id, + Uuid::new_v4(), // volume id + ); + + datastore + 
.insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap(); + } + + assert_eq!( + datastore + .in_progress_region_snapshot_replacement_steps( + &opctx, request_id + ) + .await + .unwrap(), + 1, + ); + + assert_eq!( + datastore + .get_requested_region_snapshot_replacement_steps(&opctx) + .await + .unwrap() + .len(), + 1, + ); + + { + let mut step = RegionSnapshotReplacementStep::new( + request_id, + Uuid::new_v4(), // volume id + ); + + step.replacement_state = + RegionSnapshotReplacementStepState::Running; + + datastore + .insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap(); + } + + assert_eq!( + datastore + .in_progress_region_snapshot_replacement_steps( + &opctx, request_id + ) + .await + .unwrap(), + 2, + ); + + assert_eq!( + datastore + .get_requested_region_snapshot_replacement_steps(&opctx) + .await + .unwrap() + .len(), + 1, + ); + + { + let mut step = RegionSnapshotReplacementStep::new( + request_id, + Uuid::new_v4(), // volume id + ); + + // VolumeDeleted does not count as "in-progress" + step.replacement_state = + RegionSnapshotReplacementStepState::VolumeDeleted; + + datastore + .insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap(); + } + + assert_eq!( + datastore + .in_progress_region_snapshot_replacement_steps( + &opctx, request_id + ) + .await + .unwrap(), + 2, + ); + + assert_eq!( + datastore + .get_requested_region_snapshot_replacement_steps(&opctx) + .await + .unwrap() + .len(), + 1, + ); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn unique_region_snapshot_replacement_step_per_volume() { + let logctx = dev::test_setup_log( + "unique_region_snapshot_replacement_step_per_volume", + ); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Ensure that only one non-complete replacement step can be inserted + // per volume. 
+ + let volume_id = Uuid::new_v4(); + + let step = + RegionSnapshotReplacementStep::new(Uuid::new_v4(), volume_id); + let first_request_id = step.id; + + datastore + .insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap(); + + let step = + RegionSnapshotReplacementStep::new(Uuid::new_v4(), volume_id); + + datastore + .insert_region_snapshot_replacement_step(&opctx, step.clone()) + .await + .unwrap_err(); + + // Ensure that transitioning the first step to running doesn't change + // things. + + let saga_id = Uuid::new_v4(); + + datastore + .set_region_snapshot_replacement_step_running( + &opctx, + first_request_id, + saga_id, + ) + .await + .unwrap(); + + datastore + .insert_region_snapshot_replacement_step(&opctx, step.clone()) + .await + .unwrap_err(); + + // Ensure that transitioning the first step to complete means another + // can be added. + + datastore + .set_region_snapshot_replacement_step_complete( + &opctx, + first_request_id, + saga_id, + Uuid::new_v4(), // old_snapshot_volume_id + ) + .await + .unwrap(); + + datastore + .insert_region_snapshot_replacement_step(&opctx, step.clone()) + .await + .unwrap(); + + // Ensure that transitioning the first step to volume deleted still + // works. 
+ + datastore + .set_region_snapshot_replacement_step_volume_deleted( + &opctx, + first_request_id, + ) + .await + .unwrap(); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn region_snapshot_replacement_step_gc() { + let logctx = dev::test_setup_log("region_snapshot_replacement_step_gc"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let mut request = RegionSnapshotReplacement::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + ); + request.replacement_state = RegionSnapshotReplacementState::Complete; + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, + request, + Uuid::new_v4(), + ) + .await + .unwrap(); + + assert!(datastore + .region_snapshot_replacement_steps_requiring_garbage_collection( + &opctx + ) + .await + .unwrap() + .is_empty()); + + let mut step = + RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + step.replacement_state = RegionSnapshotReplacementStepState::Complete; + datastore + .insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap(); + + let mut step = + RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + step.replacement_state = RegionSnapshotReplacementStepState::Complete; + datastore + .insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap(); + + assert_eq!( + 2, + datastore + .region_snapshot_replacement_steps_requiring_garbage_collection( + &opctx, + ) + .await + .unwrap() + .len(), + ); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn region_snapshot_replacement_step_conflict() { + let logctx = + dev::test_setup_log("region_snapshot_replacement_step_conflict"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Assert that a region snapshot replacement 
step cannot be created for + // a volume that is the "old snapshot volume" for another snapshot + // replacement step. + + let request_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + let old_snapshot_volume_id = Uuid::new_v4(); + + let mut step = + RegionSnapshotReplacementStep::new(request_id, volume_id); + step.replacement_state = RegionSnapshotReplacementStepState::Complete; + step.old_snapshot_volume_id = Some(old_snapshot_volume_id); + datastore + .insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap(); + + let step = RegionSnapshotReplacementStep::new( + request_id, + old_snapshot_volume_id, + ); + datastore + .insert_region_snapshot_replacement_step(&opctx, step) + .await + .unwrap_err(); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn region_snapshot_replacement_step_conflict_with_region_replacement() + { + let logctx = dev::test_setup_log( + "region_snapshot_replacement_step_conflict_with_region_replacement", + ); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Assert that a region snapshot replacement step cannot be performed on + // a volume if region replacement is occurring for that volume. 
+ + let volume_id = Uuid::new_v4(); + + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); + datastore + .insert_region_replacement_request(&opctx, request) + .await + .unwrap(); + + let request = + RegionSnapshotReplacementStep::new(Uuid::new_v4(), volume_id); + datastore + .insert_region_snapshot_replacement_step(&opctx, request) + .await + .unwrap_err(); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 381b25dc17..123689087d 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -272,7 +272,7 @@ impl DataStore { ); } - sql_function!(fn random() -> diesel::sql_types::Float); + define_sql_function!(fn random() -> diesel::sql_types::Float); // We only actually care about one target here, so this // query should have a `.limit(1)` attached. We fetch all diff --git a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs index 247eefd3d5..7c3e1c4b8f 100644 --- a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs +++ b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs @@ -280,10 +280,7 @@ impl DataStore { } /// Transitively removes the CPU and memory charges for an instance from the - /// instance's project, silo, and fleet, provided that the instance's state - /// generation is less than `max_instance_gen`. This allows a caller who is - /// about to apply generation G to an instance to avoid deleting resources - /// if its update was superseded. + /// instance's project, silo, and fleet. 
pub async fn virtual_provisioning_collection_delete_instance( &self, opctx: &OpContext, @@ -291,12 +288,10 @@ impl DataStore { project_id: Uuid, cpus_diff: i64, ram_diff: ByteCount, - max_instance_gen: i64, ) -> Result, Error> { let provisions = VirtualProvisioningCollectionUpdate::new_delete_instance( id, - max_instance_gen, cpus_diff, ram_diff, project_id, @@ -518,8 +513,6 @@ mod test { // Delete the instance - // Make this value outrageously high, so that as a "max" it is ignored. - let max_instance_gen: i64 = 1000; datastore .virtual_provisioning_collection_delete_instance( &opctx, @@ -527,7 +520,6 @@ mod test { project_id, cpus, ram, - max_instance_gen, ) .await .unwrap(); @@ -614,10 +606,6 @@ mod test { // Delete the instance - // If the "instance gen" is too low, the delete operation should be - // dropped. This mimics circumstances where an instance update arrives - // late to the query. - let max_instance_gen = 0; datastore .virtual_provisioning_collection_delete_instance( &opctx, @@ -625,25 +613,6 @@ mod test { project_id, cpus, ram, - max_instance_gen, - ) - .await - .unwrap(); - for id in ids { - verify_collection_usage(&datastore, &opctx, id, 12, 1 << 30, 0) - .await; - } - - // Make this value outrageously high, so that as a "max" it is ignored. 
- let max_instance_gen = 1000; - datastore - .virtual_provisioning_collection_delete_instance( - &opctx, - instance_id, - project_id, - cpus, - ram, - max_instance_gen, ) .await .unwrap(); @@ -664,7 +633,6 @@ mod test { project_id, cpus, ram, - max_instance_gen, ) .await .unwrap(); diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 7ce8c1551e..14c3405a70 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -7,6 +7,7 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; +use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::Vmm; @@ -15,23 +16,44 @@ use crate::db::model::VmmState as DbVmmState; use crate::db::pagination::paginated; use crate::db::schema::vmm::dsl; use crate::db::update_and_check::UpdateAndCheck; +use crate::db::update_and_check::UpdateAndQueryResult; use crate::db::update_and_check::UpdateStatus; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::api::external::InternalContext; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; +use omicron_common::api::internal::nexus; +use omicron_common::api::internal::nexus::Migrations; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; use std::net::SocketAddr; use uuid::Uuid; +/// The result of an [`DataStore::vmm_and_migration_update_runtime`] call, +/// indicating which records were updated. 
+#[derive(Copy, Clone, Debug)] +pub struct VmmStateUpdateResult { + /// `true` if the VMM record was updated, `false` otherwise. + pub vmm_updated: bool, + + /// `true` if a migration record was updated for the migration in, false if + /// no update was performed or no migration in was provided. + pub migration_in_updated: bool, + + /// `true` if a migration record was updated for the migration out, false if + /// no update was performed or no migration out was provided. + pub migration_out_updated: bool, +} + impl DataStore { pub async fn vmm_insert( &self, @@ -116,29 +138,164 @@ impl DataStore { vmm_id: &PropolisUuid, new_runtime: &VmmRuntimeState, ) -> Result { - let updated = diesel::update(dsl::vmm) + self.vmm_update_runtime_on_connection( + &*self.pool_connection_unauthorized().await?, + vmm_id, + new_runtime, + ) + .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Vmm, + LookupType::ById(vmm_id.into_untyped_uuid()), + ), + ) + }) + } + + async fn vmm_update_runtime_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + vmm_id: &PropolisUuid, + new_runtime: &VmmRuntimeState, + ) -> Result, diesel::result::Error> { + diesel::update(dsl::vmm) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) .filter(dsl::state_generation.lt(new_runtime.gen)) .set(new_runtime.clone()) .check_if_exists::(vmm_id.into_untyped_uuid()) - .execute_and_check(&*self.pool_connection_unauthorized().await?) 
+ .execute_and_check(conn) .await - .map(|r| match r.status { - UpdateStatus::Updated => true, - UpdateStatus::NotUpdatedButExists => false, - }) - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByLookup( - ResourceType::Vmm, - LookupType::ById(vmm_id.into_untyped_uuid()), - ), - ) - })?; + } - Ok(updated) + /// Updates a VMM record and associated migration record(s) within a single + /// database transaction. + /// + /// This is intended to be used to apply updates from sled agent that + /// may change a VMM's runtime state (e.g. moving an instance from Running + /// to Stopped) and the state of its current active migration in a single + /// transaction. The caller is responsible for ensuring the VMM and + /// migration states are consistent with each other before calling this + /// routine. + /// + /// # Arguments + /// + /// - `vmm_id`: The ID of the VMM to update. + /// - `new_runtime`: The new VMM runtime state to try to write. + /// - `migrations`: The (optional) migration-in and migration-out states to + /// try to write. + /// + /// # Return value + /// + /// - `Ok(`[`VmmStateUpdateResult`]`)` if the query was issued + /// successfully. The returned [`VmmStateUpdateResult`] indicates which + /// database record(s) were updated. Note that an update can fail because + /// it was inapplicable (i.e. the database has state with a newer + /// generation already) or because the relevant record was not found. + /// - `Err` if another error occurred while accessing the database. 
+ pub async fn vmm_and_migration_update_runtime( + &self, + opctx: &OpContext, + vmm_id: PropolisUuid, + new_runtime: &VmmRuntimeState, + Migrations { migration_in, migration_out }: Migrations<'_>, + ) -> Result { + fn migration_id( + m: Option<&nexus::MigrationRuntimeState>, + ) -> Option { + m.as_ref().map(|m| m.migration_id) + } + + // If both a migration-in and migration-out update was provided for this + // VMM, they can't be from the same migration, since migrating from a + // VMM to itself wouldn't make sense... + let migration_out_id = migration_id(migration_out); + if migration_out_id.is_some() + && migration_out_id == migration_id(migration_in) + { + return Err(Error::conflict( + "migrating from a VMM to itself is nonsensical", + )) + .internal_context(format!("migration_in: {migration_in:?}; migration_out: {migration_out:?}")); + } + + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("vmm_and_migration_update_runtime") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let vmm_updated = self + .vmm_update_runtime_on_connection( + &conn, + &vmm_id, + new_runtime, + ) + .await.map(|r| match r.status { UpdateStatus::Updated => true, UpdateStatus::NotUpdatedButExists => false })?; + let migration_out_updated = match migration_out { + Some(migration) => { + let r = self.migration_update_source_on_connection( + &conn, &vmm_id, migration, + ) + .await?; + match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => match r.found { + m if m.time_deleted.is_some() => return Err(err.bail(Error::Gone)), + m if m.source_propolis_id != vmm_id.into_untyped_uuid() => { + return Err(err.bail(Error::invalid_value( + "source propolis UUID", + format!("{vmm_id} is not the source VMM of this migration"), + ))); + } + // Not updated, generation has advanced. 
+ _ => false + }, + } + }, + None => false, + }; + let migration_in_updated = match migration_in { + Some(migration) => { + let r = self.migration_update_target_on_connection( + &conn, &vmm_id, migration, + ) + .await?; + match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => match r.found { + m if m.time_deleted.is_some() => return Err(err.bail(Error::Gone)), + m if m.target_propolis_id != vmm_id.into_untyped_uuid() => { + return Err(err.bail(Error::invalid_value( + "target propolis UUID", + format!("{vmm_id} is not the target VMM of this migration"), + ))); + } + // Not updated, generation has advanced. + _ => false + }, + } + }, + None => false, + }; + Ok(VmmStateUpdateResult { + vmm_updated, + migration_in_updated, + migration_out_updated, + }) + }}) + .await + .map_err(|e| { + err.take().unwrap_or_else(|| public_error_from_diesel(e, ErrorHandler::Server)) + }) } /// Forcibly overwrites the Propolis IP/Port in the supplied VMM's record with @@ -176,7 +333,7 @@ impl DataStore { /// /// A VMM is considered "abandoned" if (and only if): /// - /// - It is in the `Destroyed` state. + /// - It is in the `Destroyed` or `SagaUnwound` state. /// - It is not currently running an instance, and it is also not the /// migration target of any instance (i.e. 
it is not pointed to by /// any instance record's `active_propolis_id` and `target_propolis_id` @@ -188,16 +345,15 @@ impl DataStore { pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec { use crate::db::schema::instance::dsl as instance_dsl; - let destroyed = DbVmmState::Destroyed; + paginated(dsl::vmm, dsl::id, pagparams) // In order to be considered "abandoned", a VMM must be: - // - in the `Destroyed` state - .filter(dsl::state.eq(destroyed)) + // - in the `Destroyed` or `SagaUnwound` state + .filter(dsl::state.eq_any(DbVmmState::DESTROYABLE_STATES)) // - not deleted yet .filter(dsl::time_deleted.is_null()) // - not pointed to by any instance's `active_propolis_id` or // `target_propolis_id`. - // .left_join( // Left join with the `instance` table on the VMM's instance ID, so // that we can check if the instance pointed to by this VMM (if @@ -230,3 +386,295 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::db; + use crate::db::datastore::test_utils::datastore_test; + use crate::db::model::Generation; + use crate::db::model::Migration; + use crate::db::model::VmmRuntimeState; + use crate::db::model::VmmState; + use nexus_test_utils::db::test_setup_database; + use omicron_common::api::internal::nexus; + use omicron_test_utils::dev; + use omicron_uuid_kinds::InstanceUuid; + + #[tokio::test] + async fn test_vmm_and_migration_update_runtime() { + // Setup + let logctx = + dev::test_setup_log("test_vmm_and_migration_update_runtime"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let instance_id = InstanceUuid::from_untyped_uuid(Uuid::new_v4()); + let vmm1 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: instance_id.into_untyped_uuid(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.32".parse().unwrap(), + 
propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("VMM 1 should be inserted successfully!"); + + let vmm2 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: instance_id.into_untyped_uuid(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.42".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("VMM 2 should be inserted successfully!"); + + let migration1 = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm1.id, vmm2.id), + ) + .await + .expect("migration should be inserted successfully!"); + + info!( + &logctx.log, + "pretending to migrate from vmm1 to vmm2"; + "vmm1" => ?vmm1, + "vmm2" => ?vmm2, + "migration" => ?migration1, + ); + + let vmm1_migration_out = nexus::MigrationRuntimeState { + migration_id: migration1.id, + state: nexus::MigrationState::Completed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + &opctx, + PropolisUuid::from_untyped_uuid(vmm1.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm1.runtime.r#gen.0.next()), + state: VmmState::Stopping, + }, + Migrations { + migration_in: None, + migration_out: Some(&vmm1_migration_out), + }, + ) + .await + .expect("vmm1 state should update"); + let vmm2_migration_in = nexus::MigrationRuntimeState { + migration_id: migration1.id, + state: nexus::MigrationState::Completed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + &opctx, + PropolisUuid::from_untyped_uuid(vmm2.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: 
Generation(vmm2.runtime.r#gen.0.next()), + state: VmmState::Running, + }, + Migrations { + migration_in: Some(&vmm2_migration_in), + migration_out: None, + }, + ) + .await + .expect("vmm2 state should update"); + + let all_migrations = datastore + .instance_list_migrations( + &opctx, + instance_id, + &DataPageParams::max_page(), + ) + .await + .expect("must list migrations"); + assert_eq!(all_migrations.len(), 1); + let db_migration1 = &all_migrations[0]; + assert_eq!( + db_migration1.source_state, + db::model::MigrationState::COMPLETED + ); + assert_eq!( + db_migration1.target_state, + db::model::MigrationState::COMPLETED + ); + assert_eq!( + db_migration1.source_gen, + Generation(Generation::new().0.next()), + ); + assert_eq!( + db_migration1.target_gen, + Generation(Generation::new().0.next()), + ); + + // now, let's simulate a second migration, out of vmm2. + let vmm3 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: instance_id.into_untyped_uuid(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.69".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("VMM 3 should be inserted successfully!"); + + let migration2 = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm2.id, vmm3.id), + ) + .await + .expect("migration 2 should be inserted successfully!"); + info!( + &logctx.log, + "pretending to migrate from vmm2 to vmm3"; + "vmm2" => ?vmm2, + "vmm3" => ?vmm3, + "migration" => ?migration2, + ); + + let vmm2_migration_out = nexus::MigrationRuntimeState { + migration_id: migration2.id, + state: nexus::MigrationState::Completed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + &opctx, + PropolisUuid::from_untyped_uuid(vmm2.id), + 
&VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm2.runtime.r#gen.0.next()), + state: VmmState::Destroyed, + }, + Migrations { + migration_in: Some(&vmm2_migration_in), + migration_out: Some(&vmm2_migration_out), + }, + ) + .await + .expect("vmm2 state should update"); + + let vmm3_migration_in = nexus::MigrationRuntimeState { + migration_id: migration2.id, + // Let's make this fail, just for fun... + state: nexus::MigrationState::Failed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + &opctx, + PropolisUuid::from_untyped_uuid(vmm3.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm3.runtime.r#gen.0.next()), + state: VmmState::Destroyed, + }, + Migrations { + migration_in: Some(&vmm3_migration_in), + migration_out: None, + }, + ) + .await + .expect("vmm3 state should update"); + + let all_migrations = datastore + .instance_list_migrations( + &opctx, + instance_id, + &DataPageParams::max_page(), + ) + .await + .expect("must list migrations"); + assert_eq!(all_migrations.len(), 2); + + // the previous migration should not have closed. 
+ let new_db_migration1 = all_migrations + .iter() + .find(|m| m.id == migration1.id) + .expect("query must include migration1"); + assert_eq!(new_db_migration1.source_state, db_migration1.source_state); + assert_eq!(new_db_migration1.source_gen, db_migration1.source_gen); + assert_eq!( + db_migration1.time_source_updated, + new_db_migration1.time_source_updated + ); + assert_eq!(new_db_migration1.target_state, db_migration1.target_state); + assert_eq!(new_db_migration1.target_gen, db_migration1.target_gen,); + assert_eq!( + new_db_migration1.time_target_updated, + db_migration1.time_target_updated, + ); + + let db_migration2 = all_migrations + .iter() + .find(|m| m.id == migration2.id) + .expect("query must include migration2"); + assert_eq!( + db_migration2.source_state, + db::model::MigrationState::COMPLETED + ); + assert_eq!( + db_migration2.target_state, + db::model::MigrationState::FAILED + ); + assert_eq!( + db_migration2.source_gen, + Generation(Generation::new().0.next()), + ); + assert_eq!( + db_migration2.target_gen, + Generation(Generation::new().0.next()), + ); + + // Clean up. 
+ db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 58c26d6484..f777384b7b 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -7,6 +7,7 @@ use super::DataStore; use crate::db; use crate::db::datastore::OpContext; +use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; @@ -21,6 +22,8 @@ use crate::db::model::UpstairsRepairNotification; use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::UpstairsRepairProgress; use crate::db::model::Volume; +use crate::db::pagination::paginated; +use crate::db::pagination::Paginator; use crate::db::queries::volume::DecreaseCrucibleResourceCountAndSoftDeleteVolume; use crate::transaction_retry::OptionalError; use anyhow::bail; @@ -48,6 +51,7 @@ use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; use std::collections::VecDeque; +use std::net::SocketAddr; use std::net::SocketAddrV6; use uuid::Uuid; @@ -1717,12 +1721,91 @@ fn region_in_vcr( Ok(region_found) } +/// Check if a read-only target is present anywhere in a Volume Construction +/// Request +fn read_only_target_in_vcr( + vcr: &VolumeConstructionRequest, + read_only_target: &SocketAddrV6, +) -> anyhow::Result { + struct Work<'a> { + vcr_part: &'a VolumeConstructionRequest, + under_read_only_parent: bool, + } + + let mut parts: VecDeque = VecDeque::new(); + parts.push_back(Work { vcr_part: &vcr, under_read_only_parent: false }); + + while let Some(work) = parts.pop_front() { + match work.vcr_part { + VolumeConstructionRequest::Volume { + sub_volumes, + read_only_parent, + .. 
+ } => { + for sub_volume in sub_volumes { + parts.push_back(Work { + vcr_part: &sub_volume, + under_read_only_parent: work.under_read_only_parent, + }); + } + + if let Some(read_only_parent) = read_only_parent { + parts.push_back(Work { + vcr_part: &read_only_parent, + under_read_only_parent: true, + }); + } + } + + VolumeConstructionRequest::Url { .. } => { + // nothing required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if work.under_read_only_parent && !opts.read_only { + // This VCR isn't constructed properly, there's a read/write + // region under a read-only parent + bail!("read-write region under read-only parent"); + } + + for target in &opts.target { + let parsed_target: SocketAddrV6 = target.parse()?; + if parsed_target == *read_only_target && opts.read_only { + return Ok(true); + } + } + } + + VolumeConstructionRequest::File { .. } => { + // nothing required + } + } + } + + Ok(false) +} + pub struct VolumeReplacementParams { pub volume_id: Uuid, pub region_id: Uuid, pub region_addr: SocketAddrV6, } +// types for volume_replace_snapshot and replace_read_only_target_in_vcr +// parameters + +#[derive(Debug, Clone, Copy)] +pub struct VolumeWithTarget(Uuid); + +#[derive(Debug, Clone, Copy)] +pub struct ExistingTarget(SocketAddrV6); + +#[derive(Debug, Clone, Copy)] +pub struct ReplacementTarget(SocketAddrV6); + +#[derive(Debug, Clone, Copy)] +pub struct VolumeToDelete(Uuid); + impl DataStore { /// Replace a read-write region in a Volume with a new region. pub async fn volume_replace_region( @@ -2004,6 +2087,265 @@ impl DataStore { } }) } + + /// Replace a read-only target in a Volume with a new region + /// + /// In a single transaction: + /// + /// - update a volume's serialized construction request by replacing a + /// single target. 
+ /// + /// - stash the replaced target in a "volume to delete"'s serialized + /// construction request + /// + /// Note that this transaction does _not_ update a region snapshot's volume + /// references table! This is legal because the existing target reference is + /// written into the volume to delete's construction request. + /// + /// This function's effects can be undone by calling it with swapped + /// `existing` and `replacement` parameters. + pub async fn volume_replace_snapshot( + &self, + volume_id: VolumeWithTarget, + existing: ExistingTarget, + replacement: ReplacementTarget, + volume_to_delete_id: VolumeToDelete, + ) -> Result<(), Error> { + #[derive(Debug, thiserror::Error)] + enum VolumeReplaceSnapshotError { + #[error("Error from Volume snapshot replacement: {0}")] + Public(Error), + + #[error("Serde error during Volume snapshot replacement: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Target Volume deleted")] + TargetVolumeDeleted, + + #[error("Snapshot replacement error: {0}")] + SnapshotReplacementError(#[from] anyhow::Error), + + #[error("Replaced {0} targets, expected {1}")] + UnexpectedReplacedTargets(usize, usize), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + } + let err = OptionalError::new(); + + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("volume_replace_snapshot") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::volume::dsl as volume_dsl; + + // Grab the old volume first + let maybe_old_volume = { + volume_dsl::volume + .filter(volume_dsl::id.eq(volume_id.0)) + .select(Volume::as_select()) + .first_async::(&conn) + .await + .optional() + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })? 
+ }; + + let old_volume = if let Some(old_volume) = maybe_old_volume { + old_volume + } else { + // Existing volume was deleted, so return an error. We + // can't perform the snapshot replacement now! + return Err(err.bail( + VolumeReplaceSnapshotError::TargetVolumeDeleted + )); + }; + + let old_vcr: VolumeConstructionRequest = + match serde_json::from_str(&old_volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(err.bail( + VolumeReplaceSnapshotError::SerdeError(e) + )); + }, + }; + + // Does it look like this replacement already happened? + let old_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &existing.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + VolumeReplaceSnapshotError::SnapshotReplacementError(e) + )); + }, + }; + + let new_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &replacement.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + VolumeReplaceSnapshotError::SnapshotReplacementError(e) + )); + }, + }; + + if !old_target_in_vcr && new_target_in_vcr { + // It does seem like the replacement happened + return Ok(()); + } + + // Update the existing volume's construction request to + // replace the existing target's SocketAddrV6 with the + // replacement target's + + // Copy the old volume's VCR, changing out the old target + // for the new. + let (new_vcr, replacements) = match replace_read_only_target_in_vcr( + &old_vcr, + existing, + replacement, + ) { + Ok(new_vcr) => new_vcr, + Err(e) => { + return Err(err.bail( + VolumeReplaceSnapshotError::SnapshotReplacementError(e) + )); + } + }; + + // Expect that this only happened once. If it happened + // multiple times, question everything: how would a snapshot + // be used twice?! 
+ + if replacements != 1 { + return Err(err.bail( + VolumeReplaceSnapshotError::UnexpectedReplacedTargets( + replacements, 1, + ) + )); + } + + let new_volume_data = serde_json::to_string( + &new_vcr, + ) + .map_err(|e| { + err.bail(VolumeReplaceSnapshotError::SerdeError(e)) + })?; + + // Update the existing volume's data + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id.0)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(&conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + + // Make a new VCR that will stash the target to delete. The + // values here don't matter, just that it gets fed into the + // volume_delete machinery later. + let vcr = VolumeConstructionRequest::Volume { + id: volume_to_delete_id.0, + block_size: 512, + sub_volumes: vec![ + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: sled_agent_client::types::CrucibleOpts { + id: volume_to_delete_id.0, + target: vec![ + existing.0.to_string(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + } + ], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&vcr) + .map_err(|e| { + err.bail(VolumeReplaceSnapshotError::SerdeError(e)) + })?; + + // Update the volume to delete data + let num_updated = + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_to_delete_id.0)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(volume_data)) + .execute_async(&conn) + .await?; + + if num_updated != 1 { + return Err(err.bail( + VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( + num_updated, 1, + ) + )); + } + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + 
VolumeReplaceSnapshotError::Public(e) => e, + + VolumeReplaceSnapshotError::SerdeError(_) => { + Error::internal_error(&err.to_string()) + } + + VolumeReplaceSnapshotError::TargetVolumeDeleted => { + Error::internal_error(&err.to_string()) + } + + VolumeReplaceSnapshotError::SnapshotReplacementError(_) => { + Error::internal_error(&err.to_string()) + } + + VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) => { + Error::internal_error(&err.to_string()) + } + + VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) => { + Error::internal_error(&err.to_string()) + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } } /// Return the targets from a VolumeConstructionRequest. @@ -2157,6 +2499,87 @@ fn replace_region_in_vcr( Ok(new_vcr) } +/// Replace a read-only target in a VolumeConstructionRequest +/// +/// Note that UUIDs are not randomized by this step: Crucible will reject a +/// `target_replace` call if the replacement VolumeConstructionRequest does not +/// exactly match the original, except for a single Region difference. +/// +/// Note that the generation number _is not_ bumped in this step. +fn replace_read_only_target_in_vcr( + vcr: &VolumeConstructionRequest, + old_target: ExistingTarget, + new_target: ReplacementTarget, +) -> anyhow::Result<(VolumeConstructionRequest, usize)> { + struct Work<'a> { + vcr_part: &'a mut VolumeConstructionRequest, + under_read_only_parent: bool, + } + let mut new_vcr = vcr.clone(); + + let mut parts: VecDeque = VecDeque::new(); + parts.push_back(Work { + vcr_part: &mut new_vcr, + under_read_only_parent: false, + }); + + let mut replacements = 0; + + while let Some(work) = parts.pop_front() { + match work.vcr_part { + VolumeConstructionRequest::Volume { + sub_volumes, + read_only_parent, + .. 
+ } => { + for sub_volume in sub_volumes { + parts.push_back(Work { + vcr_part: sub_volume, + under_read_only_parent: work.under_read_only_parent, + }); + } + + if let Some(read_only_parent) = read_only_parent { + parts.push_back(Work { + vcr_part: read_only_parent, + under_read_only_parent: true, + }); + } + } + + VolumeConstructionRequest::Url { .. } => { + // nothing required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if work.under_read_only_parent && !opts.read_only { + // This VCR isn't constructed properly, there's a read/write + // region under a read-only parent + bail!("read-write region under read-only parent"); + } + + for target in &mut opts.target { + let parsed_target: SocketAddrV6 = target.parse()?; + if parsed_target == old_target.0 && opts.read_only { + *target = new_target.0.to_string(); + replacements += 1; + } + } + } + + VolumeConstructionRequest::File { .. } => { + // nothing required + } + } + } + + if replacements == 0 { + bail!("target {old_target:?} not found!"); + } + + Ok((new_vcr, replacements)) +} + /// Find Regions in a Volume's subvolumes list whose target match the argument /// IP, and add them to the supplied Vec. 
fn find_matching_rw_regions_in_volume( @@ -2199,11 +2622,50 @@ fn find_matching_rw_regions_in_volume( Ok(()) } -#[cfg(test)] -mod tests { - use super::*; - - use crate::db::datastore::test_utils::datastore_test; +impl DataStore { + pub async fn find_volumes_referencing_socket_addr( + &self, + opctx: &OpContext, + address: SocketAddr, + ) -> ListResultVec { + opctx.check_complex_operations_allowed()?; + + let mut volumes = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = self.pool_connection_authorized(opctx).await?; + + let needle = address.to_string(); + + while let Some(p) = paginator.next() { + use db::schema::volume::dsl; + + let haystack = + paginated(dsl::volume, dsl::id, &p.current_pagparams()) + .select(Volume::as_select()) + .get_results_async::(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for volume in haystack { + if volume.data().contains(&needle) { + volumes.push(volume); + } + } + } + + Ok(volumes) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::db::datastore::test_utils::datastore_test; use nexus_test_utils::db::test_setup_database; use omicron_test_utils::dev; use sled_agent_client::types::CrucibleOpts; @@ -2522,4 +2984,907 @@ mod tests { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_volume_replace_snapshot() { + let logctx = dev::test_setup_log("test_volume_replace_snapshot"); + let log = logctx.log.new(o!()); + let mut db = test_setup_database(&log).await; + let (_opctx, db_datastore) = datastore_test(&logctx, &db).await; + + // Insert two volumes: one with the target to replace, and one temporary + // "volume to delete" that's blank. 
+ + let volume_id = Uuid::new_v4(); + let volume_to_delete_id = Uuid::new_v4(); + let rop_id = Uuid::new_v4(); + + db_datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd00:1122:3344:101::1]:11111"), + String::from("[fd00:1122:3344:102::1]:22222"), + String::from("[fd00:1122:3344:103::1]:33333"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: rop_id, + target: vec![ + // target to replace + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + db_datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Do the replacement + + db_datastore + .volume_replace_snapshot( + VolumeWithTarget(volume_id), + ExistingTarget("[fd00:1122:3344:104::1]:400".parse().unwrap()), + ReplacementTarget( + "[fd55:1122:3344:101::1]:111".parse().unwrap(), + ), + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + // Ensure the shape of the resulting VCRs + 
+ let vcr: VolumeConstructionRequest = serde_json::from_str( + db_datastore.volume_get(volume_id).await.unwrap().unwrap().data(), + ) + .unwrap(); + + assert_eq!( + &vcr, + &VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd00:1122:3344:101::1]:11111"), + String::from("[fd00:1122:3344:102::1]:22222"), + String::from("[fd00:1122:3344:103::1]:33333"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: rop_id, + target: vec![ + // target replaced + String::from("[fd55:1122:3344:101::1]:111"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + } + )), + }, + ); + + let vcr: VolumeConstructionRequest = serde_json::from_str( + db_datastore + .volume_get(volume_to_delete_id) + .await + .unwrap() + .unwrap() + .data(), + ) + .unwrap(); + + assert_eq!( + &vcr, + &VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: volume_to_delete_id, + target: vec![ + // replaced target stashed here + String::from("[fd00:1122:3344:104::1]:400"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], 
+ read_only_parent: None, + }, + ); + + // Now undo the replacement. Note volume ID is not swapped. + + db_datastore + .volume_replace_snapshot( + VolumeWithTarget(volume_id), + ExistingTarget("[fd55:1122:3344:101::1]:111".parse().unwrap()), + ReplacementTarget( + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + ), + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + let vcr: VolumeConstructionRequest = serde_json::from_str( + db_datastore.volume_get(volume_id).await.unwrap().unwrap().data(), + ) + .unwrap(); + + // Ensure the shape of the resulting VCR + assert_eq!( + &vcr, + &VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd00:1122:3344:101::1]:11111"), + String::from("[fd00:1122:3344:102::1]:22222"), + String::from("[fd00:1122:3344:103::1]:33333"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: rop_id, + target: vec![ + // back to what it was + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + } + )), + }, + ); + + let vcr: VolumeConstructionRequest = serde_json::from_str( + db_datastore + .volume_get(volume_to_delete_id) + .await + .unwrap() + .unwrap() + .data(), + ) + .unwrap(); + + assert_eq!( + &vcr, + &VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + 
sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: volume_to_delete_id, + target: vec![ + // replacement stashed here + String::from("[fd55:1122:3344:101::1]:111"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }, + ); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_find_volumes_referencing_socket_addr() { + let logctx = + dev::test_setup_log("test_find_volumes_referencing_socket_addr"); + let log = logctx.log.new(o!()); + let mut db = test_setup_database(&log).await; + let (opctx, db_datastore) = datastore_test(&logctx, &db).await; + + let volume_id = Uuid::new_v4(); + + // case where the needle is found + + db_datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + let volumes = db_datastore + .find_volumes_referencing_socket_addr( + &opctx, + "[fd00:1122:3344:104::1]:400".parse().unwrap(), + ) + .await + .unwrap(); + + assert_eq!(volumes.len(), 1); + assert_eq!(volumes[0].id(), volume_id); + + // case where the needle is missing + + let volumes = db_datastore + 
.find_volumes_referencing_socket_addr( + &opctx, + "[fd55:1122:3344:104::1]:400".parse().unwrap(), + ) + .await + .unwrap(); + + assert!(volumes.is_empty()); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + #[test] + fn test_read_only_target_in_vcr() { + // read_only_target_in_vcr should find read-only targets + + let vcr = VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }; + + assert!(read_only_target_in_vcr( + &vcr, + &"[fd00:1122:3344:104::1]:400".parse().unwrap(), + ) + .unwrap()); + + // read_only_target_in_vcr should _not_ find read-write targets + + let vcr = VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: None, + }; + + assert!(!read_only_target_in_vcr( + &vcr, + &"[fd00:1122:3344:104::1]:400".parse().unwrap(), + ) + .unwrap()); + + // read_only_target_in_vcr should bail on incorrect VCRs (currently it + // only detects a read/write region 
under a read-only parent) + + let vcr = VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, // invalid! + }, + }, + )), + }; + + read_only_target_in_vcr( + &vcr, + &"[fd00:1122:3344:104::1]:400".parse().unwrap(), + ) + .unwrap_err(); + } + + #[test] + fn test_replace_read_only_target_in_vcr() { + // replace_read_only_target_in_vcr should perform a replacement in a + // read-only parent + + let volume_id = Uuid::new_v4(); + + let vcr = VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }; + + let old_target = + ExistingTarget("[fd00:1122:3344:105::1]:401".parse().unwrap()); + let new_target = + ReplacementTarget("[fd99:1122:3344:105::1]:12345".parse().unwrap()); + + let (new_vcr, replacements) = + replace_read_only_target_in_vcr(&vcr, old_target, new_target) + .unwrap(); + + assert_eq!(replacements, 1); + assert_eq!( + &new_vcr, + &VolumeConstructionRequest::Volume { + id: 
volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + new_target.0.to_string(), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + } + } + )) + } + ); + + // replace_read_only_target_in_vcr should perform a replacement in a + // read-only parent in a sub-volume + + let vcr = VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd55:1122:3344:204::1]:1000"), + String::from("[fd55:1122:3344:205::1]:1001"), + String::from("[fd55:1122:3344:206::1]:1002"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd33:1122:3344:304::1]:2000"), + String::from("[fd33:1122:3344:305::1]:2001"), + String::from("[fd33:1122:3344:306::1]:2002"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, 
+ opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }; + + let old_target = + ExistingTarget("[fd33:1122:3344:306::1]:2002".parse().unwrap()); + let new_target = + ReplacementTarget("[fd99:1122:3344:105::1]:12345".parse().unwrap()); + + let (new_vcr, replacements) = + replace_read_only_target_in_vcr(&vcr, old_target, new_target) + .unwrap(); + + assert_eq!(replacements, 1); + assert_eq!( + &new_vcr, + &VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd55:1122:3344:204::1]:1000"), + String::from("[fd55:1122:3344:205::1]:1001"), + String::from("[fd55:1122:3344:206::1]:1002"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + } + }], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from( + "[fd33:1122:3344:304::1]:2000" + ), + String::from( + "[fd33:1122:3344:305::1]:2001" + ), + new_target.0.to_string(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + } + } + )), + }], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 
10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd00:1122:3344:104::1]:400"), + String::from("[fd00:1122:3344:105::1]:401"), + String::from("[fd00:1122:3344:106::1]:402"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + } + } + )) + } + ); + + // replace_read_only_target_in_vcr should perform multiple replacements + // if necessary (even if this is dubious!) - the caller will decide if + // this should be legal or not + + let rop = VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd33:1122:3344:304::1]:2000"), + String::from("[fd33:1122:3344:305::1]:2001"), + String::from("[fd33:1122:3344:306::1]:2002"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }; + + let vcr = VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd55:1122:3344:204::1]:1000"), + String::from("[fd55:1122:3344:205::1]:1001"), + String::from("[fd55:1122:3344:206::1]:1002"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: Some(Box::new(rop.clone())), + }], + read_only_parent: Some(Box::new(rop)), + }; + + let old_target = + ExistingTarget("[fd33:1122:3344:304::1]:2000".parse().unwrap()); + let new_target = + 
ReplacementTarget("[fd99:1122:3344:105::1]:12345".parse().unwrap()); + + let (new_vcr, replacements) = + replace_read_only_target_in_vcr(&vcr, old_target, new_target) + .unwrap(); + + assert_eq!(replacements, 2); + + let rop = VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + new_target.0.to_string(), + String::from("[fd33:1122:3344:305::1]:2001"), + String::from("[fd33:1122:3344:306::1]:2002"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }; + + assert_eq!( + &new_vcr, + &VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 10, + extent_count: 10, + gen: 1, + opts: CrucibleOpts { + id: volume_id, + target: vec![ + String::from("[fd55:1122:3344:204::1]:1000"), + String::from("[fd55:1122:3344:205::1]:1001"), + String::from("[fd55:1122:3344:206::1]:1002"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + } + }], + read_only_parent: Some(Box::new(rop.clone())), + }], + read_only_parent: Some(Box::new(rop)), + } + ); + } } diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs deleted file mode 100644 index fded585b67..0000000000 --- a/nexus/db-queries/src/db/queries/instance.rs +++ /dev/null @@ -1,390 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Implement a query for updating an instance and VMM in a single CTE. 
- -use async_bb8_diesel::AsyncRunQueryDsl; -use diesel::prelude::QueryResult; -use diesel::query_builder::{Query, QueryFragment, QueryId}; -use diesel::result::Error as DieselError; -use diesel::sql_types::{Nullable, Uuid as SqlUuid}; -use diesel::{pg::Pg, query_builder::AstPass}; -use diesel::{Column, ExpressionMethods, QueryDsl, RunQueryDsl}; -use nexus_db_model::{ - schema::{ - instance::dsl as instance_dsl, migration::dsl as migration_dsl, - vmm::dsl as vmm_dsl, - }, - Generation, InstanceRuntimeState, MigrationState, VmmRuntimeState, -}; -use omicron_common::api::internal::nexus::{ - MigrationRole, MigrationRuntimeState, -}; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; -use uuid::Uuid; - -use crate::db::pool::DbConnection; -use crate::db::update_and_check::UpdateStatus; - -/// A CTE that checks and updates the instance and VMM tables in a single -/// atomic operation. -// -// The single-table update-and-check CTE has the following form: -// -// WITH found AS (SELECT FROM T WHERE ) -// updated AS (UPDATE T SET RETURNING *) -// SELECT -// found. -// updated. -// found.* -// FROM -// found -// LEFT JOIN -// updated -// ON -// found. = updated.; -// -// The idea behind this query is to have separate "found" and "updated" -// subqueries for the instance and VMM tables, then use those to create two more -// subqueries that perform the joins and yield the results, along the following -// lines: -// -// WITH vmm_found AS (SELECT(SELECT id FROM vmm WHERE vmm.id = id) AS id), -// vmm_updated AS (UPDATE vmm SET ... RETURNING *), -// instance_found AS (SELECT( -// SELECT id FROM instance WHERE instance.id = id -// ) AS id), -// instance_updated AS (UPDATE instance SET ... 
RETURNING *), -// vmm_result AS ( -// SELECT vmm_found.id AS found, vmm_updated.id AS updated -// FROM vmm_found -// LEFT JOIN vmm_updated -// ON vmm_found.id = vmm_updated.id -// ), -// instance_result AS ( -// SELECT instance_found.id AS found, instance_updated.id AS updated -// FROM instance_found -// LEFT JOIN instance_updated -// ON instance_found.id = instance_updated.id -// ) -// SELECT vmm_result.found, vmm_result.updated, instance_result.found, -// instance_result.updated -// FROM vmm_result, instance_result; -/// -/// If a [`MigrationRuntimeState`] is provided, similar "found" and "update" -/// clauses are also added to join the `migration` record for the instance's -/// active migration, if one exists, and update the migration record. If no -/// migration record is provided, this part of the query is skipped, and the -/// `migration_found` and `migration_updated` portions are always `false`. -// -// The "wrapper" SELECTs when finding instances and VMMs are used to get a NULL -// result in the final output instead of failing the entire query if the target -// object is missing. This maximizes Nexus's flexibility when dealing with -// updates from sled agent that refer to one valid and one deleted object. (This -// can happen if, e.g., sled agent sends a message indicating that a retired VMM -// has finally been destroyed when its instance has since been deleted.) -pub struct InstanceAndVmmUpdate { - instance_find: Box + Send>, - vmm_find: Box + Send>, - instance_update: Box + Send>, - vmm_update: Box + Send>, - migration: Option, -} - -struct MigrationUpdate { - find: Box + Send>, - update: Box + Send>, -} - -/// Contains the result of a combined instance-and-VMM update operation. -#[derive(Copy, Clone, PartialEq, Debug)] -pub struct InstanceAndVmmUpdateResult { - /// `Some(status)` if the target instance was found; the wrapped - /// `UpdateStatus` indicates whether the row was updated. `None` if the - /// instance was not found. 
- pub instance_status: Option, - - /// `Some(status)` if the target VMM was found; the wrapped `UpdateStatus` - /// indicates whether the row was updated. `None` if the VMM was not found. - pub vmm_status: Option, - - /// `Some(status)` if the target migration was found; the wrapped `UpdateStatus` - /// indicates whether the row was updated. `None` if the migration was not - /// found, or no migration update was performed. - pub migration_status: Option, -} - -/// Computes the update status to return from the results of queries that find -/// and update an object with an ID of type `T`. -fn compute_update_status( - found: Option, - updated: Option, -) -> Option -where - T: PartialEq + std::fmt::Display, -{ - match (found, updated) { - // If both the "find" and "update" prongs returned an ID, the row was - // updated. The IDs should match in this case (if they don't then the - // query was constructed very strangely!). - (Some(found_id), Some(updated_id)) if found_id == updated_id => { - Some(UpdateStatus::Updated) - } - // If the "find" prong returned an ID but the "update" prong didn't, the - // row exists but wasn't updated. - (Some(_), None) => Some(UpdateStatus::NotUpdatedButExists), - // If neither prong returned anything, indicate the row is missing. - (None, None) => None, - // If both prongs returned an ID, but they don't match, something - // terrible has happened--the prongs must have referred to different - // IDs! - (Some(found_id), Some(mismatched_id)) => unreachable!( - "updated ID {} didn't match found ID {}", - mismatched_id, found_id - ), - // Similarly, if the target ID was not found but something was updated - // anyway, then something is wrong with the update query--either it has - // the wrong ID or did not filter rows properly. 
- (None, Some(updated_id)) => unreachable!( - "ID {} was updated but no found ID was supplied", - updated_id - ), - } -} - -impl InstanceAndVmmUpdate { - pub fn new( - instance_id: InstanceUuid, - new_instance_runtime_state: InstanceRuntimeState, - vmm_id: PropolisUuid, - new_vmm_runtime_state: VmmRuntimeState, - migration: Option, - ) -> Self { - let instance_find = Box::new( - instance_dsl::instance - .filter(instance_dsl::id.eq(instance_id.into_untyped_uuid())) - .select(instance_dsl::id), - ); - - let vmm_find = Box::new( - vmm_dsl::vmm - .filter(vmm_dsl::id.eq(vmm_id.into_untyped_uuid())) - .select(vmm_dsl::id), - ); - - let instance_update = Box::new( - diesel::update(instance_dsl::instance) - .filter(instance_dsl::time_deleted.is_null()) - .filter(instance_dsl::id.eq(instance_id.into_untyped_uuid())) - .filter( - instance_dsl::state_generation - .lt(new_instance_runtime_state.gen), - ) - .set(new_instance_runtime_state), - ); - - let vmm_update = Box::new( - diesel::update(vmm_dsl::vmm) - .filter(vmm_dsl::time_deleted.is_null()) - .filter(vmm_dsl::id.eq(vmm_id.into_untyped_uuid())) - .filter(vmm_dsl::state_generation.lt(new_vmm_runtime_state.gen)) - .set(new_vmm_runtime_state), - ); - - let migration = migration.map( - |MigrationRuntimeState { - role, - migration_id, - state, - gen, - time_updated, - }| { - let state = MigrationState::from(state); - let find = Box::new( - migration_dsl::migration - .filter(migration_dsl::id.eq(migration_id)) - .filter(migration_dsl::time_deleted.is_null()) - .select(migration_dsl::id), - ); - let gen = Generation::from(gen); - let update: Box + Send> = match role { - MigrationRole::Target => Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::target_propolis_id - .eq(vmm_id.into_untyped_uuid()), - ) - .filter(migration_dsl::target_gen.lt(gen)) - .set(( - migration_dsl::target_state.eq(state), - migration_dsl::time_target_updated - .eq(time_updated), - 
)), - ), - MigrationRole::Source => Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::source_propolis_id - .eq(vmm_id.into_untyped_uuid()), - ) - .filter(migration_dsl::source_gen.lt(gen)) - .set(( - migration_dsl::source_state.eq(state), - migration_dsl::time_source_updated - .eq(time_updated), - )), - ), - }; - MigrationUpdate { find, update } - }, - ); - - Self { instance_find, vmm_find, instance_update, vmm_update, migration } - } - - pub async fn execute_and_check( - self, - conn: &(impl async_bb8_diesel::AsyncConnection + Sync), - ) -> Result { - let ( - vmm_found, - vmm_updated, - instance_found, - instance_updated, - migration_found, - migration_updated, - ) = self - .get_result_async::<( - Option, - Option, - Option, - Option, - Option, - Option, - )>(conn) - .await?; - - let instance_status = - compute_update_status(instance_found, instance_updated); - let vmm_status = compute_update_status(vmm_found, vmm_updated); - let migration_status = - compute_update_status(migration_found, migration_updated); - - Ok(InstanceAndVmmUpdateResult { - instance_status, - vmm_status, - migration_status, - }) - } -} - -impl QueryId for InstanceAndVmmUpdate { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} - -impl Query for InstanceAndVmmUpdate { - type SqlType = ( - Nullable, - Nullable, - Nullable, - Nullable, - Nullable, - Nullable, - ); -} - -impl RunQueryDsl for InstanceAndVmmUpdate {} - -impl QueryFragment for InstanceAndVmmUpdate { - fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, Pg>) -> QueryResult<()> { - out.push_sql("WITH instance_found AS (SELECT ("); - self.instance_find.walk_ast(out.reborrow())?; - out.push_sql(") AS id), "); - - out.push_sql("vmm_found AS (SELECT ("); - self.vmm_find.walk_ast(out.reborrow())?; - out.push_sql(") AS id), "); - - if let Some(MigrationUpdate { ref find, .. 
}) = self.migration { - out.push_sql("migration_found AS (SELECT ("); - find.walk_ast(out.reborrow())?; - out.push_sql(") AS id), "); - } - - out.push_sql("instance_updated AS ("); - self.instance_update.walk_ast(out.reborrow())?; - out.push_sql(" RETURNING id), "); - - out.push_sql("vmm_updated AS ("); - self.vmm_update.walk_ast(out.reborrow())?; - out.push_sql(" RETURNING id), "); - - if let Some(MigrationUpdate { ref update, .. }) = self.migration { - out.push_sql("migration_updated AS ("); - update.walk_ast(out.reborrow())?; - out.push_sql(" RETURNING id), "); - } - - out.push_sql("vmm_result AS ("); - out.push_sql("SELECT vmm_found."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql(" AS found, vmm_updated."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql(" AS updated"); - out.push_sql(" FROM vmm_found LEFT JOIN vmm_updated ON vmm_found."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql(" = vmm_updated."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql("), "); - - out.push_sql("instance_result AS ("); - out.push_sql("SELECT instance_found."); - out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(" AS found, instance_updated."); - out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(" AS updated"); - out.push_sql( - " FROM instance_found LEFT JOIN instance_updated ON instance_found.", - ); - out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(" = instance_updated."); - out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(")"); - - if self.migration.is_some() { - out.push_sql(", "); - out.push_sql("migration_result AS ("); - out.push_sql("SELECT migration_found."); - out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(" AS found, migration_updated."); - out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(" AS updated"); - out.push_sql( - " FROM migration_found LEFT JOIN migration_updated ON migration_found.", - ); - out.push_identifier(migration_dsl::id::NAME)?; - 
out.push_sql(" = migration_updated."); - out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(")"); - } - out.push_sql(" "); - - out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); - out.push_sql("instance_result.found, instance_result.updated, "); - if self.migration.is_some() { - out.push_sql("migration_result.found, migration_result.updated "); - } else { - out.push_sql("NULL, NULL "); - } - out.push_sql("FROM vmm_result, instance_result"); - if self.migration.is_some() { - out.push_sql(", migration_result"); - } - - Ok(()) - } -} diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index a1022f9187..f88b8fab6d 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -7,7 +7,6 @@ pub mod disk; pub mod external_ip; -pub mod instance; pub mod ip_pool; #[macro_use] mod next_item; diff --git a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs index fd86912107..902d955a79 100644 --- a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs +++ b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs @@ -81,17 +81,9 @@ pub fn from_diesel(e: DieselError) -> external::Error { #[derive(Clone)] enum UpdateKind { InsertStorage(VirtualProvisioningResource), - DeleteStorage { - id: uuid::Uuid, - disk_byte_diff: ByteCount, - }, + DeleteStorage { id: uuid::Uuid, disk_byte_diff: ByteCount }, InsertInstance(VirtualProvisioningResource), - DeleteInstance { - id: uuid::Uuid, - max_instance_gen: i64, - cpus_diff: i64, - ram_diff: ByteCount, - }, + DeleteInstance { id: uuid::Uuid, cpus_diff: i64, ram_diff: ByteCount }, } type SelectableSql = < @@ -246,15 +238,7 @@ WITH ),") .bind::(id) }, - UpdateKind::DeleteInstance { id, max_instance_gen, .. 
} => { - // The filter condition here ensures that the provisioning record is - // only deleted if the corresponding instance has a generation - // number less than the supplied `max_instance_gen`. This allows a - // caller that is about to apply an instance update that will stop - // the instance and that bears generation G to avoid deleting - // resources if the instance generation was already advanced to or - // past G. - // + UpdateKind::DeleteInstance { id, .. } => { // If the relevant instance ID is not in the database, then some // other operation must have ensured the instance was previously // stopped (because that's the only way it could have been deleted), @@ -279,14 +263,13 @@ WITH FROM instance WHERE - instance.id = ").param().sql(" AND instance.state_generation < ").param().sql(" + instance.id = ").param().sql(" LIMIT 1 ) AS update ),") .bind::(id) .bind::(id) - .bind::(max_instance_gen) }, }; @@ -477,7 +460,6 @@ FROM pub fn new_delete_instance( id: InstanceUuid, - max_instance_gen: i64, cpus_diff: i64, ram_diff: ByteCount, project_id: uuid::Uuid, @@ -485,7 +467,6 @@ FROM Self::apply_update( UpdateKind::DeleteInstance { id: id.into_untyped_uuid(), - max_instance_gen, cpus_diff, ram_diff, }, @@ -567,14 +548,9 @@ mod test { let project_id = Uuid::nil(); let cpus_diff = 4; let ram_diff = 2048.try_into().unwrap(); - let max_instance_gen = 0; let query = VirtualProvisioningCollectionUpdate::new_delete_instance( - id, - max_instance_gen, - cpus_diff, - ram_diff, - project_id, + id, cpus_diff, ram_diff, project_id, ); expectorate_query_contents( @@ -678,17 +654,12 @@ mod test { let conn = pool.pool().get().await.unwrap(); let id = InstanceUuid::nil(); - let max_instance_gen = 0; let project_id = Uuid::nil(); let cpus_diff = 16.try_into().unwrap(); let ram_diff = 2048.try_into().unwrap(); let query = VirtualProvisioningCollectionUpdate::new_delete_instance( - id, - max_instance_gen, - cpus_diff, - ram_diff, - project_id, + id, cpus_diff, ram_diff, project_id, 
); let _ = query .explain_async(&conn) diff --git a/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql b/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql index 3c97b7efc7..69b2e017fd 100644 --- a/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql +++ b/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql @@ -40,9 +40,7 @@ WITH 1 ) = 1 - AND EXISTS( - SELECT 1 FROM instance WHERE instance.id = $5 AND instance.state_generation < $6 LIMIT 1 - ) + AND EXISTS(SELECT 1 FROM instance WHERE instance.id = $5 LIMIT 1) AS update ), unused_cte_arm @@ -50,7 +48,7 @@ WITH DELETE FROM virtual_provisioning_resource WHERE - virtual_provisioning_resource.id = $7 AND (SELECT do_update.update FROM do_update LIMIT 1) + virtual_provisioning_resource.id = $6 AND (SELECT do_update.update FROM do_update LIMIT 1) RETURNING virtual_provisioning_resource.id, virtual_provisioning_resource.time_modified, @@ -65,8 +63,8 @@ WITH virtual_provisioning_collection SET time_modified = current_timestamp(), - cpus_provisioned = virtual_provisioning_collection.cpus_provisioned - $8, - ram_provisioned = virtual_provisioning_collection.ram_provisioned - $9 + cpus_provisioned = virtual_provisioning_collection.cpus_provisioned - $7, + ram_provisioned = virtual_provisioning_collection.ram_provisioned - $8 WHERE virtual_provisioning_collection.id = ANY (SELECT all_collections.id FROM all_collections) AND (SELECT do_update.update FROM do_update LIMIT 1) diff --git a/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql new file mode 100644 index 0000000000..bb460ff713 --- /dev/null +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql @@ -0,0 +1,93 @@ +WITH + migration_in_found + AS ( + SELECT + ( + SELECT + 
migration.id + FROM + migration + WHERE + migration.id = $1 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_in_updated + AS ( + UPDATE + migration + SET + target_state = $2, time_target_updated = $3, target_gen = $4 + WHERE + (migration.id = $5 AND migration.target_propolis_id = $6) AND migration.target_gen < $7 + RETURNING + id + ), + migration_in_result + AS ( + SELECT + migration_in_found.id AS found, migration_in_updated.id AS updated + FROM + migration_in_found + LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id + ), + migration_out_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $8 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_out_updated + AS ( + UPDATE + migration + SET + source_state = $9, time_source_updated = $10, source_gen = $11 + WHERE + (migration.id = $12 AND migration.source_propolis_id = $13) AND migration.source_gen < $14 + RETURNING + id + ), + migration_out_result + AS ( + SELECT + migration_out_found.id AS found, migration_out_updated.id AS updated + FROM + migration_out_found + LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $15) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $16, state_generation = $17, state = $18 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $19) AND vmm.state_generation < $20 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + migration_in_result.found, + migration_in_result.updated, + migration_out_result.found, + migration_out_result.updated +FROM + vmm_result, migration_in_result, migration_out_result diff --git a/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_in.sql 
b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_in.sql new file mode 100644 index 0000000000..3fec792c6f --- /dev/null +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_in.sql @@ -0,0 +1,61 @@ +WITH + migration_in_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $1 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_in_updated + AS ( + UPDATE + migration + SET + target_state = $2, time_target_updated = $3, target_gen = $4 + WHERE + (migration.id = $5 AND migration.target_propolis_id = $6) AND migration.target_gen < $7 + RETURNING + id + ), + migration_in_result + AS ( + SELECT + migration_in_found.id AS found, migration_in_updated.id AS updated + FROM + migration_in_found + LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $8) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $9, state_generation = $10, state = $11 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $12) AND vmm.state_generation < $13 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + migration_in_result.found, + migration_in_result.updated, + NULL, + NULL +FROM + vmm_result, migration_in_result diff --git a/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_out.sql b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_out.sql new file mode 100644 index 0000000000..7adeff48da --- /dev/null +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_out.sql @@ -0,0 +1,61 @@ +WITH + migration_out_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $1 AND (migration.time_deleted IS NULL) + 
) + AS id + ), + migration_out_updated + AS ( + UPDATE + migration + SET + source_state = $2, time_source_updated = $3, source_gen = $4 + WHERE + (migration.id = $5 AND migration.source_propolis_id = $6) AND migration.source_gen < $7 + RETURNING + id + ), + migration_out_result + AS ( + SELECT + migration_out_found.id AS found, migration_out_updated.id AS updated + FROM + migration_out_found + LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $8) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $9, state_generation = $10, state = $11 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $12) AND vmm.state_generation < $13 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + NULL, + NULL, + migration_out_result.found, + migration_out_result.updated +FROM + vmm_result, migration_out_result diff --git a/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_only.sql b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_only.sql new file mode 100644 index 0000000000..cfe56740fe --- /dev/null +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_only.sql @@ -0,0 +1,24 @@ +WITH + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $1) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $2, state_generation = $3, state = $4 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $5) AND vmm.state_generation < $6 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, vmm_result.updated, NULL, NULL, NULL, NULL +FROM + vmm_result diff --git a/nexus/examples/config-second.toml 
b/nexus/examples/config-second.toml index 40f5d95a5f..754f37c064 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -132,6 +132,8 @@ region_replacement.period_secs = 30 region_replacement_driver.period_secs = 10 # How frequently to query the status of active instances. instance_watcher.period_secs = 30 +# How frequently to schedule new instance update sagas. +instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index b194ecf1b6..bd50e846bd 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -118,6 +118,8 @@ region_replacement.period_secs = 30 region_replacement_driver.period_secs = 10 # How frequently to query the status of active instances. instance_watcher.period_secs = 30 +# How frequently to schedule new instance update sagas. +instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index 69f80209c3..a531b66df4 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -43,6 +43,7 @@ async-bb8-diesel.workspace = true diesel.workspace = true httptest.workspace = true ipnet.workspace = true +nexus-db-queries = { workspace = true, features = ["testing"] } nexus-reconfigurator-planning.workspace = true nexus-reconfigurator-preparation.workspace = true nexus-inventory.workspace = true diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 3504d41e4d..8bcae27bc0 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -457,6 +457,9 @@ mod test { use crate::overridables::Overridables; use 
crate::Sled; use dns_service_client::DnsDiff; + use internal_dns::config::Host; + use internal_dns::config::Zone; + use internal_dns::names::BOUNDARY_NTP_DNS_NAME; use internal_dns::resolver::Resolver; use internal_dns::ServiceName; use internal_dns::DNS_ZONE; @@ -496,6 +499,7 @@ mod test { use omicron_common::address::get_switch_zone_address; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; + use omicron_common::address::BOUNDARY_NTP_REDUNDANCY; use omicron_common::address::COCKROACHDB_REDUNDANCY; use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::RACK_PREFIX; @@ -662,7 +666,7 @@ mod test { }) .collect(); - let blueprint_dns_zone = blueprint_internal_dns_config( + let mut blueprint_dns_zone = blueprint_internal_dns_config( &blueprint, &sleds_by_id, &Default::default(), @@ -686,6 +690,10 @@ mod test { // 4. Our out-of-service zone does *not* appear in the DNS config, // neither with an AAAA record nor in an SRV record. // + // 5. The boundary NTP zones' IP addresses are mapped to AAAA records in + // the special boundary DNS name (in addition to having their normal + // zone DNS name -> AAAA record from 1). + // // Together, this tells us that we have SRV records for all services, // that those SRV records all point to at least one of the Omicron zones // for that service, and that we correctly ignored zones that were not @@ -720,6 +728,33 @@ mod test { }) .collect(); + // Prune the special boundary NTP DNS name out, collecting their IP + // addresses, and build a list of expected SRV targets to ensure these + // IPs show up both in the special boundary NTP DNS name and as their + // normal SRV records. 
+ let boundary_ntp_ips = blueprint_dns_zone + .records + .remove(BOUNDARY_NTP_DNS_NAME) + .expect("missing boundary NTP DNS name") + .into_iter() + .map(|record| match record { + DnsRecord::Aaaa(ip) => ip, + _ => panic!("expected AAAA record; got {record:?}"), + }); + let mut expected_boundary_ntp_srv_targets = boundary_ntp_ips + .map(|ip| { + let Some(zone_id) = omicron_zones_by_ip.get(&ip) else { + panic!("did not find zone ID for boundary NTP IP {ip}"); + }; + let name = Host::Zone(Zone::Other(*zone_id)).fqdn(); + println!( + "Boundary NTP IP {ip} maps to expected \ + SRV record target {name}" + ); + name + }) + .collect::>(); + // Now go through all the DNS names that have AAAA records and remove // any corresponding Omicron zone. While doing this, construct a set of // the fully-qualified DNS names (i.e., with the zone name suffix @@ -814,6 +849,16 @@ mod test { ]); for (name, records) in &blueprint_dns_zone.records { + let mut this_kind = None; + let kinds_left: Vec<_> = + srv_kinds_expected.iter().copied().collect(); + for kind in kinds_left { + if kind.dns_name() == *name { + srv_kinds_expected.remove(&kind); + this_kind = Some(kind); + } + } + let srvs: Vec<_> = records .iter() .filter_map(|dns_record| match dns_record { @@ -828,19 +873,27 @@ mod test { correspond to a name that points to any Omicron zone", srv.target ); - } - - let kinds_left: Vec<_> = - srv_kinds_expected.iter().copied().collect(); - for kind in kinds_left { - if kind.dns_name() == *name { - srv_kinds_expected.remove(&kind); + if this_kind == Some(ServiceName::BoundaryNtp) { + assert!( + expected_boundary_ntp_srv_targets.contains(&srv.target), + "found boundary NTP SRV record with target {:?} \ + that does not correspond to an expected boundary \ + NTP zone", + srv.target, + ); + expected_boundary_ntp_srv_targets.remove(&srv.target); } } } println!("SRV kinds with no records found: {:?}", srv_kinds_expected); assert!(srv_kinds_expected.is_empty()); + + println!( + "Boundary NTP SRV 
targets not found: {:?}", + expected_boundary_ntp_srv_targets + ); + assert!(expected_boundary_ntp_srv_targets.is_empty()); } #[tokio::test] @@ -1261,6 +1314,7 @@ mod test { cockroachdb_settings: &CockroachDbSettings::empty(), external_ip_rows: &[], service_nic_rows: &[], + target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, target_nexus_zone_count: NEXUS_REDUNDANCY, target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, target_cockroachdb_cluster_version: @@ -1288,7 +1342,8 @@ mod test { .unwrap(); let sled_id = blueprint.sleds().next().expect("expected at least one sled"); - let nalready = builder.sled_num_zones_of_kind(sled_id, ZoneKind::Nexus); + let nalready = + builder.sled_num_running_zones_of_kind(sled_id, ZoneKind::Nexus); let rv = builder .sled_ensure_zone_multiple_nexus(sled_id, nalready + 1) .unwrap(); diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index e3d2019230..bb525b1b8b 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -28,7 +28,6 @@ use std::net::SocketAddrV6; mod cockroachdb; mod datasets; mod dns; -mod external_networking; mod omicron_physical_disks; mod omicron_zones; mod overridables; @@ -117,31 +116,14 @@ where "blueprint_id" => %blueprint.id ); - // Deallocate external networking resources for non-externally-reachable - // zones first. This will allow external networking resource allocation to - // succeed if we are swapping an external IP between two zones (e.g., moving - // a specific external IP from an old external DNS zone to a new one). 
- external_networking::ensure_zone_external_networking_deallocated( - &opctx, - datastore, - blueprint - .all_omicron_zones_not_in( - BlueprintZoneFilter::ShouldBeExternallyReachable, - ) - .map(|(_sled_id, zone)| zone), - ) - .await - .map_err(|err| vec![err])?; - - external_networking::ensure_zone_external_networking_allocated( - &opctx, - datastore, - blueprint - .all_omicron_zones(BlueprintZoneFilter::ShouldBeExternallyReachable) - .map(|(_sled_id, zone)| zone), - ) - .await - .map_err(|err| vec![err])?; + datastore + .blueprint_ensure_external_networking_resources(&opctx, blueprint) + .await + .map_err(|err| { + vec![anyhow!(err).context( + "failed to ensure external networking resources in database", + )] + })?; let sleds_by_id: BTreeMap = datastore .sled_list_all_batched(&opctx, SledFilter::InService) diff --git a/nexus/reconfigurator/planning/proptest-regressions/planner/omicron_zone_placement.txt b/nexus/reconfigurator/planning/proptest-regressions/planner/omicron_zone_placement.txt index bb2ad481bc..17ae1771d1 100644 --- a/nexus/reconfigurator/planning/proptest-regressions/planner/omicron_zone_placement.txt +++ b/nexus/reconfigurator/planning/proptest-regressions/planner/omicron_zone_placement.txt @@ -5,3 +5,4 @@ # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. 
cc 72b902d1405681df2dd46efc097da6840ff1234dc9d0d7c0ecf07bed0b0e7d8d # shrinks to input = _TestPlaceOmicronZonesArgs { input: ArbitraryTestInput { existing_sleds: {[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]: ExistingSled { zones: ZonesToPlace { zones: [] }, waiting_for_ntp: false, num_disks: 1 }}, zones_to_place: ZonesToPlace { zones: [Nexus] } } } +cc d725ad7fd51d0409c2f24088730159c1c3043a7675d46b966e45cb86b570a141 # shrinks to input = _TestPlaceOmicronZonesArgs { input: ArbitraryTestInput { existing_sleds: {[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]: ExistingSled { zones: ZonesToPlace { zones: [] }, num_zpools: 2 }}, zones_to_place: ZonesToPlace { zones: [BoundaryNtp, BoundaryNtp] } } } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 09ae4132f3..2d8a7c9598 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -26,6 +26,7 @@ use nexus_types::deployment::BlueprintZonesConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::DiskFilter; use nexus_types::deployment::OmicronZoneExternalFloatingIp; +use nexus_types::deployment::OmicronZoneExternalSnatIp; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::SledDetails; use nexus_types::deployment::SledFilter; @@ -71,6 +72,7 @@ use typed_rng::UuidRng; use super::external_networking::BuilderExternalNetworking; use super::external_networking::ExternalNetworkingChoice; +use super::external_networking::ExternalSnatNetworkingChoice; use super::zones::is_already_expunged; use super::zones::BuilderZoneState; use super::zones::BuilderZonesConfig; @@ -86,12 +88,14 @@ pub enum Error { NoAvailableZpool { sled_id: SledUuid, kind: ZoneKind }, #[error("no Nexus zones exist in parent blueprint")] NoNexusZonesInParentBlueprint, + #[error("no Boundary NTP zones exist in parent 
blueprint")] + NoBoundaryNtpZonesInParentBlueprint, #[error("no external service IP addresses are available")] NoExternalServiceIpAvailable, #[error("no system MAC addresses are available")] NoSystemMacAddressAvailable, - #[error("exhausted available Nexus IP addresses")] - ExhaustedNexusIps, + #[error("exhausted available OPTE IP addresses for service {kind:?}")] + ExhaustedOpteIps { kind: ZoneKind }, #[error( "invariant violation: found decommissioned sled with \ {num_zones} non-expunged zones: {sled_id}" @@ -101,7 +105,7 @@ pub enum Error { num_zones: usize, }, #[error("programming error in planner")] - Planner(#[from] anyhow::Error), + Planner(#[source] anyhow::Error), } /// Describes whether an idempotent "ensure" operation resulted in action taken @@ -341,8 +345,9 @@ impl<'a> BlueprintBuilder<'a> { pub fn current_sled_zones( &self, sled_id: SledUuid, + filter: BlueprintZoneFilter, ) -> impl Iterator { - self.zones.current_sled_zones(sled_id).map(|(config, _)| config) + self.zones.current_sled_zones(sled_id, filter).map(|(config, _)| config) } /// Assemble a final [`Blueprint`] based on the contents of the builder @@ -432,7 +437,8 @@ impl<'a> BlueprintBuilder<'a> { // Do any zones need to be marked expunged? let mut zones_to_expunge = BTreeMap::new(); - let sled_zones = self.zones.current_sled_zones(sled_id); + let sled_zones = + self.zones.current_sled_zones(sled_id, BlueprintZoneFilter::All); for (zone_config, state) in sled_zones { let zone_id = zone_config.id; let log = log.new(o!( @@ -498,9 +504,9 @@ impl<'a> BlueprintBuilder<'a> { change .expunge_zones(zones_to_expunge.keys().cloned().collect()) .map_err(|error| { - anyhow!(error).context(format!( + Error::Planner(anyhow!(error).context(format!( "for sled {sled_id}, error expunging zones" - )) + ))) })?; // Finally, add comments describing what happened. @@ -620,7 +626,7 @@ impl<'a> BlueprintBuilder<'a> { // If there's already an NTP zone on this sled, do nothing. 
let has_ntp = self .zones - .current_sled_zones(sled_id) + .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) .any(|(z, _)| z.zone_type.is_ntp()); if has_ntp { return Ok(Ensure::NotNeeded); @@ -687,8 +693,10 @@ impl<'a> BlueprintBuilder<'a> { let pool_name = ZpoolName::new_external(zpool_id); // If this sled already has a Crucible zone on this pool, do nothing. - let has_crucible_on_this_pool = - self.zones.current_sled_zones(sled_id).any(|(z, _)| { + let has_crucible_on_this_pool = self + .zones + .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) + .any(|(z, _)| { matches!( &z.zone_type, BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { @@ -739,13 +747,13 @@ impl<'a> BlueprintBuilder<'a> { /// /// This value may change before a blueprint is actually generated if /// further changes are made to the builder. - pub fn sled_num_zones_of_kind( + pub fn sled_num_running_zones_of_kind( &self, sled_id: SledUuid, kind: ZoneKind, ) -> usize { self.zones - .current_sled_zones(sled_id) + .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) .filter(|(z, _)| z.zone_type.kind() == kind) .count() } @@ -793,7 +801,8 @@ impl<'a> BlueprintBuilder<'a> { external_dns_servers: Vec, ) -> Result { // How many Nexus zones do we need to add? 
- let nexus_count = self.sled_num_zones_of_kind(sled_id, ZoneKind::Nexus); + let nexus_count = + self.sled_num_running_zones_of_kind(sled_id, ZoneKind::Nexus); let num_nexus_to_add = match desired_zone_count.checked_sub(nexus_count) { Some(0) => return Ok(EnsureMultiple::NotNeeded), @@ -820,21 +829,19 @@ impl<'a> BlueprintBuilder<'a> { ip: external_ip, }; - let nic = { - NetworkInterface { - id: self.rng.network_interface_rng.next(), - kind: NetworkInterfaceKind::Service { - id: nexus_id.into_untyped_uuid(), - }, - name: format!("nexus-{nexus_id}").parse().unwrap(), - ip: nic_ip, - mac: nic_mac, - subnet: nic_subnet, - vni: Vni::SERVICES_VNI, - primary: true, - slot: 0, - transit_ips: vec![], - } + let nic = NetworkInterface { + id: self.rng.network_interface_rng.next(), + kind: NetworkInterfaceKind::Service { + id: nexus_id.into_untyped_uuid(), + }, + name: format!("nexus-{nexus_id}").parse().unwrap(), + ip: nic_ip, + mac: nic_mac, + subnet: nic_subnet, + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + transit_ips: vec![], }; let ip = self.sled_alloc_ip(sled_id)?; @@ -878,7 +885,7 @@ impl<'a> BlueprintBuilder<'a> { ) -> Result { // How many CRDB zones do we need to add? let crdb_count = - self.sled_num_zones_of_kind(sled_id, ZoneKind::CockroachDb); + self.sled_num_running_zones_of_kind(sled_id, ZoneKind::CockroachDb); let num_crdb_to_add = match desired_zone_count.checked_sub(crdb_count) { Some(0) => return Ok(EnsureMultiple::NotNeeded), Some(n) => n, @@ -920,6 +927,157 @@ impl<'a> BlueprintBuilder<'a> { Ok(EnsureMultiple::Changed { added: num_crdb_to_add, removed: 0 }) } + pub fn sled_promote_internal_ntp_to_boundary_ntp( + &mut self, + sled_id: SledUuid, + ) -> Result { + // The upstream NTP/DNS servers and domain _should_ come from Nexus and + // be modifiable by the operator, but currently can only be set at RSS. + // We can only promote a new boundary NTP zone by copying these settings + // from an existing one. 
+ let (ntp_servers, dns_servers, domain) = self + .parent_blueprint + .all_omicron_zones(BlueprintZoneFilter::All) + .find_map(|(_, z)| match &z.zone_type { + BlueprintZoneType::BoundaryNtp(zone_config) => Some(( + zone_config.ntp_servers.clone(), + zone_config.dns_servers.clone(), + zone_config.domain.clone(), + )), + _ => None, + }) + .ok_or(Error::NoBoundaryNtpZonesInParentBlueprint)?; + + self.sled_promote_internal_ntp_to_boundary_ntp_with_config( + sled_id, + ntp_servers, + dns_servers, + domain, + ) + } + + pub fn sled_promote_internal_ntp_to_boundary_ntp_with_config( + &mut self, + sled_id: SledUuid, + ntp_servers: Vec, + dns_servers: Vec, + domain: Option, + ) -> Result { + // Check the sled id and return an appropriate error if it's invalid. + let _ = self.sled_resources(sled_id)?; + + let sled_zones = self.zones.change_sled_zones(sled_id); + + // Find the internal NTP zone and expunge it. + let mut internal_ntp_zone_id_iter = sled_zones + .iter_zones(BlueprintZoneFilter::ShouldBeRunning) + .filter_map(|config| { + if matches!( + config.zone().zone_type, + BlueprintZoneType::InternalNtp(_) + ) { + Some(config.zone().id) + } else { + None + } + }); + + // We should have exactly one internal NTP zone. + let internal_ntp_zone_id = + internal_ntp_zone_id_iter.next().ok_or_else(|| { + Error::Planner(anyhow!( + "cannot promote internal NTP zone on sled {sled_id}: \ + no internal NTP zone found" + )) + })?; + if internal_ntp_zone_id_iter.next().is_some() { + return Err(Error::Planner(anyhow!( + "sled {sled_id} has multiple internal NTP zones" + ))); + } + std::mem::drop(internal_ntp_zone_id_iter); + + // Expunge the internal NTP zone. + sled_zones.expunge_zone(internal_ntp_zone_id).map_err(|error| { + Error::Planner(anyhow!(error).context(format!( + "error expunging internal NTP zone from sled {sled_id}" + ))) + })?; + + // Add the new boundary NTP zone. 
+ let new_zone_id = self.rng.zone_rng.next(); + let ExternalSnatNetworkingChoice { + snat_cfg, + nic_ip, + nic_subnet, + nic_mac, + } = self.external_networking.for_new_boundary_ntp()?; + let external_ip = OmicronZoneExternalSnatIp { + id: self.rng.external_ip_rng.next(), + snat_cfg, + }; + let nic = NetworkInterface { + id: self.rng.network_interface_rng.next(), + kind: NetworkInterfaceKind::Service { + id: new_zone_id.into_untyped_uuid(), + }, + name: format!("ntp-{new_zone_id}").parse().unwrap(), + ip: nic_ip, + mac: nic_mac, + subnet: nic_subnet, + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + transit_ips: vec![], + }; + + let underlay_ip = self.sled_alloc_ip(sled_id)?; + let port = omicron_common::address::NTP_PORT; + let zone_type = + BlueprintZoneType::BoundaryNtp(blueprint_zone_type::BoundaryNtp { + address: SocketAddrV6::new(underlay_ip, port, 0, 0), + ntp_servers, + dns_servers, + domain, + nic, + external_ip, + }); + let filesystem_pool = + self.sled_select_zpool(sled_id, zone_type.kind())?; + + self.sled_add_zone( + sled_id, + BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id: new_zone_id, + underlay_address: underlay_ip, + filesystem_pool: Some(filesystem_pool), + zone_type, + }, + )?; + + Ok(EnsureMultiple::Changed { added: 1, removed: 1 }) + } + + pub fn sled_expunge_zone( + &mut self, + sled_id: SledUuid, + zone_id: OmicronZoneUuid, + ) -> Result<(), Error> { + // Check the sled id and return an appropriate error if it's invalid. 
+ let _ = self.sled_resources(sled_id)?; + + let sled_zones = self.zones.change_sled_zones(sled_id); + sled_zones.expunge_zone(zone_id).map_err(|error| { + Error::Planner( + anyhow!(error) + .context(format!("failed to expunge zone from sled {sled_id}")), + ) + })?; + + Ok(()) + } + fn sled_add_zone( &mut self, sled_id: SledUuid, @@ -930,8 +1088,10 @@ impl<'a> BlueprintBuilder<'a> { let sled_zones = self.zones.change_sled_zones(sled_id); sled_zones.add_zone(zone).map_err(|error| { - anyhow!(error) - .context(format!("error adding zone to sled {sled_id}")) + Error::Planner( + anyhow!(error) + .context(format!("error adding zone to sled {sled_id}")), + ) })?; Ok(()) @@ -966,7 +1126,10 @@ impl<'a> BlueprintBuilder<'a> { // Record each of the sled's zones' underlay addresses as // allocated. - for (z, _) in self.zones.current_sled_zones(sled_id) { + for (z, _) in self + .zones + .current_sled_zones(sled_id, BlueprintZoneFilter::All) + { allocator.reserve(z.underlay_address); } @@ -995,7 +1158,9 @@ impl<'a> BlueprintBuilder<'a> { // sled already has a durable zone of that kind on the same zpool. Build // up a set of invalid zpools for this sled/kind pair. 
let mut skip_zpools = BTreeSet::new(); - for zone_config in self.current_sled_zones(sled_id) { + for zone_config in self + .current_sled_zones(sled_id, BlueprintZoneFilter::ShouldBeRunning) + { if let Some(zpool) = zone_config.zone_type.durable_zpool() { if zone_kind == zone_config.zone_type.kind() { skip_zpools.insert(zpool); @@ -1124,17 +1289,21 @@ impl<'a> BlueprintZonesBuilder<'a> { pub fn current_sled_zones( &self, sled_id: SledUuid, + filter: BlueprintZoneFilter, ) -> Box + '_> { if let Some(sled_zones) = self.changed_zones.get(&sled_id) { - Box::new(sled_zones.iter_zones().map(|z| (z.zone(), z.state()))) - } else if let Some(parent_zones) = self.parent_zones.get(&sled_id) { Box::new( - parent_zones - .zones - .iter() - .map(|z| (z, BuilderZoneState::Unchanged)), + sled_zones.iter_zones(filter).map(|z| (z.zone(), z.state())), ) + } else if let Some(parent_zones) = self.parent_zones.get(&sled_id) { + Box::new(parent_zones.zones.iter().filter_map(move |z| { + if z.disposition.matches(filter) { + Some((z, BuilderZoneState::Unchanged)) + } else { + None + } + })) } else { Box::new(std::iter::empty()) } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs index 3326bfdbe5..93c845add5 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs @@ -6,11 +6,13 @@ use super::Error; use anyhow::bail; use debug_ignore::DebugIgnore; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; +use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; use nexus_types::deployment::PlanningInput; +use nexus_types::inventory::SourceNatConfig; use omicron_common::address::IpRange; use 
omicron_common::address::DNS_OPTE_IPV4_SUBNET; use omicron_common::address::DNS_OPTE_IPV6_SUBNET; @@ -20,7 +22,9 @@ use omicron_common::address::NTP_OPTE_IPV4_SUBNET; use omicron_common::address::NTP_OPTE_IPV6_SUBNET; use omicron_common::address::NUM_SOURCE_NAT_PORTS; use omicron_common::api::external::MacAddr; +use omicron_common::api::internal::shared::SourceNatConfigError; use oxnet::IpNet; +use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::HashSet; @@ -28,22 +32,13 @@ use std::hash::Hash; use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; - -// These imports are currently only used `#[cfg(test)]` methods, but those -// methods will become non-test-only once we support boundary NTP zone -// allocation. -#[cfg(test)] -use nexus_types::inventory::SourceNatConfig; -#[cfg(test)] -use omicron_common::api::internal::shared::SourceNatConfigError; -#[cfg(test)] -use std::collections::btree_map::Entry; -#[cfg(test)] use strum::IntoEnumIterator as _; #[derive(Debug)] pub(super) struct BuilderExternalNetworking<'a> { // These fields mirror how RSS chooses addresses for zone NICs. 
+ boundary_ntp_v4_ips: AvailableIterator<'static, Ipv4Addr>, + boundary_ntp_v6_ips: AvailableIterator<'static, Ipv6Addr>, nexus_v4_ips: AvailableIterator<'static, Ipv4Addr>, nexus_v6_ips: AvailableIterator<'static, Ipv6Addr>, @@ -100,6 +95,10 @@ impl<'a> BuilderExternalNetworking<'a> { let mut existing_nexus_v4_ips: HashSet = HashSet::new(); let mut existing_nexus_v6_ips: HashSet = HashSet::new(); + let mut existing_boundary_ntp_v4_ips: HashSet = + HashSet::new(); + let mut existing_boundary_ntp_v6_ips: HashSet = + HashSet::new(); let mut external_ip_alloc = ExternalIpAllocator::new(input.service_ip_pool_ranges()); let mut used_macs: HashSet = HashSet::new(); @@ -108,8 +107,20 @@ impl<'a> BuilderExternalNetworking<'a> { parent_blueprint.all_omicron_zones(BlueprintZoneFilter::All) { let zone_type = &z.zone_type; - if let BlueprintZoneType::Nexus(nexus) = zone_type { - match nexus.nic.ip { + match zone_type { + BlueprintZoneType::BoundaryNtp(ntp) => match ntp.nic.ip { + IpAddr::V4(ip) => { + if !existing_boundary_ntp_v4_ips.insert(ip) { + bail!("duplicate Boundary NTP NIC IP: {ip}"); + } + } + IpAddr::V6(ip) => { + if !existing_boundary_ntp_v6_ips.insert(ip) { + bail!("duplicate Boundary NTP NIC IP: {ip}"); + } + } + }, + BlueprintZoneType::Nexus(nexus) => match nexus.nic.ip { IpAddr::V4(ip) => { if !existing_nexus_v4_ips.insert(ip) { bail!("duplicate Nexus NIC IP: {ip}"); @@ -120,7 +131,8 @@ impl<'a> BuilderExternalNetworking<'a> { bail!("duplicate Nexus NIC IP: {ip}"); } } - } + }, + _ => (), } if let Some((external_ip, nic)) = zone_type.external_networking() { @@ -171,7 +183,12 @@ impl<'a> BuilderExternalNetworking<'a> { } } IpAddr::V4(ip) if NTP_OPTE_IPV4_SUBNET.contains(ip) => { - // TODO check existing_ntp_v4_ips, once it exists + if !existing_boundary_ntp_v4_ips.contains(&ip) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } } IpAddr::V4(ip) if DNS_OPTE_IPV4_SUBNET.contains(ip) => { // TODO 
check existing_dns_v4_ips, once it exists @@ -185,7 +202,12 @@ impl<'a> BuilderExternalNetworking<'a> { } } IpAddr::V6(ip) if NTP_OPTE_IPV6_SUBNET.contains(ip) => { - // TODO check existing_ntp_v6_ips, once it exists + if !existing_boundary_ntp_v6_ips.contains(&ip) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } } IpAddr::V6(ip) if DNS_OPTE_IPV6_SUBNET.contains(ip) => { // TODO check existing_dns_v6_ips, once it exists @@ -217,10 +239,22 @@ impl<'a> BuilderExternalNetworking<'a> { .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES), existing_nexus_v6_ips, ); + let boundary_ntp_v4_ips = AvailableIterator::new( + NTP_OPTE_IPV4_SUBNET + .addr_iter() + .skip(NUM_INITIAL_RESERVED_IP_ADDRESSES), + existing_boundary_ntp_v4_ips, + ); + let boundary_ntp_v6_ips = AvailableIterator::new( + NTP_OPTE_IPV6_SUBNET.iter().skip(NUM_INITIAL_RESERVED_IP_ADDRESSES), + existing_boundary_ntp_v6_ips, + ); let available_system_macs = AvailableIterator::new(MacAddr::iter_system(), used_macs); Ok(Self { + boundary_ntp_v4_ips, + boundary_ntp_v6_ips, nexus_v4_ips, nexus_v6_ips, external_ip_alloc, @@ -236,14 +270,14 @@ impl<'a> BuilderExternalNetworking<'a> { IpAddr::V4(_) => ( self.nexus_v4_ips .next() - .ok_or(Error::ExhaustedNexusIps)? + .ok_or(Error::ExhaustedOpteIps { kind: ZoneKind::Nexus })? .into(), IpNet::from(*NEXUS_OPTE_IPV4_SUBNET), ), IpAddr::V6(_) => ( self.nexus_v6_ips .next() - .ok_or(Error::ExhaustedNexusIps)? + .ok_or(Error::ExhaustedOpteIps { kind: ZoneKind::Nexus })? .into(), IpNet::from(*NEXUS_OPTE_IPV6_SUBNET), ), @@ -260,6 +294,43 @@ impl<'a> BuilderExternalNetworking<'a> { nic_mac, }) } + + pub(super) fn for_new_boundary_ntp( + &mut self, + ) -> Result { + let snat_cfg = self.external_ip_alloc.claim_next_snat_ip()?; + let (nic_ip, nic_subnet) = match snat_cfg.ip { + IpAddr::V4(_) => ( + self.boundary_ntp_v4_ips + .next() + .ok_or(Error::ExhaustedOpteIps { + kind: ZoneKind::BoundaryNtp, + })? 
+ .into(), + IpNet::from(*NTP_OPTE_IPV4_SUBNET), + ), + IpAddr::V6(_) => ( + self.boundary_ntp_v6_ips + .next() + .ok_or(Error::ExhaustedOpteIps { + kind: ZoneKind::BoundaryNtp, + })? + .into(), + IpNet::from(*NTP_OPTE_IPV6_SUBNET), + ), + }; + let nic_mac = self + .available_system_macs + .next() + .ok_or(Error::NoSystemMacAddressAvailable)?; + + Ok(ExternalSnatNetworkingChoice { + snat_cfg, + nic_ip, + nic_subnet, + nic_mac, + }) + } } #[derive(Debug, Clone, Copy)] @@ -270,6 +341,14 @@ pub(super) struct ExternalNetworkingChoice { pub(super) nic_mac: MacAddr, } +#[derive(Debug, Clone, Copy)] +pub(super) struct ExternalSnatNetworkingChoice { + pub(super) snat_cfg: SourceNatConfig, + pub(super) nic_ip: IpAddr, + pub(super) nic_subnet: IpNet, + pub(super) nic_mac: MacAddr, +} + /// Combines a base iterator with an `in_use` set, filtering out any elements /// that are in the "in_use" set. /// @@ -407,9 +486,6 @@ impl<'a> ExternalIpAllocator<'a> { Err(Error::NoExternalServiceIpAvailable) } - // This is currently only used by a unit test, but will be used by real code - // once we support boundary NTP zone allocation. - #[cfg(test)] fn claim_next_snat_ip(&mut self) -> Result { // Prefer reusing an existing SNAT IP, if we still have port ranges // available on that ip. @@ -453,9 +529,6 @@ enum SnatPortRange { } impl SnatPortRange { - // This is currently only used by a unit test, but will be used by real code - // once we support boundary NTP zone allocation. 
- #[cfg(test)] fn into_source_nat_config(self, ip: IpAddr) -> SourceNatConfig { let first = match self { SnatPortRange::One => 0, diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/zones.rs b/nexus/reconfigurator/planning/src/blueprint_builder/zones.rs index 6cb76539ec..1413dfec19 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/zones.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/zones.rs @@ -5,7 +5,8 @@ use std::collections::BTreeSet; use nexus_types::deployment::{ - BlueprintZoneConfig, BlueprintZoneDisposition, BlueprintZonesConfig, + BlueprintZoneConfig, BlueprintZoneDisposition, BlueprintZoneFilter, + BlueprintZonesConfig, }; use omicron_common::api::external::Generation; use omicron_uuid_kinds::OmicronZoneUuid; @@ -71,6 +72,31 @@ impl BuilderZonesConfig { Ok(()) } + pub(super) fn expunge_zone( + &mut self, + zone_id: OmicronZoneUuid, + ) -> Result<(), BuilderZonesConfigError> { + let zone = self + .zones + .iter_mut() + .find(|zone| zone.zone.id == zone_id) + .ok_or_else(|| { + let mut unmatched = BTreeSet::new(); + unmatched.insert(zone_id); + BuilderZonesConfigError::ExpungeUnmatchedZones { unmatched } + })?; + + // Check that the zone is expungeable. Typically, zones passed + // in here should have had this check done to them already, but + // in case they're not, or in case something else about those + // zones changed in between, check again. 
+ is_already_expunged(&zone.zone, zone.state)?; + zone.zone.disposition = BlueprintZoneDisposition::Expunged; + zone.state = BuilderZoneState::Modified; + + Ok(()) + } + pub(super) fn expunge_zones( &mut self, mut zones: BTreeSet, @@ -100,8 +126,9 @@ impl BuilderZonesConfig { pub(super) fn iter_zones( &self, + filter: BlueprintZoneFilter, ) -> impl Iterator { - self.zones.iter() + self.zones.iter().filter(move |z| z.zone().disposition.matches(filter)) } pub(super) fn build(self) -> BlueprintZonesConfig { @@ -279,7 +306,10 @@ mod tests { // Iterate over the zones for the sled and ensure that the NTP zone is // present. { - let mut zones = builder.zones.current_sled_zones(new_sled_id); + let mut zones = builder.zones.current_sled_zones( + new_sled_id, + BlueprintZoneFilter::ShouldBeRunning, + ); let (_, state) = zones.next().expect("exactly one zone for sled"); assert!(zones.next().is_none(), "exactly one zone for sled"); assert_eq!( @@ -323,7 +353,7 @@ mod tests { // Attempt to expunge one of the other zones on the sled. let existing_zone_id = change - .iter_zones() + .iter_zones(BlueprintZoneFilter::ShouldBeRunning) .find(|z| z.zone.id != new_zone_id) .expect("at least one existing zone") .zone @@ -352,7 +382,10 @@ mod tests { { // Iterate over the zones and ensure that the Oximeter zone is // present, and marked added. 
- let mut zones = builder.zones.current_sled_zones(existing_sled_id); + let mut zones = builder.zones.current_sled_zones( + existing_sled_id, + BlueprintZoneFilter::ShouldBeRunning, + ); zones .find_map(|(z, state)| { if z.id == new_zone_id { diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 509c6722cb..3bd1b8757e 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -16,6 +16,7 @@ use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::CockroachDbSettings; @@ -144,8 +145,10 @@ impl<'a> Planner<'a> { // Check 2: have all this sled's zones been expunged? It's possible // we ourselves have made this change, which is fine. 
- let all_zones_expunged = - self.blueprint.current_sled_zones(sled_id).all(|zone| { + let all_zones_expunged = self + .blueprint + .current_sled_zones(sled_id, BlueprintZoneFilter::All) + .all(|zone| { zone.disposition == BlueprintZoneDisposition::Expunged }); @@ -187,7 +190,7 @@ impl<'a> Planner<'a> { if !commissioned_sled_ids.contains(&sled_id) { let num_zones = self .blueprint - .current_sled_zones(sled_id) + .current_sled_zones(sled_id, BlueprintZoneFilter::All) .filter(|zone| { zone.disposition != BlueprintZoneDisposition::Expunged }) @@ -351,8 +354,9 @@ impl<'a> Planner<'a> { let mut zone_placement = None; for zone_kind in [ - DiscretionaryOmicronZone::Nexus, + DiscretionaryOmicronZone::BoundaryNtp, DiscretionaryOmicronZone::CockroachDb, + DiscretionaryOmicronZone::Nexus, ] { let num_zones_to_add = self.num_additional_zones_needed(zone_kind); if num_zones_to_add == 0 { @@ -361,29 +365,30 @@ impl<'a> Planner<'a> { // We need to add at least one zone; construct our `zone_placement` // (or reuse the existing one if a previous loop iteration already // created it). - let zone_placement = match zone_placement.as_mut() { - Some(zone_placement) => zone_placement, - None => { - // This constructs a picture of the sleds as we currently - // understand them, as far as which sleds have discretionary - // zones. This will remain valid as we loop through the - // `zone_kind`s in this function, as any zone additions will - // update the `zone_placement` heap in-place. - let current_discretionary_zones = self - .input - .all_sled_resources(SledFilter::Discretionary) - .filter(|(sled_id, _)| { - !sleds_waiting_for_ntp_zone.contains(&sled_id) - }) - .map(|(sled_id, sled_resources)| { - OmicronZonePlacementSledState { + let zone_placement = zone_placement.get_or_insert_with(|| { + // This constructs a picture of the sleds as we currently + // understand them, as far as which sleds have discretionary + // zones. 
This will remain valid as we loop through the + // `zone_kind`s in this function, as any zone additions will + // update the `zone_placement` heap in-place. + let current_discretionary_zones = self + .input + .all_sled_resources(SledFilter::Discretionary) + .filter(|(sled_id, _)| { + !sleds_waiting_for_ntp_zone.contains(&sled_id) + }) + .map(|(sled_id, sled_resources)| { + OmicronZonePlacementSledState { sled_id, num_zpools: sled_resources .all_zpools(ZpoolFilter::InService) .count(), discretionary_zones: self .blueprint - .current_sled_zones(sled_id) + .current_sled_zones( + sled_id, + BlueprintZoneFilter::ShouldBeRunning, + ) .filter_map(|zone| { DiscretionaryOmicronZone::from_zone_type( &zone.zone_type, @@ -391,12 +396,9 @@ impl<'a> Planner<'a> { }) .collect(), } - }); - zone_placement.insert(OmicronZonePlacement::new( - current_discretionary_zones, - )) - } - }; + }); + OmicronZonePlacement::new(current_discretionary_zones) + }); self.add_discretionary_zones( zone_placement, zone_kind, @@ -421,17 +423,20 @@ impl<'a> Planner<'a> { for sled_id in self.input.all_sled_ids(SledFilter::InService) { let num_zones_of_kind = self .blueprint - .sled_num_zones_of_kind(sled_id, zone_kind.into()); + .sled_num_running_zones_of_kind(sled_id, zone_kind.into()); num_existing_kind_zones += num_zones_of_kind; } let target_count = match zone_kind { - DiscretionaryOmicronZone::Nexus => { - self.input.target_nexus_zone_count() + DiscretionaryOmicronZone::BoundaryNtp => { + self.input.target_boundary_ntp_zone_count() } DiscretionaryOmicronZone::CockroachDb => { self.input.target_cockroachdb_zone_count() } + DiscretionaryOmicronZone::Nexus => { + self.input.target_nexus_zone_count() + } }; // TODO-correctness What should we do if we have _too many_ @@ -496,29 +501,36 @@ impl<'a> Planner<'a> { // total zones go on a given sled, but we have a count of how many // we want to add. Construct a new target count. Maybe the builder // should provide a different interface here? 
- let new_total_zone_count = - self.blueprint.sled_num_zones_of_kind(sled_id, kind.into()) - + additional_zone_count; + let new_total_zone_count = self + .blueprint + .sled_num_running_zones_of_kind(sled_id, kind.into()) + + additional_zone_count; let result = match kind { - DiscretionaryOmicronZone::Nexus => { - self.blueprint.sled_ensure_zone_multiple_nexus( + DiscretionaryOmicronZone::BoundaryNtp => self + .blueprint + .sled_promote_internal_ntp_to_boundary_ntp(sled_id)?, + DiscretionaryOmicronZone::CockroachDb => { + self.blueprint.sled_ensure_zone_multiple_cockroachdb( sled_id, new_total_zone_count, )? } - DiscretionaryOmicronZone::CockroachDb => { - self.blueprint.sled_ensure_zone_multiple_cockroachdb( + DiscretionaryOmicronZone::Nexus => { + self.blueprint.sled_ensure_zone_multiple_nexus( sled_id, new_total_zone_count, )? } }; match result { - EnsureMultiple::Changed { added, removed: _ } => { + EnsureMultiple::Changed { added, removed } => { info!( - self.log, "will add {added} Nexus zone(s) to sled"; + self.log, "modified zones on sled"; "sled_id" => %sled_id, + "kind" => ?kind, + "added" => added, + "removed" => removed, ); new_zones_added += added; } @@ -1389,11 +1401,18 @@ mod test { assert_eq!(diff.sleds_removed.len(), 0); assert_eq!(diff.sleds_modified.len(), 1); - // We should be removing all zones using this zpool - assert_eq!(diff.zones.added.len(), 0); + // We should be removing all zones using this zpool. Because we're + // removing the NTP zone, we should add a new one. 
+ assert_eq!(diff.zones.added.len(), 1); assert_eq!(diff.zones.removed.len(), 0); assert_eq!(diff.zones.modified.len(), 1); + let (_zone_id, added_zones) = diff.zones.added.iter().next().unwrap(); + assert_eq!(added_zones.zones.len(), 1); + for zone in &added_zones.zones { + assert_eq!(zone.kind(), ZoneKind::InternalNtp); + } + let (_zone_id, modified_zones) = diff.zones.modified.iter().next().unwrap(); assert_eq!(modified_zones.zones.len(), zones_using_zpool); diff --git a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs index dcfb3b3150..c08f30124c 100644 --- a/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs +++ b/nexus/reconfigurator/planning/src/planner/omicron_zone_placement.rs @@ -14,8 +14,9 @@ use std::mem; #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[cfg_attr(test, derive(test_strategy::Arbitrary))] pub(crate) enum DiscretionaryOmicronZone { - Nexus, + BoundaryNtp, CockroachDb, + Nexus, // TODO expand this enum as we start to place more services } @@ -24,11 +25,11 @@ impl DiscretionaryOmicronZone { zone_type: &BlueprintZoneType, ) -> Option { match zone_type { - BlueprintZoneType::Nexus(_) => Some(Self::Nexus), + BlueprintZoneType::BoundaryNtp(_) => Some(Self::BoundaryNtp), BlueprintZoneType::CockroachDb(_) => Some(Self::CockroachDb), + BlueprintZoneType::Nexus(_) => Some(Self::Nexus), // Zones that we should place but don't yet. 
- BlueprintZoneType::BoundaryNtp(_) - | BlueprintZoneType::Clickhouse(_) + BlueprintZoneType::Clickhouse(_) | BlueprintZoneType::ClickhouseKeeper(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::ExternalDns(_) @@ -46,8 +47,9 @@ impl DiscretionaryOmicronZone { impl From for ZoneKind { fn from(zone: DiscretionaryOmicronZone) -> Self { match zone { - DiscretionaryOmicronZone::Nexus => Self::Nexus, + DiscretionaryOmicronZone::BoundaryNtp => Self::BoundaryNtp, DiscretionaryOmicronZone::CockroachDb => Self::CockroachDb, + DiscretionaryOmicronZone::Nexus => Self::Nexus, } } } @@ -68,6 +70,15 @@ pub(super) struct OmicronZonePlacementSledState { pub discretionary_zones: Vec, } +impl OmicronZonePlacementSledState { + fn num_discretionary_zones_of_kind( + &self, + kind: DiscretionaryOmicronZone, + ) -> usize { + self.discretionary_zones.iter().filter(|&&z| z == kind).count() + } +} + /// `OmicronZonePlacement` keeps an internal heap of sleds and their current /// discretionary zones and chooses sleds for placement of additional /// discretionary zones. @@ -154,21 +165,24 @@ impl OmicronZonePlacement { let mut sleds_skipped = Vec::new(); let mut chosen_sled = None; while let Some(sled) = self.sleds.pop() { - // Ensure we have at least one zpool more than the number of - // `zone_kind` zones already placed on this sled. If we don't, we - // already have a zone of this kind on each zpool, so we'll skip - // this sled. - if sled - .discretionary_zones - .iter() - .filter(|&&z| z == zone_kind) - .count() - < sled.num_zpools - { + let num_existing = sled.num_discretionary_zones_of_kind(zone_kind); + + // For boundary NTP, a sled is only eligible if it does not already + // hold a boundary NTP zone. + let should_skip = zone_kind + == DiscretionaryOmicronZone::BoundaryNtp + && num_existing > 0; + + // For all zone kinds, a sled is only eligible if it has at + // least one zpool more than the number of `zone_kind` zones + // already placed on this sled. 
+ let should_skip = should_skip || num_existing >= sled.num_zpools; + + if should_skip { + sleds_skipped.push(sled); + } else { chosen_sled = Some(sled); break; - } else { - sleds_skipped.push(sled); } } @@ -374,14 +388,22 @@ pub mod test { ) -> Result<(), String> { let sled_state = self.sleds.get(&sled_id).expect("valid sled_id"); let existing_zones = sled_state.count_zones_of_kind(kind); - if existing_zones < sled_state.num_zpools { - Ok(()) - } else { + + // Boundary NTP is special: there should be at most one instance per + // sled, so placing a new boundary NTP zone is only legal if the + // sled doesn't already have one. + if kind == DiscretionaryOmicronZone::BoundaryNtp + && existing_zones > 0 + { + Err(format!("sled {sled_id} already has a boundary NTP zone")) + } else if existing_zones >= sled_state.num_zpools { Err(format!( "already have {existing_zones} \ {kind:?} instances but only {} zpools", sled_state.num_zpools )) + } else { + Ok(()) } } @@ -446,10 +468,20 @@ pub mod test { &self, kind: DiscretionaryOmicronZone, ) -> Result<(), String> { - // Zones should be placeable unless every sled already has a zone of - // this kind on every disk. + let max_this_kind_for_sled = |sled_state: &TestSledState| { + // Boundary NTP zones should be placeable unless every sled + // already has one. Other zone types should be placeable unless + // every sled already has a zone of that kind on every disk. 
+ if kind == DiscretionaryOmicronZone::BoundaryNtp { + usize::min(1, sled_state.num_zpools) + } else { + sled_state.num_zpools + } + }; + for (sled_id, sled_state) in self.sleds.iter() { - if sled_state.count_zones_of_kind(kind) < sled_state.num_zpools + if sled_state.count_zones_of_kind(kind) + < max_this_kind_for_sled(sled_state) { return Err(format!( "sled {sled_id} is eligible for {kind:?} placement" diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index cef0c81b6f..899b896b7d 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -79,6 +79,7 @@ pub struct SystemDescription { sled_subnets: Box, available_non_scrimlet_slots: BTreeSet, available_scrimlet_slots: BTreeSet, + target_boundary_ntp_zone_count: usize, target_nexus_zone_count: usize, target_cockroachdb_zone_count: usize, target_cockroachdb_cluster_version: CockroachDbClusterVersion, @@ -130,9 +131,11 @@ impl SystemDescription { // Policy defaults let target_nexus_zone_count = NEXUS_REDUNDANCY; - // TODO-cleanup This is wrong, but we don't currently set up any CRDB - // nodes in our fake system, so this prevents downstream test issues - // with the planner thinking our system is out of date from the gate. + // TODO-cleanup These are wrong, but we don't currently set up any + // boundary NTP or CRDB nodes in our fake system, so this prevents + // downstream test issues with the planner thinking our system is out of + // date from the gate. 
+ let target_boundary_ntp_zone_count = 0; let target_cockroachdb_zone_count = 0; let target_cockroachdb_cluster_version = @@ -151,6 +154,7 @@ impl SystemDescription { sled_subnets, available_non_scrimlet_slots, available_scrimlet_slots, + target_boundary_ntp_zone_count, target_nexus_zone_count, target_cockroachdb_zone_count, target_cockroachdb_cluster_version, @@ -319,6 +323,7 @@ impl SystemDescription { ) -> anyhow::Result { let policy = Policy { service_ip_pool_ranges: self.service_ip_pool_ranges.clone(), + target_boundary_ntp_zone_count: self.target_boundary_ntp_zone_count, target_nexus_zone_count: self.target_nexus_zone_count, target_cockroachdb_zone_count: self.target_cockroachdb_zone_count, target_cockroachdb_cluster_version: self diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 68971ec3e1..e0ba0f10ba 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -33,6 +33,7 @@ use nexus_types::identity::Resource; use nexus_types::inventory::Collection; use omicron_common::address::IpRange; use omicron_common::address::Ipv6Subnet; +use omicron_common::address::BOUNDARY_NTP_REDUNDANCY; use omicron_common::address::COCKROACHDB_REDUNDANCY; use omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::address::SLED_PREFIX; @@ -60,6 +61,7 @@ pub struct PlanningInputFromDb<'a> { pub ip_pool_range_rows: &'a [nexus_db_model::IpPoolRange], pub external_ip_rows: &'a [nexus_db_model::ExternalIp], pub service_nic_rows: &'a [nexus_db_model::ServiceNetworkInterface], + pub target_boundary_ntp_zone_count: usize, pub target_nexus_zone_count: usize, pub target_cockroachdb_zone_count: usize, pub target_cockroachdb_cluster_version: CockroachDbClusterVersion, @@ -75,6 +77,7 @@ impl PlanningInputFromDb<'_> { self.ip_pool_range_rows.iter().map(IpRange::from).collect(); let policy = Policy { service_ip_pool_ranges, + target_boundary_ntp_zone_count: 
self.target_boundary_ntp_zone_count, target_nexus_zone_count: self.target_nexus_zone_count, target_cockroachdb_zone_count: self.target_cockroachdb_zone_count, target_cockroachdb_cluster_version: self @@ -236,6 +239,7 @@ pub async fn reconfigurator_state_load( sled_rows: &sled_rows, zpool_rows: &zpool_rows, ip_pool_range_rows: &ip_pool_range_rows, + target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, target_nexus_zone_count: NEXUS_REDUNDANCY, target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, target_cockroachdb_cluster_version: CockroachDbClusterVersion::POLICY, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 2f1c4cd738..850e63443a 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -98,6 +98,7 @@ use super::tasks::dns_config; use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::external_endpoints; +use super::tasks::instance_updater; use super::tasks::instance_watcher; use super::tasks::inventory_collection; use super::tasks::lookup_region_port; @@ -154,6 +155,7 @@ pub struct BackgroundTasks { pub task_region_replacement: Activator, pub task_region_replacement_driver: Activator, pub task_instance_watcher: Activator, + pub task_instance_updater: Activator, pub task_service_firewall_propagation: Activator, pub task_abandoned_vmm_reaper: Activator, pub task_vpc_route_manager: Activator, @@ -234,6 +236,7 @@ impl BackgroundTasksInitializer { task_region_replacement: Activator::new(), task_region_replacement_driver: Activator::new(), task_instance_watcher: Activator::new(), + task_instance_updater: Activator::new(), task_service_firewall_propagation: Activator::new(), task_abandoned_vmm_reaper: Activator::new(), task_vpc_route_manager: Activator::new(), @@ -294,6 +297,7 @@ impl BackgroundTasksInitializer { task_region_replacement, task_region_replacement_driver, task_instance_watcher, + task_instance_updater, task_service_firewall_propagation, 
task_abandoned_vmm_reaper, task_vpc_route_manager, @@ -613,10 +617,9 @@ impl BackgroundTasksInitializer { { let watcher = instance_watcher::InstanceWatcher::new( datastore.clone(), - resolver.clone(), + sagas.clone(), producer_registry, instance_watcher::WatcherIdentity { nexus_id, rack_id }, - task_v2p_manager.clone(), ); driver.register(TaskDefinition { name: "instance_watcher", @@ -629,6 +632,25 @@ impl BackgroundTasksInitializer { }) }; + // Background task: schedule update sagas for instances in need of + // state updates. + { + let updater = instance_updater::InstanceUpdater::new( + datastore.clone(), + sagas.clone(), + config.instance_updater.disable, + ); + driver.register( TaskDefinition { + name: "instance_updater", + description: "detects if instances require update sagas and schedules them", + period: config.instance_watcher.period_secs, + task_impl: Box::new(updater), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_instance_updater, + }); + } + // Background task: service firewall rule propagation driver.register(TaskDefinition { name: "service_firewall_rule_propagation", diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 460d74360d..ee780812ae 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -124,6 +124,7 @@ mod test { }; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::DataStore; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::BlueprintZoneFilter; @@ -150,7 +151,9 @@ mod test { type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; - fn create_blueprint( + async fn create_blueprint( + datastore: &DataStore, + opctx: &OpContext, blueprint_zones: BTreeMap, blueprint_disks: BTreeMap, dns_version: Generation, @@ -162,28 
+165,46 @@ mod test { .copied() .map(|sled_id| (sled_id, SledState::Active)) .collect::>(); - ( - BlueprintTarget { - target_id: id, - enabled: true, - time_made_target: chrono::Utc::now(), - }, - Blueprint { - id, - blueprint_zones, - blueprint_disks, - sled_state, - cockroachdb_setting_preserve_downgrade: - CockroachDbPreserveDowngrade::DoNotModify, - parent_blueprint_id: None, - internal_dns_version: dns_version, - external_dns_version: dns_version, - cockroachdb_fingerprint: String::new(), - time_created: chrono::Utc::now(), - creator: "test".to_string(), - comment: "test blueprint".to_string(), - }, - ) + + // Ensure the blueprint we're creating is the current target (required + // for successful blueprint realization). This requires its parent to be + // the existing target, so fetch that first. + let current_target = datastore + .blueprint_target_get_current(opctx) + .await + .expect("fetched current target blueprint"); + + let target = BlueprintTarget { + target_id: id, + enabled: true, + time_made_target: chrono::Utc::now(), + }; + let blueprint = Blueprint { + id, + blueprint_zones, + blueprint_disks, + sled_state, + cockroachdb_setting_preserve_downgrade: + CockroachDbPreserveDowngrade::DoNotModify, + parent_blueprint_id: Some(current_target.target_id), + internal_dns_version: dns_version, + external_dns_version: dns_version, + cockroachdb_fingerprint: String::new(), + time_created: chrono::Utc::now(), + creator: "test".to_string(), + comment: "test blueprint".to_string(), + }; + + datastore + .blueprint_insert(opctx, &blueprint) + .await + .expect("inserted new blueprint"); + datastore + .blueprint_target_set_current(opctx, target) + .await + .expect("set new blueprint as current target"); + + (target, blueprint) } #[nexus_test(server = crate::Server)] @@ -253,11 +274,16 @@ mod test { // With a target blueprint having no zones, the task should trivially // complete and report a successful (empty) summary. 
let generation = Generation::new(); - let blueprint = Arc::new(create_blueprint( - BTreeMap::new(), - BTreeMap::new(), - generation, - )); + let blueprint = Arc::new( + create_blueprint( + &datastore, + &opctx, + BTreeMap::new(), + BTreeMap::new(), + generation, + ) + .await, + ); blueprint_tx.send(Some(blueprint)).unwrap(); let value = task.activate(&opctx).await; println!("activating with no zones: {:?}", value); @@ -300,13 +326,16 @@ mod test { // // TODO: add expunged zones to the test (should not be deployed). let mut blueprint = create_blueprint( + &datastore, + &opctx, BTreeMap::from([ (sled_id1, make_zones(BlueprintZoneDisposition::InService)), (sled_id2, make_zones(BlueprintZoneDisposition::Quiesced)), ]), BTreeMap::new(), generation, - ); + ) + .await; // Insert records for the zpools backing the datasets in these zones. for (sled_id, config) in diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs new file mode 100644 index 0000000000..46a3bead21 --- /dev/null +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -0,0 +1,270 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting instances in need of update sagas. 
+ +use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas::instance_update; +use crate::app::sagas::NexusSaga; +use anyhow::Context; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_model::Instance; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::lookup::LookupPath; +use nexus_db_queries::db::DataStore; +use nexus_db_queries::{authn, authz}; +use nexus_types::identity::Resource; +use omicron_common::api::external::ListResultVec; +use serde_json::json; +use std::future::Future; +use std::sync::Arc; +use tokio::task::JoinSet; + +pub struct InstanceUpdater { + datastore: Arc, + sagas: Arc, + disable: bool, +} + +impl InstanceUpdater { + pub fn new( + datastore: Arc, + sagas: Arc, + disable: bool, + ) -> Self { + InstanceUpdater { datastore, sagas, disable } + } + + async fn actually_activate( + &mut self, + opctx: &OpContext, + stats: &mut ActivationStats, + ) -> Result<(), anyhow::Error> { + async fn find_instances( + what: &'static str, + log: &slog::Logger, + last_err: &mut Result<(), anyhow::Error>, + query: impl Future>, + ) -> Vec { + slog::debug!(&log, "looking for instances with {what}..."); + match query.await { + Ok(list) => { + slog::info!( + &log, + "listed instances with {what}"; + "count" => list.len(), + ); + list + } + Err(error) => { + slog::error!( + &log, + "failed to list instances with {what}"; + "error" => %error, + ); + *last_err = Err(error).with_context(|| { + format!("failed to find instances with {what}",) + }); + Vec::new() + } + } + } + + let mut last_err = Ok(()); + let mut sagas = JoinSet::new(); + + // NOTE(eliza): These don't, strictly speaking, need to be two separate + // queries, they probably could instead be `OR`ed together in SQL. I + // just thought it was nice to be able to record the number of instances + // found separately for each state. 
+ let destroyed_active_vmms = find_instances( + "destroyed active VMMs", + &opctx.log, + &mut last_err, + self.datastore.find_instances_with_destroyed_active_vmms(opctx), + ) + .await; + stats.destroyed_active_vmms = destroyed_active_vmms.len(); + self.start_sagas( + &opctx, + stats, + &mut last_err, + &mut sagas, + destroyed_active_vmms, + ) + .await; + + let terminated_active_migrations = find_instances( + "terminated active migrations", + &opctx.log, + &mut last_err, + self.datastore + .find_instances_with_terminated_active_migrations(opctx), + ) + .await; + stats.terminated_active_migrations = terminated_active_migrations.len(); + self.start_sagas( + &opctx, + stats, + &mut last_err, + &mut sagas, + terminated_active_migrations, + ) + .await; + + // Now, wait for the sagas to complete. + while let Some(saga_result) = sagas.join_next().await { + match saga_result { + Err(err) => { + debug_assert!( + false, + "since nexus is compiled with `panic=\"abort\"`, and \ + we never cancel the tasks on the `JoinSet`, a \ + `JoinError` should never be observed!", + ); + stats.sagas_failed += 1; + last_err = Err(err.into()); + } + Ok(Err(err)) => { + warn!(opctx.log, "update saga failed!"; "error" => %err); + stats.sagas_failed += 1; + last_err = Err(err); + } + Ok(Ok(())) => stats.sagas_completed += 1, + } + } + + last_err + } + + async fn start_sagas( + &self, + opctx: &OpContext, + stats: &mut ActivationStats, + last_err: &mut Result<(), anyhow::Error>, + sagas: &mut JoinSet>, + instances: impl IntoIterator, + ) { + let serialized_authn = authn::saga::Serialized::for_opctx(opctx); + for instance in instances { + let instance_id = instance.id(); + let saga = async { + let (.., authz_instance) = + LookupPath::new(&opctx, &self.datastore) + .instance_id(instance_id) + .lookup_for(authz::Action::Modify) + .await?; + instance_update::SagaInstanceUpdate::prepare( + &instance_update::Params { + serialized_authn: serialized_authn.clone(), + authz_instance, + }, + ) + 
.with_context(|| { + format!("failed to prepare instance-update saga for {instance_id}") + }) + } + .await; + match saga { + Ok(saga) => { + let start_saga = self.sagas.clone(); + sagas.spawn(async move { + start_saga.saga_start(saga).await.with_context(|| { + format!("update saga for {instance_id} failed") + }) + }); + stats.sagas_started += 1; + } + Err(err) => { + warn!( + opctx.log, + "failed to start instance-update saga!"; + "instance_id" => %instance_id, + "error" => %err, + ); + stats.saga_start_failures += 1; + *last_err = Err(err); + } + } + } + } +} + +#[derive(Default)] +struct ActivationStats { + destroyed_active_vmms: usize, + terminated_active_migrations: usize, + sagas_started: usize, + sagas_completed: usize, + sagas_failed: usize, + saga_start_failures: usize, +} + +impl BackgroundTask for InstanceUpdater { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let mut stats = ActivationStats::default(); + + let error = if self.disable { + slog::info!(&opctx.log, "background instance updater explicitly disabled"); + None + } else { + match self.actually_activate(opctx, &mut stats).await { + Ok(()) => { + slog::info!( + &opctx.log, + "instance updater activation completed"; + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "terminated_active_migrations" => stats.terminated_active_migrations, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => stats.sagas_completed, + ); + debug_assert_eq!( + stats.sagas_failed, + 0, + "if the task completed successfully, then no sagas \ + should have failed", + ); + debug_assert_eq!( + stats.saga_start_failures, + 0, + "if the task completed successfully, all sagas \ + should have started successfully" + ); + None + } + Err(error) => { + slog::warn!( + &opctx.log, + "instance updater activation failed!"; + "error" => %error, + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "terminated_active_migrations" => 
stats.terminated_active_migrations, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => stats.sagas_completed, + "update_sagas_failed" => stats.sagas_failed, + "update_saga_start_failures" => stats.saga_start_failures, + ); + Some(error.to_string()) + } + } + }; + json!({ + "destroyed_active_vmms": stats.destroyed_active_vmms, + "terminated_active_migrations": stats.terminated_active_migrations, + "sagas_started": stats.sagas_started, + "sagas_completed": stats.sagas_completed, + "sagas_failed": stats.sagas_failed, + "saga_start_failures": stats.saga_start_failures, + "error": error, + }) + } + .boxed() + } +} diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 8a41e2d062..f63c21105e 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -4,8 +4,8 @@ //! Background task for pulling instance state from sled-agents. -use crate::app::background::Activator; use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; use nexus_db_model::Instance; @@ -17,6 +17,7 @@ use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use nexus_types::identity::Resource; +use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_uuid_kinds::GenericUuid; @@ -37,10 +38,9 @@ use virtual_machine::VirtualMachine; /// Background task that periodically checks instance states. 
pub(crate) struct InstanceWatcher { datastore: Arc, - resolver: internal_dns::resolver::Resolver, + sagas: Arc, metrics: Arc>, id: WatcherIdentity, - v2p_manager: Activator, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -51,16 +51,15 @@ const MAX_SLED_AGENTS: NonZeroU32 = unsafe { impl InstanceWatcher { pub(crate) fn new( datastore: Arc, - resolver: internal_dns::resolver::Resolver, + sagas: Arc, producer_registry: &ProducerRegistry, id: WatcherIdentity, - v2p_manager: Activator, ) -> Self { let metrics = Arc::new(Mutex::new(metrics::Metrics::default())); producer_registry .register_producer(metrics::Producer(metrics.clone())) .unwrap(); - Self { datastore, resolver, metrics, id, v2p_manager } + Self { datastore, sagas, metrics, id } } fn check_instance( @@ -70,7 +69,7 @@ impl InstanceWatcher { target: VirtualMachine, ) -> impl Future + Send + 'static { let datastore = self.datastore.clone(); - let resolver = self.resolver.clone(); + let sagas = self.sagas.clone(); let opctx = opctx.child( std::iter::once(( @@ -80,7 +79,6 @@ impl InstanceWatcher { .collect(), ); let client = client.clone(); - let v2p_manager = self.v2p_manager.clone(); async move { slog::trace!(opctx.log, "checking on instance..."); @@ -89,8 +87,12 @@ impl InstanceWatcher { target.instance_id, )) .await; - let mut check = - Check { target, outcome: Default::default(), result: Ok(()) }; + let mut check = Check { + target, + outcome: Default::default(), + result: Ok(()), + update_saga_queued: false, + }; let state = match rsp { Ok(rsp) => rsp.into_inner(), Err(ClientError::ErrorResponse(rsp)) => { @@ -152,50 +154,37 @@ impl InstanceWatcher { let new_runtime_state: SledInstanceState = state.into(); check.outcome = CheckOutcome::Success(new_runtime_state.vmm_state.state.into()); - slog::debug!( + debug!( opctx.log, "updating instance state"; "state" => ?new_runtime_state.vmm_state.state, ); - check.result = crate::app::instance::notify_instance_updated( + match 
crate::app::instance::notify_instance_updated( &datastore, - &resolver, - &opctx, &opctx, - &opctx.log, - &InstanceUuid::from_untyped_uuid(target.instance_id), + InstanceUuid::from_untyped_uuid(target.instance_id), &new_runtime_state, - &v2p_manager, ) .await - .map_err(|e| { - slog::warn!( - opctx.log, - "error updating instance"; - "error" => ?e, - "state" => ?new_runtime_state.vmm_state.state, - ); - Incomplete::UpdateFailed - }) - .and_then(|updated| { - updated.ok_or_else(|| { - slog::warn!( - opctx.log, - "error updating instance: not found in database"; - "state" => ?new_runtime_state.vmm_state.state, - ); - Incomplete::InstanceNotFound - }) - }) - .map(|updated| { - slog::debug!( - opctx.log, - "update successful"; - "instance_updated" => updated.instance_updated, - "vmm_updated" => updated.vmm_updated, - "state" => ?new_runtime_state.vmm_state.state, - ); - }); + { + Err(e) => { + warn!(opctx.log, "error updating instance"; "error" => %e); + check.result = match e { + Error::ObjectNotFound { .. } => { + Err(Incomplete::InstanceNotFound) + } + _ => Err(Incomplete::UpdateFailed), + }; + } + Ok(Some(saga)) => { + check.update_saga_queued = true; + if let Err(e) = sagas.saga_start(saga).await { + warn!(opctx.log, "update saga failed"; "error" => ?e); + check.result = Err(Incomplete::UpdateFailed); + } + } + Ok(None) => {} + }; check } @@ -259,6 +248,8 @@ struct Check { /// Depending on when the error occurred, the `outcome` field may also /// be populated. result: Result<(), Incomplete>, + + update_saga_queued: bool, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)] @@ -418,6 +409,7 @@ impl BackgroundTask for InstanceWatcher { // Now, wait for the check results to come back. 
let mut total: usize = 0; + let mut update_sagas_queued: usize = 0; let mut instance_states: BTreeMap = BTreeMap::new(); let mut check_failures: BTreeMap = @@ -446,7 +438,11 @@ impl BackgroundTask for InstanceWatcher { if let Err(ref reason) = check.result { *check_errors.entry(reason.as_str().into_owned()).or_default() += 1; } + if check.update_saga_queued { + update_sagas_queued += 1; + } self.metrics.lock().unwrap().record_check(check); + } // All requests completed! Prune any old instance metrics for @@ -460,6 +456,7 @@ impl BackgroundTask for InstanceWatcher { "total_completed" => instance_states.len() + check_failures.len(), "total_failed" => check_failures.len(), "total_incomplete" => check_errors.len(), + "update_sagas_queued" => update_sagas_queued, "pruned_instances" => pruned, ); serde_json::json!({ @@ -467,6 +464,7 @@ impl BackgroundTask for InstanceWatcher { "instance_states": instance_states, "failed_checks": check_failures, "incomplete_checks": check_errors, + "update_sagas_queued": update_sagas_queued, "pruned_instances": pruned, }) } diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 5062799bdb..fe041a6daa 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -14,6 +14,7 @@ pub mod dns_config; pub mod dns_propagation; pub mod dns_servers; pub mod external_endpoints; +pub mod instance_updater; pub mod instance_watcher; pub mod inventory_collection; pub mod lookup_region_port; diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index ca4635b13e..e9095cc991 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -17,6 +17,7 @@ use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::SledFilter; use nexus_types::inventory::Collection; +use omicron_common::address::BOUNDARY_NTP_REDUNDANCY; use omicron_common::address::COCKROACHDB_REDUNDANCY; use 
omicron_common::address::NEXUS_REDUNDANCY; use omicron_common::api::external::CreateResult; @@ -175,6 +176,7 @@ impl super::Nexus { ip_pool_range_rows: &ip_pool_range_rows, external_ip_rows: &external_ip_rows, service_nic_rows: &service_nic_rows, + target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, target_nexus_zone_count: NEXUS_REDUNDANCY, target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, target_cockroachdb_cluster_version: diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index a41fa0bd4e..344d2688f7 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -13,6 +13,7 @@ use super::MAX_SSH_KEYS_PER_INSTANCE; use super::MAX_VCPU_PER_INSTANCE; use super::MIN_MEMORY_BYTES_PER_INSTANCE; use crate::app::sagas; +use crate::app::sagas::NexusSaga; use crate::cidata::InstanceCiData; use crate::external_api::params; use cancel_safe_futures::prelude::*; @@ -20,13 +21,12 @@ use futures::future::Fuse; use futures::{FutureExt, SinkExt, StreamExt}; use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; -use nexus_db_model::Vmm; +use nexus_db_model::Vmm as DbVmm; use nexus_db_model::VmmState as DbVmmState; use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; -use nexus_db_queries::db::datastore::instance::InstanceUpdateResult; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; @@ -46,7 +46,6 @@ use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; use omicron_common::api::external::UpdateResult; use omicron_common::api::internal::nexus; -use omicron_common::api::internal::nexus::VmmState; use omicron_common::api::internal::shared::SourceNatConfig; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -59,10 +58,8 @@ use propolis_client::support::InstanceSerialConsoleHelper; use 
propolis_client::support::WSClientOffset; use propolis_client::support::WebSocketStream; use sagas::instance_common::ExternalIpAttach; -use sled_agent_client::types::InstanceMigrationSourceParams; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::InstanceProperties; -use sled_agent_client::types::InstancePutMigrationIdsBody; use sled_agent_client::types::InstancePutStateBody; use std::matches; use std::net::SocketAddr; @@ -189,6 +186,11 @@ pub(crate) enum InstanceRegisterReason { Migrate { vmm_id: PropolisUuid, target_vmm_id: PropolisUuid }, } +enum InstanceStartDisposition { + Start, + AlreadyStarted, +} + impl super::Nexus { pub fn instance_lookup<'a>( &'a self, @@ -524,144 +526,6 @@ impl super::Nexus { self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await } - /// Attempts to set the migration IDs for the supplied instance via the - /// sled specified in `db_instance`. - /// - /// The caller is assumed to have fetched the current instance record from - /// the DB and verified that the record has no migration IDs. - /// - /// Returns `Ok` and the updated instance record if this call successfully - /// updated the instance with the sled agent and that update was - /// successfully reflected into CRDB. Returns `Err` with an appropriate - /// error otherwise. - /// - /// # Panics - /// - /// Asserts that `db_instance` has no migration ID or destination Propolis - /// ID set. 
- pub(crate) async fn instance_set_migration_ids( - &self, - opctx: &OpContext, - instance_id: InstanceUuid, - sled_id: SledUuid, - prev_instance_runtime: &db::model::InstanceRuntimeState, - migration_params: InstanceMigrationSourceParams, - ) -> UpdateResult { - assert!(prev_instance_runtime.migration_id.is_none()); - assert!(prev_instance_runtime.dst_propolis_id.is_none()); - - let (.., authz_instance) = LookupPath::new(opctx, &self.db_datastore) - .instance_id(instance_id.into_untyped_uuid()) - .lookup_for(authz::Action::Modify) - .await?; - - let sa = self.sled_client(&sled_id).await?; - let instance_put_result = sa - .instance_put_migration_ids( - &instance_id, - &InstancePutMigrationIdsBody { - old_runtime: prev_instance_runtime.clone().into(), - migration_params: Some(migration_params), - }, - ) - .await - .map(|res| Some(res.into_inner().into())) - .map_err(|e| SledAgentInstancePutError(e)); - - // Write the updated instance runtime state back to CRDB. If this - // outright fails, this operation fails. If the operation nominally - // succeeds but nothing was updated, this action is outdated and the - // caller should not proceed with migration. - let InstanceUpdateResult { instance_updated, .. } = - match instance_put_result { - Ok(state) => { - self.write_returned_instance_state(&instance_id, state) - .await? - } - Err(e) => { - if e.instance_unhealthy() { - let _ = self - .mark_instance_failed( - &instance_id, - &prev_instance_runtime, - &e, - ) - .await; - } - return Err(e.into()); - } - }; - - if instance_updated { - Ok(self - .db_datastore - .instance_refetch(opctx, &authz_instance) - .await?) - } else { - Err(Error::conflict( - "instance is already migrating, or underwent an operation that \ - prevented this migration from proceeding" - )) - } - } - - /// Attempts to clear the migration IDs for the supplied instance via the - /// sled specified in `db_instance`. - /// - /// The supplied instance record must contain valid migration IDs. 
- /// - /// Returns `Ok` if sled agent accepted the request to clear migration IDs - /// and the resulting attempt to write instance runtime state back to CRDB - /// succeeded. This routine returns `Ok` even if the update was not actually - /// applied (due to a separate generation number change). - /// - /// # Panics - /// - /// Asserts that `db_instance` has a migration ID and destination Propolis - /// ID set. - pub(crate) async fn instance_clear_migration_ids( - &self, - instance_id: InstanceUuid, - sled_id: SledUuid, - prev_instance_runtime: &db::model::InstanceRuntimeState, - ) -> Result<(), Error> { - assert!(prev_instance_runtime.migration_id.is_some()); - assert!(prev_instance_runtime.dst_propolis_id.is_some()); - - let sa = self.sled_client(&sled_id).await?; - let instance_put_result = sa - .instance_put_migration_ids( - &instance_id, - &InstancePutMigrationIdsBody { - old_runtime: prev_instance_runtime.clone().into(), - migration_params: None, - }, - ) - .await - .map(|res| Some(res.into_inner().into())) - .map_err(|e| SledAgentInstancePutError(e)); - - match instance_put_result { - Ok(state) => { - self.write_returned_instance_state(&instance_id, state).await?; - } - Err(e) => { - if e.instance_unhealthy() { - let _ = self - .mark_instance_failed( - &instance_id, - &prev_instance_runtime, - &e, - ) - .await; - } - return Err(e.into()); - } - } - - Ok(()) - } - /// Reboot the specified instance. 
pub(crate) async fn instance_reboot( &self, @@ -719,54 +583,26 @@ impl super::Nexus { .db_datastore .instance_fetch_with_vmm(opctx, &authz_instance) .await?; - let (instance, vmm) = (state.instance(), state.vmm()); - if let Some(vmm) = vmm { - match vmm.runtime.state { - DbVmmState::Starting - | DbVmmState::Running - | DbVmmState::Rebooting => { - debug!(self.log, "asked to start an active instance"; - "instance_id" => %authz_instance.id()); - - return Ok(state); - } - DbVmmState::Stopped => { - let propolis_id = instance - .runtime() - .propolis_id - .expect("needed a VMM ID to fetch a VMM record"); - error!(self.log, - "instance is stopped but still has an active VMM"; - "instance_id" => %authz_instance.id(), - "propolis_id" => %propolis_id); + match instance_start_allowed(&self.log, &state)? { + InstanceStartDisposition::AlreadyStarted => Ok(state), + InstanceStartDisposition::Start => { + let saga_params = sagas::instance_start::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + db_instance: state.instance().clone(), + }; + + self.sagas + .saga_execute::( + saga_params, + ) + .await?; - return Err(Error::internal_error( - "instance is stopped but still has an active VMM", - )); - } - _ => { - return Err(Error::conflict(&format!( - "instance is in state {} but must be {} to be started", - vmm.runtime.state, - InstanceState::Stopped - ))); - } + self.db_datastore + .instance_fetch_with_vmm(opctx, &authz_instance) + .await } } - - let saga_params = sagas::instance_start::Params { - serialized_authn: authn::saga::Serialized::for_opctx(opctx), - db_instance: instance.clone(), - }; - - self.sagas - .saga_execute::( - saga_params, - ) - .await?; - - self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await } /// Make sure the given Instance is stopped. 
@@ -858,11 +694,10 @@ impl super::Nexus { vmm_state: &Option, requested: &InstanceStateChangeRequest, ) -> Result { - let effective_state = if let Some(vmm) = vmm_state { - vmm.runtime.state.into() - } else { - instance_state.runtime().nexus_state.into() - }; + let effective_state = InstanceAndActiveVmm::determine_effective_state( + instance_state, + vmm_state.as_ref(), + ); // Requests that operate on active instances have to be directed to the // instance's current sled agent. If there is none, the request needs to @@ -1014,13 +849,13 @@ impl super::Nexus { // the caller to let it decide how to handle it. // // When creating the zone for the first time, we just get - // Ok(None) here, which is a no-op in write_returned_instance_state. + // Ok(None) here, in which case, there's nothing to write back. match instance_put_result { - Ok(state) => self - .write_returned_instance_state(&instance_id, state) + Ok(Some(ref state)) => self + .notify_instance_updated(opctx, instance_id, state) .await - .map(|_| ()) .map_err(Into::into), + Ok(None) => Ok(()), Err(e) => Err(InstanceStateChangeError::SledAgent(e)), } } @@ -1301,12 +1136,13 @@ impl super::Nexus { }, ) .await - .map(|res| Some(res.into_inner().into())) + .map(|res| res.into_inner().into()) .map_err(|e| SledAgentInstancePutError(e)); match instance_register_result { Ok(state) => { - self.write_returned_instance_state(&instance_id, state).await?; + self.notify_instance_updated(opctx, instance_id, &state) + .await?; } Err(e) => { if e.instance_unhealthy() { @@ -1325,59 +1161,6 @@ impl super::Nexus { Ok(()) } - /// Takes an updated instance state returned from a call to sled agent and - /// writes it back to the database. - /// - /// # Return value - /// - /// - `Ok((instance_updated, vmm_updated))` if no failures occurred. The - /// tuple fields indicate which database records (if any) were updated. 
- /// Note that it is possible for sled agent not to return an updated - /// instance state from a particular API call. In that case, the `state` - /// parameter is `None` and this routine returns `Ok((false, false))`. - /// - `Err` if an error occurred while writing state to the database. A - /// database operation that succeeds but doesn't update anything (e.g. - /// owing to an outdated generation number) will return `Ok`. - async fn write_returned_instance_state( - &self, - instance_id: &InstanceUuid, - state: Option, - ) -> Result { - slog::debug!(&self.log, - "writing instance state returned from sled agent"; - "instance_id" => %instance_id, - "new_state" => ?state); - - if let Some(state) = state { - let update_result = self - .db_datastore - .instance_and_vmm_update_runtime( - instance_id, - &state.instance_state.into(), - &state.propolis_id, - &state.vmm_state.into(), - &state.migration_state, - ) - .await; - - slog::debug!(&self.log, - "attempted to write instance state from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %state.propolis_id, - "result" => ?update_result); - - update_result - } else { - // There was no instance state to write back, so --- perhaps - // obviously --- nothing happened. - Ok(InstanceUpdateResult { - instance_updated: false, - vmm_updated: false, - migration_updated: None, - }) - } - } - /// Attempts to move an instance from `prev_instance_runtime` to the /// `Failed` state in response to an error returned from a call to a sled /// agent instance API, supplied in `reason`. 
@@ -1541,21 +1324,74 @@ impl super::Nexus { pub(crate) async fn notify_instance_updated( &self, opctx: &OpContext, - instance_id: &InstanceUuid, + instance_id: InstanceUuid, new_runtime_state: &nexus::SledInstanceState, ) -> Result<(), Error> { - notify_instance_updated( - &self.datastore(), - self.resolver(), - &self.opctx_alloc, + let saga = notify_instance_updated( + &self.db_datastore, opctx, - &self.log, instance_id, new_runtime_state, - &self.background_tasks.task_v2p_manager, ) .await?; - self.vpc_needed_notify_sleds(); + + // We don't need to wait for the instance update saga to run to + // completion to return OK to the sled-agent --- all it needs to care + // about is that the VMM/migration state in the database was updated. + // Even if we fail to successfully start an update saga, the + // instance-updater background task will eventually see that the + // instance is in a state which requires an update saga, and ensure that + // one is eventually executed. + // + // Therefore, just spawn the update saga in a new task, and return. + if let Some(saga) = saga { + info!(opctx.log, "starting update saga for {instance_id}"; + "instance_id" => %instance_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?new_runtime_state.migrations(), + ); + let sagas = self.sagas.clone(); + let task_instance_updater = + self.background_tasks.task_instance_updater.clone(); + let log = opctx.log.clone(); + tokio::spawn(async move { + // TODO(eliza): maybe we should use the lower level saga API so + // we can see if the saga failed due to the lock being held and + // retry it immediately? 
+ let running_saga = async move { + let runnable_saga = sagas.saga_prepare(saga).await?; + runnable_saga.start().await + } + .await; + let result = match running_saga { + Err(error) => { + error!(&log, "failed to start update saga for {instance_id}"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If we couldn't start the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. + task_instance_updater.activate(); + return; + } + Ok(saga) => { + saga.wait_until_stopped().await.into_omicron_result() + } + }; + if let Err(error) = result { + error!(&log, "update saga for {instance_id} failed"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If we couldn't complete the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. + task_instance_updater.activate(); + } + }); + } + Ok(()) } @@ -1670,7 +1506,7 @@ impl super::Nexus { opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, action: authz::Action, - ) -> Result<(Vmm, SocketAddr), Error> { + ) -> Result<(DbVmm, SocketAddr), Error> { let (.., authz_instance) = instance_lookup.lookup_for(action).await?; let state = self @@ -1717,7 +1553,7 @@ impl super::Nexus { opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, action: authz::Action, - ) -> Result<(Vmm, propolis_client::Client), Error> { + ) -> Result<(DbVmm, propolis_client::Client), Error> { let (vmm, client_addr) = self .propolis_addr_for_instance(opctx, instance_lookup, action) .await?; @@ -1995,193 +1831,136 @@ impl super::Nexus { } /// Invoked by a sled agent to publish an updated runtime state for an -/// Instance. -#[allow(clippy::too_many_arguments)] // :( +/// Instance, returning an update saga for that instance (if one must be +/// executed). 
pub(crate) async fn notify_instance_updated( datastore: &DataStore, - resolver: &internal_dns::resolver::Resolver, - opctx_alloc: &OpContext, opctx: &OpContext, - log: &slog::Logger, - instance_id: &InstanceUuid, + instance_id: InstanceUuid, new_runtime_state: &nexus::SledInstanceState, - v2p_manager: &crate::app::background::Activator, -) -> Result, Error> { +) -> Result, Error> { + use sagas::instance_update; + + let migrations = new_runtime_state.migrations(); let propolis_id = new_runtime_state.propolis_id; + info!(opctx.log, "received new VMM runtime state from sled agent"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?migrations, + ); - info!(log, "received new runtime state from sled agent"; - "instance_id" => %instance_id, - "instance_state" => ?new_runtime_state.instance_state, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?new_runtime_state.migration_state); - - // Grab the current state of the instance in the DB to reason about - // whether this update is stale or not. - let (.., authz_instance, db_instance) = LookupPath::new(&opctx, &datastore) - .instance_id(instance_id.into_untyped_uuid()) - .fetch() + let result = datastore + .vmm_and_migration_update_runtime( + &opctx, + propolis_id, + // TODO(eliza): probably should take this by value... + &new_runtime_state.vmm_state.clone().into(), + migrations, + ) .await?; - // Update OPTE and Dendrite if the instance's active sled assignment - // changed or a migration was retired. If these actions fail, sled agent - // is expected to retry this update. - // - // This configuration must be updated before updating any state in CRDB - // so that, if the instance was migrating or has shut down, it will not - // appear to be able to migrate or start again until the appropriate - // networking state has been written. 
Without this interlock, another - // thread or another Nexus can race with this routine to write - // conflicting configuration. - // - // In the future, this should be replaced by a call to trigger a - // networking state update RPW. - super::instance_network::ensure_updated_instance_network_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - &authz_instance, - db_instance.runtime(), - &new_runtime_state.instance_state, - v2p_manager, - ) - .await?; - - // If the supplied instance state indicates that the instance no longer - // has an active VMM, attempt to delete the virtual provisioning record, - // and the assignment of the Propolis metric producer to an oximeter - // collector. - // - // As with updating networking state, this must be done before - // committing the new runtime state to the database: once the DB is - // written, a new start saga can arrive and start the instance, which - // will try to create its own virtual provisioning charges, which will - // race with this operation. - if new_runtime_state.instance_state.propolis_id.is_none() { - datastore - .virtual_provisioning_collection_delete_instance( - opctx, - *instance_id, - db_instance.project_id, - i64::from(db_instance.ncpus.0 .0), - db_instance.memory, - (&new_runtime_state.instance_state.gen).into(), - ) + // If an instance-update saga must be executed as a result of this update, + // prepare and return it. + if instance_update::update_saga_needed( + &opctx.log, + instance_id, + new_runtime_state, + &result, + ) { + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) .await?; - - // TODO-correctness: The `notify_instance_updated` method can run - // concurrently with itself in some situations, such as where a - // sled-agent attempts to update Nexus about a stopped instance; - // that times out; and it makes another request to a different - // Nexus. 
The call to `unassign_producer` is racy in those - // situations, and we may end with instances with no metrics. - // - // This unfortunate case should be handled as part of - // instance-lifecycle improvements, notably using a reliable - // persistent workflow to correctly update the oximete assignment as - // an instance's state changes. - // - // Tracked in https://github.com/oxidecomputer/omicron/issues/3742. - super::oximeter::unassign_producer( - datastore, - log, - opctx, - &instance_id.into_untyped_uuid(), - ) - .await?; + let saga = instance_update::SagaInstanceUpdate::prepare( + &instance_update::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + authz_instance, + }, + )?; + Ok(Some(saga)) + } else { + Ok(None) } +} - // Write the new instance and VMM states back to CRDB. This needs to be - // done before trying to clean up the VMM, since the datastore will only - // allow a VMM to be marked as deleted if it is already in a terminal - // state. - let result = datastore - .instance_and_vmm_update_runtime( - instance_id, - &db::model::InstanceRuntimeState::from( - new_runtime_state.instance_state.clone(), - ), - &propolis_id, - &db::model::VmmRuntimeState::from( - new_runtime_state.vmm_state.clone(), - ), - &new_runtime_state.migration_state, - ) - .await; +/// Determines the disposition of a request to start an instance given its state +/// (and its current VMM's state, if it has one) in the database. +fn instance_start_allowed( + log: &slog::Logger, + state: &InstanceAndActiveVmm, +) -> Result { + let (instance, vmm) = (state.instance(), state.vmm()); - // If the VMM is now in a terminal state, make sure its resources get - // cleaned up. - // - // For idempotency, only check to see if the update was successfully - // processed and ignore whether the VMM record was actually updated. 
- // This is required to handle the case where this routine is called - // once, writes the terminal VMM state, fails before all per-VMM - // resources are released, returns a retriable error, and is retried: - // the per-VMM resources still need to be cleaned up, but the DB update - // will return Ok(_, false) because the database was already updated. + // If the instance has an active VMM, there's nothing to start, but this + // disposition of this call (succeed for idempotency vs. fail with an + // error describing the conflict) depends on the state that VMM is in. // - // Unlike the pre-update cases, it is legal to do this cleanup *after* - // committing state to the database, because a terminated VMM cannot be - // reused (restarting or migrating its former instance will use new VMM - // IDs). - if result.is_ok() { - let propolis_terminated = matches!( - new_runtime_state.vmm_state.state, - VmmState::Destroyed | VmmState::Failed - ); - - if propolis_terminated { - info!(log, "vmm is terminated, cleaning up resources"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - - datastore - .sled_reservation_delete(opctx, propolis_id.into_untyped_uuid()) - .await?; - - if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? { - warn!(log, "failed to mark vmm record as deleted"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state); - } + // If the instance doesn't have an active VMM, see if the instance state + // permits it to start. + match state.effective_state() { + // If the VMM is already starting or is in another "active" + // state, succeed to make successful start attempts idempotent. 
+ s @ InstanceState::Starting + | s @ InstanceState::Running + | s @ InstanceState::Rebooting + | s @ InstanceState::Migrating => { + debug!(log, "asked to start an active instance"; + "instance_id" => %instance.id(), + "state" => ?s); + + Ok(InstanceStartDisposition::AlreadyStarted) } - } - - match result { - Ok(result) => { - info!(log, "instance and vmm updated by sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "instance_updated" => result.instance_updated, - "vmm_updated" => result.vmm_updated, - "migration_updated" => ?result.migration_updated); - Ok(Some(result)) + InstanceState::Stopped => { + match vmm.as_ref() { + // If a previous start saga failed and left behind a VMM in the + // SagaUnwound state, allow a new start saga to try to overwrite + // it. + Some(vmm) if vmm.runtime.state == DbVmmState::SagaUnwound => { + debug!( + log, + "instance's last VMM's start saga unwound, OK to start"; + "instance_id" => %instance.id() + ); + + Ok(InstanceStartDisposition::Start) + } + // This shouldn't happen: `InstanceAndVmm::effective_state` should + // only return `Stopped` if there is no active VMM or if the VMM is + // `SagaUnwound`. + Some(vmm) => { + error!(log, + "instance is stopped but still has an active VMM"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "propolis_state" => ?vmm.runtime.state); + + Err(Error::internal_error( + "instance is stopped but still has an active VMM", + )) + } + // Ah, it's actually stopped. We can restart it. + None => Ok(InstanceStartDisposition::Start), + } } - - // The update command should swallow object-not-found errors and - // return them back as failures to update, so this error case is - // unexpected. There's no work to do if this occurs, however. - Err(Error::ObjectNotFound { .. 
}) => { - error!(log, "instance/vmm update unexpectedly returned \ - an object not found error"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - Ok(None) + InstanceState::Stopping => { + let (propolis_id, propolis_state) = match vmm.as_ref() { + Some(vmm) => (Some(vmm.id), Some(vmm.runtime.state)), + None => (None, None), + }; + debug!(log, "instance's VMM is still in the process of stopping"; + "instance_id" => %instance.id(), + "propolis_id" => ?propolis_id, + "propolis_state" => ?propolis_state); + Err(Error::conflict( + "instance must finish stopping before it can be started", + )) } - - // If the datastore is unavailable, propagate that to the caller. - // TODO-robustness Really this should be any _transient_ error. How - // can we distinguish? Maybe datastore should emit something - // different from Error with an Into. - Err(error) => { - warn!(log, "failed to update instance from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "error" => ?error); - Err(error) + s => { + return Err(Error::conflict(&format!( + "instance is in state {s} but it must be {} to be started", + InstanceState::Stopped + ))) } } } @@ -2189,15 +1968,23 @@ pub(crate) async fn notify_instance_updated( #[cfg(test)] mod tests { use super::super::Nexus; - use super::{CloseCode, CloseFrame, WebSocketMessage, WebSocketStream}; + use super::*; use core::time::Duration; use futures::{SinkExt, StreamExt}; + use nexus_db_model::{ + Instance as DbInstance, InstanceState as DbInstanceState, + VmmInitialState, VmmState as DbVmmState, + }; + use omicron_common::api::external::{ + Hostname, IdentityMetadataCreateParams, InstanceCpuCount, Name, + }; use omicron_test_utils::dev::test_setup_log; + use params::InstanceNetworkInterfaceAttachment; use propolis_client::support::tungstenite::protocol::Role; use propolis_client::support::{ InstanceSerialConsoleHelper, WSClientOffset, }; - use std::net::{Ipv4Addr, SocketAddr, SocketAddrV4}; + use 
std::net::{IpAddr, Ipv4Addr, SocketAddr, SocketAddrV4}; #[tokio::test] async fn test_serial_console_stream_proxying() { @@ -2290,4 +2077,105 @@ mod tests { .expect("proxy task exited successfully"); logctx.cleanup_successful(); } + + /// Creates an instance record and a VMM record that points back to it. Note + /// that the VMM is *not* installed in the instance's `active_propolis_id` + /// field. + fn make_instance_and_vmm() -> (DbInstance, DbVmm) { + let params = params::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: Name::try_from("elysium".to_owned()).unwrap(), + description: "this instance is disco".to_owned(), + }, + ncpus: InstanceCpuCount(1), + memory: ByteCount::from_gibibytes_u32(1), + hostname: Hostname::try_from("elysium").unwrap(), + user_data: vec![], + network_interfaces: InstanceNetworkInterfaceAttachment::None, + external_ips: vec![], + disks: vec![], + ssh_public_keys: None, + start: false, + }; + + let instance_id = InstanceUuid::from_untyped_uuid(Uuid::new_v4()); + let project_id = Uuid::new_v4(); + let instance = DbInstance::new(instance_id, project_id, ¶ms); + + let propolis_id = PropolisUuid::from_untyped_uuid(Uuid::new_v4()); + let sled_id = SledUuid::from_untyped_uuid(Uuid::new_v4()); + let vmm = DbVmm::new( + propolis_id, + instance_id, + sled_id, + ipnetwork::IpNetwork::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0) + .unwrap(), + 0, + VmmInitialState::Starting, + ); + + (instance, vmm) + } + + #[test] + fn test_instance_start_allowed_when_no_vmm() { + let logctx = test_setup_log("test_instance_start_allowed_when_no_vmm"); + let (mut instance, _vmm) = make_instance_and_vmm(); + instance.runtime_state.nexus_state = DbInstanceState::NoVmm; + let state = InstanceAndActiveVmm::from((instance, None)); + assert!(instance_start_allowed(&logctx.log, &state).is_ok()); + logctx.cleanup_successful(); + } + + #[test] + fn test_instance_start_allowed_when_vmm_in_saga_unwound() { + let logctx = test_setup_log( + 
"test_instance_start_allowed_when_vmm_in_saga_unwound", + ); + let (mut instance, mut vmm) = make_instance_and_vmm(); + instance.runtime_state.nexus_state = DbInstanceState::Vmm; + instance.runtime_state.propolis_id = Some(vmm.id); + vmm.runtime.state = DbVmmState::SagaUnwound; + let state = InstanceAndActiveVmm::from((instance, Some(vmm))); + assert!(instance_start_allowed(&logctx.log, &state).is_ok()); + logctx.cleanup_successful(); + } + + #[test] + fn test_instance_start_forbidden_while_creating() { + let logctx = + test_setup_log("test_instance_start_forbidden_while_creating"); + let (mut instance, _vmm) = make_instance_and_vmm(); + instance.runtime_state.nexus_state = DbInstanceState::Creating; + let state = InstanceAndActiveVmm::from((instance, None)); + assert!(instance_start_allowed(&logctx.log, &state).is_err()); + logctx.cleanup_successful(); + } + + #[test] + fn test_instance_start_idempotent_if_active() { + let logctx = test_setup_log("test_instance_start_idempotent_if_active"); + let (mut instance, mut vmm) = make_instance_and_vmm(); + instance.runtime_state.nexus_state = DbInstanceState::Vmm; + instance.runtime_state.propolis_id = Some(vmm.id); + vmm.runtime.state = DbVmmState::Starting; + let state = + InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone()))); + assert!(instance_start_allowed(&logctx.log, &state).is_ok()); + + vmm.runtime.state = DbVmmState::Running; + let state = + InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone()))); + assert!(instance_start_allowed(&logctx.log, &state).is_ok()); + + vmm.runtime.state = DbVmmState::Rebooting; + let state = + InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone()))); + assert!(instance_start_allowed(&logctx.log, &state).is_ok()); + + vmm.runtime.state = DbVmmState::Migrating; + let state = InstanceAndActiveVmm::from((instance, Some(vmm))); + assert!(instance_start_allowed(&logctx.log, &state).is_ok()); + logctx.cleanup_successful(); + } } diff --git 
a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 5f5274dea2..8cd0a34fbf 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -4,7 +4,6 @@ //! Routines that manage instance-related networking state. -use crate::app::background; use crate::app::switch_port; use ipnetwork::IpNetwork; use nexus_db_model::ExternalIp; @@ -14,11 +13,9 @@ use nexus_db_model::Ipv4NatValues; use nexus_db_model::Vni as DbVni; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; use omicron_common::api::external::Error; -use omicron_common::api::internal::nexus; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::SwitchLocation; use omicron_uuid_kinds::GenericUuid; @@ -230,175 +227,6 @@ pub(crate) async fn boundary_switches( Ok(boundary_switches) } -/// Given old and new instance runtime states, determines the desired -/// networking configuration for a given instance and ensures it has been -/// propagated to all relevant sleds. -/// -/// # Arguments -/// -/// - `datastore`: the datastore to use for lookups and updates. -/// - `log`: the [`slog::Logger`] to log to. -/// - `resolver`: an internal DNS resolver to look up DPD service addresses. -/// - `opctx`: An operation context for this operation. -/// - `opctx_alloc`: An operational context list permissions for all sleds. When -/// called by methods on the [`Nexus`] type, this is the `OpContext` used for -/// instance allocation. In a background task, this may be the background -/// task's operational context; nothing stops you from passing the same -/// `OpContext` as both `opctx` and `opctx_alloc`. -/// - `authz_instance``: A resolved authorization context for the instance of -/// interest. -/// - `prev_instance_state``: The most-recently-recorded instance runtime -/// state for this instance. 
-/// - `new_instance_state`: The instance state that the caller of this routine -/// has observed and that should be used to set up this instance's -/// networking state. -/// -/// # Return value -/// -/// `Ok(())` if this routine completed all the operations it wanted to -/// complete, or an appropriate `Err` otherwise. -#[allow(clippy::too_many_arguments)] // Yeah, I know, I know, Clippy... -pub(crate) async fn ensure_updated_instance_network_config( - datastore: &DataStore, - log: &slog::Logger, - resolver: &internal_dns::resolver::Resolver, - opctx: &OpContext, - opctx_alloc: &OpContext, - authz_instance: &authz::Instance, - prev_instance_state: &db::model::InstanceRuntimeState, - new_instance_state: &nexus::InstanceRuntimeState, - v2p_manager: &background::Activator, -) -> Result<(), Error> { - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - - // If this instance update is stale, do nothing, since the superseding - // update may have allowed the instance's location to change further. - if prev_instance_state.gen >= new_instance_state.gen.into() { - debug!(log, - "instance state generation already advanced, \ - won't touch network config"; - "instance_id" => %instance_id); - - return Ok(()); - } - - // If this update will retire the instance's active VMM, delete its - // networking state. It will be re-established the next time the - // instance starts. - if new_instance_state.propolis_id.is_none() { - info!(log, - "instance cleared its Propolis ID, cleaning network config"; - "instance_id" => %instance_id, - "propolis_id" => ?prev_instance_state.propolis_id); - - clear_instance_networking_state( - datastore, - log, - resolver, - opctx, - opctx_alloc, - authz_instance, - v2p_manager, - ) - .await?; - return Ok(()); - } - - // If the instance still has a migration in progress, don't change - // any networking state until an update arrives that retires that - // migration. 
- // - // This is needed to avoid the following race: - // - // 1. Migration from S to T completes. - // 2. Migration source sends an update that changes the instance's - // active VMM but leaves the migration ID in place. - // 3. Meanwhile, migration target sends an update that changes the - // instance's active VMM and clears the migration ID. - // 4. The migration target's call updates networking state and commits - // the new instance record. - // 5. The instance migrates from T to T' and Nexus applies networking - // configuration reflecting that the instance is on T'. - // 6. The update in step 2 applies configuration saying the instance - // is on sled T. - if new_instance_state.migration_id.is_some() { - debug!(log, - "instance still has a migration in progress, won't touch \ - network config"; - "instance_id" => %instance_id, - "migration_id" => ?new_instance_state.migration_id); - - return Ok(()); - } - - let new_propolis_id = new_instance_state.propolis_id.unwrap(); - - // Updates that end live migration need to push OPTE V2P state even if - // the instance's active sled did not change (see below). - let migration_retired = prev_instance_state.migration_id.is_some() - && new_instance_state.migration_id.is_none(); - - if (prev_instance_state.propolis_id - == new_instance_state.propolis_id.map(GenericUuid::into_untyped_uuid)) - && !migration_retired - { - debug!(log, "instance didn't move, won't touch network config"; - "instance_id" => %instance_id); - - return Ok(()); - } - - // Either the instance moved from one sled to another, or it attempted - // to migrate and failed. Ensure the correct networking configuration - // exists for its current home. - // - // TODO(#3107) This is necessary even if the instance didn't move, - // because registering a migration target on a sled creates OPTE ports - // for its VNICs, and that creates new V2P mappings on that sled that - // place the relevant virtual IPs on the local sled. 
Once OPTE stops - // creating these mappings, this path only needs to be taken if an - // instance has changed sleds. - let new_sled_id = match datastore - .vmm_fetch(&opctx, authz_instance, &new_propolis_id) - .await - { - Ok(vmm) => vmm.sled_id, - - // A VMM in the active position should never be destroyed. If the - // sled sending this message is the owner of the instance's last - // active VMM and is destroying it, it should also have retired that - // VMM. - Err(Error::ObjectNotFound { .. }) => { - error!(log, "instance's active vmm unexpectedly not found"; - "instance_id" => %instance_id, - "propolis_id" => %new_propolis_id); - - return Ok(()); - } - - Err(e) => return Err(e), - }; - - v2p_manager.activate(); - - let (.., sled) = - LookupPath::new(opctx, datastore).sled_id(new_sled_id).fetch().await?; - - instance_ensure_dpd_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - instance_id, - &sled.address(), - None, - ) - .await?; - - Ok(()) -} - /// Ensures that the Dendrite configuration for the supplied instance is /// up-to-date. /// @@ -685,43 +513,6 @@ pub(crate) async fn probe_ensure_dpd_config( Ok(()) } -/// Deletes an instance's OPTE V2P mappings and the boundary switch NAT -/// entries for its external IPs. -/// -/// This routine returns immediately upon encountering any errors (and will -/// not try to destroy any more objects after the point of failure). 
-async fn clear_instance_networking_state( - datastore: &DataStore, - log: &slog::Logger, - resolver: &internal_dns::resolver::Resolver, - opctx: &OpContext, - opctx_alloc: &OpContext, - authz_instance: &authz::Instance, - v2p_manager: &background::Activator, -) -> Result<(), Error> { - v2p_manager.activate(); - - instance_delete_dpd_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - authz_instance, - ) - .await?; - - notify_dendrite_nat_state( - datastore, - log, - resolver, - opctx_alloc, - Some(InstanceUuid::from_untyped_uuid(authz_instance.id())), - true, - ) - .await -} - /// Attempts to delete all of the Dendrite NAT configuration for the /// instance identified by `authz_instance`. /// diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index 2b510a0f12..fcdbb0db59 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -371,12 +371,6 @@ pub(crate) struct StoppedSaga { impl StoppedSaga { /// Fetches the raw Steno result for the saga's execution - /// - /// This is a test-only routine meant for use in tests that need to examine - /// the details of a saga's final state (e.g., examining the exact point at - /// which it failed). Non-test callers should use `into_omicron_result` - /// instead. 
- #[cfg(test)] pub(crate) fn into_raw_result(self) -> SagaResult { self.result } diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 4f0ec7c0c6..d19230892f 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -1065,7 +1065,7 @@ pub mod test { app::sagas::instance_create::SagaInstanceCreate, app::sagas::test_helpers, external_api::params, }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -1201,39 +1201,6 @@ pub mod test { .is_none() } - async fn no_sled_resource_instance_records_exist( - datastore: &DataStore, - ) -> bool { - use nexus_db_queries::db::model::SledResource; - use nexus_db_queries::db::schema::sled_resource::dsl; - - let conn = datastore.pool_connection_for_tests().await.unwrap(); - - datastore - .transaction_retry_wrapper( - "no_sled_resource_instance_records_exist", - ) - .transaction(&conn, |conn| async move { - conn.batch_execute_async( - nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, - ) - .await - .unwrap(); - - Ok(dsl::sled_resource - .filter(dsl::kind.eq( - nexus_db_queries::db::model::SledResourceKind::Instance, - )) - .select(SledResource::as_select()) - .get_results_async::(&conn) - .await - .unwrap() - .is_empty()) - }) - .await - .unwrap() - } - async fn disk_is_detached(datastore: &DataStore) -> bool { use nexus_db_queries::db::model::Disk; use nexus_db_queries::db::schema::disk::dsl; @@ -1267,7 +1234,10 @@ pub mod test { assert!(no_instance_records_exist(datastore).await); assert!(no_network_interface_records_exist(datastore).await); assert!(no_external_ip_records_exist(datastore).await); - assert!(no_sled_resource_instance_records_exist(datastore).await); + assert!( + test_helpers::no_sled_resource_instance_records_exist(cptestctx) + .await + ); assert!( 
test_helpers::no_virtual_provisioning_resource_records_exist( cptestctx diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index b8599feb04..bb4bf282e4 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -16,9 +16,7 @@ use nexus_db_queries::{authn, authz, db}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::{ - InstanceMigrationSourceParams, InstanceMigrationTargetParams, -}; +use sled_agent_client::types::InstanceMigrationTargetParams; use slog::warn; use std::net::{Ipv6Addr, SocketAddr}; use steno::ActionError; @@ -72,22 +70,44 @@ declare_saga_actions! { CREATE_MIGRATION_RECORD -> "migration_record" { + sim_create_migration_record - - sim_delete_migration_record + - sim_fail_migration_record } - // This step the instance's migration ID and destination Propolis ID - // fields. Because the instance is active, its current sled agent maintains - // its most recent runtime state, so to update it, the saga calls into the - // sled and asks it to produce an updated instance record with the - // appropriate migration IDs and a new generation number. + // fields in the database. + // + // If the instance's migration ID has already been set when we attempt to + // set ours, that means we have probably raced with another migrate saga for + // the same instance. If this is the case, this action will fail and the + // saga will unwind. + // + // Yes, it's a bit unfortunate that our attempt to compare-and-swap in a + // migration ID happens only after we've created VMM and migration records, + // and that we'll have to destroy them as we unwind. 
However, the + // alternative, setting the migration IDs *before* records for the target + // VMM and the migration are created, would mean that there is a period of + // time during which the instance record contains foreign keys into the + // `vmm` and `migration` tables that don't have corresponding records to + // those tables. Because the `instance` table is queried in the public API, + // we take care to ensure that it doesn't have "dangling pointers" to + // records in the `vmm` and `migration` tables that don't exist yet. + // + // Note that unwinding this action does *not* clear the migration IDs from + // the instance record. This is to avoid a potential race with the instance + // update saga where: // - // The source sled agent synchronizes concurrent attempts to set these IDs. - // Setting a new migration ID and re-setting an existing ID are allowed, but - // trying to set an ID when a different ID is already present fails. + // - a `instance-migrate` saga sets the migration IDs at instance state + // generation _N_ + // - an `instance-update` saga increments the instance's state generation to + // _N_ + 1 + // - the `instance-migrate` saga unwinds and attempts to clear the migration + // IDs, but can't, because the state generation has advanced. + // + // Instead, we leave the migration IDs in place and rely on setting the VMM + // state to `SagaUnwound` to indicate to other future `instance-migrate` + // sagas that it's okay to start a new migration. SET_MIGRATION_IDS -> "set_migration_ids" { + sim_set_migration_ids - - sim_clear_migration_ids } // This step registers the instance with the destination sled. 
Care is @@ -239,7 +259,7 @@ async fn sim_create_migration_record( .map_err(ActionError::action_failed) } -async fn sim_delete_migration_record( +async fn sim_fail_migration_record( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx: &std::sync::Arc = @@ -251,9 +271,24 @@ async fn sim_delete_migration_record( ); let migration_id = sagactx.lookup::("migrate_id")?; - info!(osagactx.log(), "deleting migration record"; - "migration_id" => %migration_id); - osagactx.datastore().migration_mark_deleted(&opctx, migration_id).await?; + info!( + osagactx.log(), + "migration saga unwinding, marking migration record as failed"; + "instance_id" => %params.instance.id(), + "migration_id" => %migration_id, + ); + // If the migration record wasn't updated, this means it's already deleted, + // which...seems weird, but isn't worth getting the whole saga unwind stuck over. + if let Err(e) = + osagactx.datastore().migration_mark_failed(&opctx, migration_id).await + { + warn!(osagactx.log(), + "Error marking migration record as failed during rollback"; + "instance_id" => %params.instance.id(), + "migration_id" => %migration_id, + "error" => ?e); + } + Ok(()) } @@ -323,75 +358,28 @@ async fn sim_set_migration_ids( let db_instance = ¶ms.instance; let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); - let src_sled_id = SledUuid::from_untyped_uuid(params.src_vmm.sled_id); + let src_propolis_id = PropolisUuid::from_untyped_uuid(params.src_vmm.id); let migration_id = sagactx.lookup::("migrate_id")?; let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; - info!(osagactx.log(), "setting migration IDs on migration source sled"; + info!(osagactx.log(), "setting instance migration IDs"; "instance_id" => %db_instance.id(), - "sled_id" => %src_sled_id, "migration_id" => %migration_id, + "src_propolis_id" => %src_propolis_id, "dst_propolis_id" => %dst_propolis_id, "prev_runtime_state" => ?db_instance.runtime()); - let updated_record = osagactx - .nexus() 
+ osagactx + .datastore() .instance_set_migration_ids( &opctx, instance_id, - src_sled_id, - db_instance.runtime(), - InstanceMigrationSourceParams { dst_propolis_id, migration_id }, - ) - .await - .map_err(ActionError::action_failed)?; - - Ok(updated_record) -} - -async fn sim_clear_migration_ids( - sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); - let params = sagactx.saga_params::()?; - let src_sled_id = SledUuid::from_untyped_uuid(params.src_vmm.sled_id); - let db_instance = - sagactx.lookup::("set_migration_ids")?; - let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); - - info!(osagactx.log(), "clearing migration IDs for saga unwind"; - "instance_id" => %db_instance.id(), - "sled_id" => %src_sled_id, - "prev_runtime_state" => ?db_instance.runtime()); - - // Because the migration never actually started (and thus didn't finish), - // the instance should be at the same Propolis generation as it was when - // migration IDs were set, which means sled agent should accept a request to - // clear them. The only exception is if the instance stopped, but that also - // clears its migration IDs; in that case there is no work to do here. - // - // Other failures to clear migration IDs are handled like any other failure - // to update an instance's state: the callee attempts to mark the instance - // as failed; if the failure occurred because the instance changed state - // such that sled agent could not fulfill the request, the callee will - // produce a stale generation number and will not actually mark the instance - // as failed. 
- if let Err(e) = osagactx - .nexus() - .instance_clear_migration_ids( - instance_id, - src_sled_id, - db_instance.runtime(), + src_propolis_id, + migration_id, + dst_propolis_id, ) .await - { - warn!(osagactx.log(), - "Error clearing migration IDs during rollback"; - "instance_id" => %instance_id, - "error" => ?e); - } - - Ok(()) + .map_err(ActionError::action_failed) } async fn sim_ensure_destination_propolis( @@ -575,21 +563,16 @@ async fn sim_instance_migrate( #[cfg(test)] mod tests { + use super::*; use crate::app::sagas::test_helpers; - use camino::Utf8Path; use dropshot::test_util::ClientTestContext; - use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::{ create_default_ip_pool, create_project, object_create, }; - use nexus_test_utils::start_sled_agent; use nexus_test_utils_macros::nexus_test; use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, }; - use omicron_sled_agent::sim::Server; - - use super::*; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -603,35 +586,6 @@ mod tests { project.identity.id } - async fn add_sleds( - cptestctx: &ControlPlaneTestContext, - num_sleds: usize, - ) -> Vec<(SledUuid, Server)> { - let mut sas = Vec::with_capacity(num_sleds); - for _ in 0..num_sleds { - let sa_id = SledUuid::new_v4(); - let log = - cptestctx.logctx.log.new(o!("sled_id" => sa_id.to_string())); - let addr = - cptestctx.server.get_http_server_internal_address().await; - - info!(&cptestctx.logctx.log, "Adding simulated sled"; "sled_id" => %sa_id); - let update_dir = Utf8Path::new("/should/be/unused"); - let sa = start_sled_agent( - log, - addr, - sa_id, - &update_dir, - omicron_sled_agent::sim::SimMode::Explicit, - ) - .await - .unwrap(); - sas.push((sa_id, sa)); - } - - sas - } - async fn create_instance( client: &ClientTestContext, ) -> omicron_common::api::external::Instance { @@ -659,32 +613,11 @@ mod tests { .await } - fn select_first_alternate_sled( - 
db_vmm: &db::model::Vmm, - other_sleds: &[(SledUuid, Server)], - ) -> SledUuid { - let default_sled_uuid: SledUuid = - nexus_test_utils::SLED_AGENT_UUID.parse().unwrap(); - if other_sleds.is_empty() { - panic!("need at least one other sled"); - } - - if other_sleds.iter().any(|sled| sled.0 == default_sled_uuid) { - panic!("default test sled agent was in other_sleds"); - } - - if db_vmm.sled_id == default_sled_uuid.into_untyped_uuid() { - other_sleds[0].0 - } else { - default_sled_uuid - } - } - #[nexus_test(server = crate::Server)] async fn test_saga_basic_usage_succeeds( cptestctx: &ControlPlaneTestContext, ) { - let other_sleds = add_sleds(cptestctx, 1).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let _project_id = setup_test_project(&client).await; @@ -698,7 +631,8 @@ mod tests { let state = test_helpers::instance_fetch(cptestctx, instance_id).await; let vmm = state.vmm().as_ref().unwrap(); - let dst_sled_id = select_first_alternate_sled(vmm, &other_sleds); + let dst_sled_id = + test_helpers::select_first_alternate_sled(vmm, &other_sleds[..]); let params = Params { serialized_authn: authn::saga::Serialized::for_opctx(&opctx), instance: state.instance().clone(), @@ -731,7 +665,7 @@ mod tests { cptestctx: &ControlPlaneTestContext, ) { let log = &cptestctx.logctx.log; - let other_sleds = add_sleds(cptestctx, 1).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let _project_id = setup_test_project(&client).await; @@ -756,8 +690,10 @@ mod tests { .as_ref() .expect("instance should have a vmm before migrating"); - let dst_sled_id = - select_first_alternate_sled(old_vmm, &other_sleds); + let dst_sled_id = test_helpers::select_first_alternate_sled( + old_vmm, + &other_sleds[..], + ); info!(log, "setting up new migration saga"; 
"old_instance" => ?old_instance, @@ -781,24 +717,44 @@ mod tests { let after_saga = || -> futures::future::BoxFuture<'_, ()> { Box::pin({ async { - // Unwinding at any step should clear the migration IDs from - // the instance record and leave the instance's location - // otherwise untouched. - let new_state = - test_helpers::instance_fetch(cptestctx, instance_id) - .await; - - let new_instance = new_state.instance(); - let new_vmm = - new_state.vmm().as_ref().expect("vmm should be active"); + let new_state = test_helpers::instance_fetch_all( + cptestctx, + instance_id, + ) + .await; + + let new_instance = new_state.instance; + let new_vmm = new_state + .active_vmm + .as_ref() + .expect("vmm should be active"); - assert!(new_instance.runtime().migration_id.is_none()); - assert!(new_instance.runtime().dst_propolis_id.is_none()); assert_eq!( new_instance.runtime().propolis_id.unwrap(), new_vmm.id ); + // If the instance has had migration IDs set, then both + // sides of the migration should be marked as failed. + if let Some(migration) = new_state.migration { + assert_eq!( + migration.source_state, + db::model::MigrationState::FAILED + ); + assert_eq!( + migration.target_state, + db::model::MigrationState::FAILED + ); + } + // If the instance has a target VMM ID left behind by the + // unwinding saga, that VMM must be in the `SagaUnwound` state. + if let Some(target_vmm) = new_state.target_vmm { + assert_eq!( + target_vmm.runtime.state, + db::model::VmmState::SagaUnwound + ); + } + info!( &log, "migration saga unwind: stopping instance after failed \ @@ -812,17 +768,19 @@ mod tests { test_helpers::instance_stop(cptestctx, &instance_id).await; test_helpers::instance_simulate(cptestctx, &instance_id) .await; - - let new_state = - test_helpers::instance_fetch(cptestctx, instance_id) - .await; + // Wait until the instance has advanced to the `NoVmm` + // state. 
This may not happen immediately, as an + // instance-update saga must complete to update the + // instance's state. + let new_state = test_helpers::instance_wait_for_state( + cptestctx, + instance_id, + nexus_db_model::InstanceState::NoVmm, + ) + .await; let new_instance = new_state.instance(); let new_vmm = new_state.vmm().as_ref(); - assert_eq!( - new_instance.runtime().nexus_state, - nexus_db_model::InstanceState::NoVmm, - ); assert!(new_instance.runtime().propolis_id.is_none()); assert!(new_vmm.is_none()); diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index adde040a77..9e4e010eea 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -235,21 +235,38 @@ async fn sis_move_to_starting( // For idempotency, refetch the instance to see if this step already applied // its desired update. - let (.., db_instance) = LookupPath::new(&opctx, &datastore) + let (_, _, authz_instance, ..) = LookupPath::new(&opctx, &datastore) .instance_id(instance_id.into_untyped_uuid()) .fetch_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; + let state = datastore + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)?; + + let db_instance = state.instance(); - match db_instance.runtime().propolis_id { + // If `true`, we have unlinked a Propolis ID left behind by a previous + // unwinding start saga, and we should activate the activate the abandoned + // VMM reaper background task once we've written back the instance record. + let mut abandoned_unwound_vmm = false; + match state.vmm() { // If this saga's Propolis ID is already written to the record, then // this step must have completed already and is being retried, so // proceed. 
- Some(db_id) if db_id == propolis_id.into_untyped_uuid() => { + Some(vmm) if vmm.id == propolis_id.into_untyped_uuid() => { info!(osagactx.log(), "start saga: Propolis ID already set"; "instance_id" => %instance_id); - Ok(db_instance) + return Ok(db_instance.clone()); + } + + // If the instance has a Propolis ID, but the Propolis was left behind + // by a previous start saga unwinding, that's fine, we can just clear it + // out and proceed as though there was no Propolis ID here. + Some(vmm) if vmm.runtime.state == db::model::VmmState::SagaUnwound => { + abandoned_unwound_vmm = true; } // If the instance has a different Propolis ID, a competing start saga @@ -266,33 +283,38 @@ async fn sis_move_to_starting( // this point causes the VMM's state, which is Starting, to supersede // the instance's state, so this won't cause the instance to appear to // be running before Propolis thinks it has started.) - None => { - let new_runtime = db::model::InstanceRuntimeState { - nexus_state: db::model::InstanceState::Vmm, - propolis_id: Some(propolis_id.into_untyped_uuid()), - time_updated: Utc::now(), - gen: db_instance.runtime().gen.next().into(), - ..db_instance.runtime_state - }; - - // Bail if another actor managed to update the instance's state in - // the meantime. - if !osagactx - .datastore() - .instance_update_runtime(&instance_id, &new_runtime) - .await - .map_err(ActionError::action_failed)? 
- { - return Err(ActionError::action_failed(Error::conflict( - "instance changed state before it could be started", - ))); - } + None => {} + } - let mut new_record = db_instance.clone(); - new_record.runtime_state = new_runtime; - Ok(new_record) - } + let new_runtime = db::model::InstanceRuntimeState { + nexus_state: db::model::InstanceState::Vmm, + propolis_id: Some(propolis_id.into_untyped_uuid()), + time_updated: Utc::now(), + gen: db_instance.runtime().gen.next().into(), + ..db_instance.runtime_state + }; + + // Bail if another actor managed to update the instance's state in + // the meantime. + if !osagactx + .datastore() + .instance_update_runtime(&instance_id, &new_runtime) + .await + .map_err(ActionError::action_failed)? + { + return Err(ActionError::action_failed(Error::conflict( + "instance changed state before it could be started", + ))); + } + + // Don't fear the reaper! + if abandoned_unwound_vmm { + osagactx.nexus().background_tasks.task_abandoned_vmm_reaper.activate(); } + + let mut new_record = db_instance.clone(); + new_record.runtime_state = new_runtime; + Ok(new_record) } async fn sis_move_to_starting_undo( @@ -363,9 +385,6 @@ async fn sis_account_virtual_resources_undo( ¶ms.serialized_authn, ); - let started_record = - sagactx.lookup::("started_record")?; - osagactx .datastore() .virtual_provisioning_collection_delete_instance( @@ -374,11 +393,6 @@ async fn sis_account_virtual_resources_undo( params.db_instance.project_id, i64::from(params.db_instance.ncpus.0 .0), nexus_db_model::ByteCount(*params.db_instance.memory), - // Use the next instance generation number as the generation limit - // to ensure the provisioning counters are released. (The "mark as - // starting" undo step will "publish" this new state generation when - // it moves the instance back to Stopped.) 
- (&started_record.runtime().gen.next()).into(), ) .await .map_err(ActionError::action_failed)?; @@ -810,28 +824,23 @@ mod test { }) }, || { - Box::pin({ - async { - let new_db_instance = test_helpers::instance_fetch( - cptestctx, - instance_id, - ) - .await.instance().clone(); - - info!(log, - "fetched instance runtime state after saga execution"; - "instance_id" => %instance.identity.id, - "instance_runtime" => ?new_db_instance.runtime()); - - assert!(new_db_instance.runtime().propolis_id.is_none()); - assert_eq!( - new_db_instance.runtime().nexus_state, - nexus_db_model::InstanceState::NoVmm - ); - - assert!(test_helpers::no_virtual_provisioning_resource_records_exist(cptestctx).await); - assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); - } + Box::pin(async { + let new_db_state = test_helpers::instance_wait_for_state( + cptestctx, + instance_id, + nexus_db_model::InstanceState::NoVmm, + ).await; + let new_db_instance = new_db_state.instance(); + + info!(log, + "fetched instance runtime state after saga execution"; + "instance_id" => %instance.identity.id, + "instance_runtime" => ?new_db_instance.runtime()); + + assert!(new_db_instance.runtime().propolis_id.is_none()); + + assert!(test_helpers::no_virtual_provisioning_resource_records_exist(cptestctx).await); + assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); }) }, log, diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs new file mode 100644 index 0000000000..243f952c8b --- /dev/null +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -0,0 +1,127 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use super::{ + declare_saga_actions, ActionRegistry, DagBuilder, NexusActionContext, + NexusSaga, SagaInitError, +}; +use crate::app::sagas::ActionError; +use nexus_db_queries::authn; +use omicron_common::api::external::Error; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; +use serde::{Deserialize, Serialize}; + +// destroy VMM subsaga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(super) struct Params { + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub(super) serialized_authn: authn::saga::Serialized, + + /// Instance UUID of the instance being updated. This is only just used + /// for logging, so we just use the instance ID here instead of serializing + /// a whole instance record. + pub(super) instance_id: InstanceUuid, + + /// UUID of the VMM to destroy. + pub(super) vmm_id: PropolisUuid, +} + +// destroy VMM subsaga: actions + +declare_saga_actions! { + destroy_vmm; + + // Deallocate physical sled resources reserved for the destroyed VMM, as it + // is no longer using them. + RELEASE_SLED_RESOURCES -> "release_sled_resources" { + + siu_destroyed_release_sled_resources + } + + // Mark the VMM record as deleted. + MARK_VMM_DELETED -> "mark_vmm_deleted" { + + siu_destroyed_mark_vmm_deleted + } +} + +// destroy VMM subsaga: definition + +#[derive(Debug)] +pub(super) struct SagaDestroyVmm; +impl NexusSaga for SagaDestroyVmm { + const NAME: &'static str = "destroy-vmm"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + destroy_vmm_register_actions(registry) + } + + fn make_saga_dag( + _: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + builder.append(release_sled_resources_action()); + builder.append(mark_vmm_deleted_action()); + Ok(builder.build()?) 
+ } +} + +async fn siu_destroyed_release_sled_resources( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, instance_id, vmm_id, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + info!( + osagactx.log(), + "instance update (VMM destroyed): deallocating sled resource reservation"; + "instance_id" => %instance_id, + "propolis_id" => %vmm_id, + ); + + osagactx + .datastore() + .sled_reservation_delete(&opctx, vmm_id.into_untyped_uuid()) + .await + .or_else(|err| { + // Necessary for idempotency + match err { + Error::ObjectNotFound { .. } => Ok(()), + _ => Err(err), + } + }) + .map_err(ActionError::action_failed) +} + +async fn siu_destroyed_mark_vmm_deleted( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, instance_id, vmm_id, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + info!( + osagactx.log(), + "instance update (VMM destroyed): marking VMM record deleted"; + "instance_id" => %instance_id, + "propolis_id" => %vmm_id, + ); + + osagactx + .datastore() + .vmm_mark_deleted(&opctx, &vmm_id) + .await + .map(|_| ()) + .map_err(ActionError::action_failed) +} diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs new file mode 100644 index 0000000000..71abe63bbd --- /dev/null +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -0,0 +1,2778 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Instance Update Saga +//! +//! ## Background +//! +//! The state of a VM instance, as understood by Nexus, consists of a +//! 
combination of database tables: +//! +//! - The `instance` table, owned exclusively by Nexus itself, represents the +//! user-facing "logical" VM instance. +//! - The `vmm` table, which represents a "physical" Propolis VMM process on +//! which a running instance is incarnated. +//! - The `migration` table, which represents the state of an in-progress live +//! migration of an instance between two VMMs. +//! +//! When an instance is incarnated on a sled, the `propolis_id` field in an +//! `instance` record contains a UUID foreign key into the `vmm` table that +//! points to the `vmm` record for the Propolis process on which the instance is +//! currently running. If an instance is undergoing live migration, its record +//! additionally contains a `dst_propolis_id` foreign key pointing at the `vmm` +//! row representing the *target* Propolis process that it is migrating to, and +//! a `migration_id` foreign key into the `migration` table record tracking the +//! state of that migration. +//! +//! Sled-agents report the state of the VMMs they manage to Nexus. This occurs +//! when a VMM state transition occurs and the sled-agent *pushes* an update to +//! Nexus' `cpapi_instances_put` internal API endpoint, when a Nexus' +//! `instance-watcher` background task *pulls* instance states from sled-agents +//! periodically, or as the return value of an API call from Nexus to a +//! sled-agent. When a Nexus receives a new [`SledInstanceState`] from a +//! sled-agent through any of these mechanisms, the Nexus will write any changed +//! state to the `vmm` and/or `migration` tables directly on behalf of the +//! sled-agent. +//! +//! Although Nexus is technically the party responsible for the database query +//! that writes VMM and migration state updates received from sled-agent, it is +//! the sled-agent that *logically* "owns" these records. A row in the `vmm` +//! table represents a particular Propolis process running on a particular sled, +//! 
and that sled's sled-agent is the sole source of truth for that process. The +//! generation number for a `vmm` record is the property of the sled-agent +//! responsible for that VMM. Similarly, a `migration` record has separate +//! generation numbers for the source and target VMM states, which are owned by +//! the sled-agents responsible for the source and target Propolis processes, +//! respectively. If a sled-agent pushes a state update to a particular Nexus +//! instance and that Nexus fails to write the state to the database, that isn't +//! the end of the world: the sled-agent can simply retry with a different +//! Nexus, and the generation number, which is incremented exclusively by the +//! sled-agent, ensures that state changes are idempotent and ordered. If a +//! faulty Nexus were to return 200 OK to a sled-agent's call to +//! `cpapi_instances_put` but choose to simply eat the received instance state +//! update rather than writing it to the database, even *that* wouldn't +//! necessarily mean that the state change was gone forever: the +//! `instance-watcher` background task on another Nexus instance would +//! eventually poll the sled-agent's state and observe any changes that were +//! accidentally missed. This is all very neat and tidy, and we should feel +//! proud of ourselves for having designed such a nice little system. +//! +//! Unfortunately, when we look beyond the `vmm` and `migration` tables, things +//! rapidly become interesting (in the "may you live in interesting times" +//! sense). The `instance` record *cannot* be owned exclusively by anyone. The +//! logical instance state it represents is a gestalt that may consist of state +//! that exists in multiple VMM processes on multiple sleds, as well as +//! control-plane operations triggered by operator inputs and performed by +//! multiple Nexus instances. This is, as they say, "hairy". The neat and tidy +//! little state updates published by sled-agents to Nexus in the previous +//! 
paragraph may, in some cases, represent a state transition that also +//! requires changes to the `instance` table: for instance, a live migration may +//! have completed, necessitating a change in the instance's `propolis_id` to +//! point to the new VMM. +//! +//! Oh, and one other thing: the `instance` table record in turn logically +//! "owns" other resources, such as the virtual-provisioning counters that +//! represent rack-level resources allocated to the instance, and the instance's +//! network configuration. When the instance's state changes, these resources +//! owned by the instance may also need to be updated, such as changing the +//! network configuration to point at an instance's new home after a successful +//! migration, or deallocating virtual provisioning counters when an instance is +//! destroyed. Naturally, these updates must also be performed reliably and +//! inconsistent states must be avoided. +//! +//! Thus, we arrive here, at the instance-update saga. +//! +//! ## Theory of Operation +//! +//! In order to ensure that changes to the state of an instance are handled +//! reliably, we require that all multi-stage operations on an instance --- +//! i.e., operations which cannot be done atomically in a single database query +//! --- are performed by a saga. The following sagas currently +//! touch the `instance` record: +//! +//! - [`instance_start`] +//! - [`instance_migrate`] +//! - [`instance_delete`] +//! - `instance_update` (this saga) +//! +//! For most of these sagas, the instance state machine itself guards against +//! potential race conditions. By considering the valid and invalid flows +//! through an instance's state machine, we arrive at some ground rules: +//! +//! - The `instance_migrate` and `instance_delete` sagas will only modify the +//! instance record if the instance *has* an active Propolis ID. +//! - The `instance_start` and `instance_delete` sagas will only modify the +//!
instance record if the instance does *not* have an active VMM. +//! - The presence of a migration ID prevents an `instance_migrate` saga from +//! succeeding until the current migration is resolved (either completes or +//! fails). +//! - Only the `instance_start` saga can set the instance's *active* Propolis +//! ID, and it can only do this if there is currently no active Propolis. +//! - Only the `instance_migrate` saga can set the instance's *target* Propolis +//! ID and migration ID, and it can only do that if these fields are unset, or +//! were left behind by a failed `instance_migrate` saga unwinding. +//! - Only the `instance_update` saga can unset a migration ID and target +//! Propolis ID, which it will do when handling an update from sled-agent that +//! indicates that a migration has succeeded or failed. +//! - Only the `instance_update` saga can unset an instance's active Propolis +//! ID, which it will do when handling an update from sled-agent that +//! indicates that the VMM has been destroyed (peacefully or violently). +//! +//! For the most part, this state machine prevents race conditions where +//! multiple sagas mutate the same fields in the instance record, because the +//! states from which a particular transition may start are limited. However, this +//! is not the case for the `instance-update` saga, which may need to run any +//! time a sled-agent publishes a new instance state. Therefore, this saga +//! ensures mutual exclusion using one of the only distributed locking schemes +//! in Omicron: the "instance updater lock". +//! +//! ### The Instance-Updater Lock, or, "Distributed RAII" +//! +//! Distributed locks [are scary][dist-locking]. One of the *scariest* things +//! about distributed locks is that a process can die[^1] while holding a lock, +//! which results in the protected resource (in this case, the `instance` +//! record) being locked forever.[^2] It would be good for that to not happen. +//!
Fortunately, *if* (and only if) we promise to *only* ever acquire the +//! instance-updater lock inside of a saga, we can guarantee forward progress: +//! should a saga fail while holding the lock, it will unwind into a reverse +//! action that releases the lock. This is essentially the distributed +//! equivalent to holding a RAII guard in a Rust program: if the thread holding +//! the lock panics, it unwinds its stack, drops the [`std::sync::MutexGuard`], +//! and the rest of the system is not left in a deadlocked state. As long as we +//! ensure that the instance-updater lock is only ever acquired by sagas, and +//! that any saga holding a lock will reliably release it when it unwinds, we're +//! ... *probably* ... okay. +//! +//! When an `instance-update` saga is started, it attempts to [acquire the +//! updater lock][instance_updater_lock]. If the lock is already held by another +//! update saga, then the update saga completes immediately. Otherwise, the saga +//! then queries CRDB for the current state of the `instance` record, the active +//! and migration-target `vmm` records (if any exist), and the current +//! `migration` record (if one exists). This snapshot represents the state from +//! which the update will be applied, and must be read only after locking the +//! instance to ensure that it cannot race with another saga. +//! +//! This is where another of this saga's weird quirks shows up: the shape of the +//! saga DAG we wish to execute depends on this instance, active VMM, target +//! VMM, and migration. But, because the precondition for the saga state may +//! only be read once the lock is acquired, and --- as we discussed above --- +//! the instance-updater lock may only ever be acquired within a saga, we arrive +//! at a bit of a weird impasse: we can't determine what saga DAG to build +//! without looking at the initial state, but we can't load the state until +//! we've already started a saga. To solve this, we've split this saga into two +//! 
pieces: the first, `start-instance-update`, is a very small saga that just +//! tries to lock the instance, and upon doing so, loads the instance state from +//! the database and prepares and executes the "real" instance update saga. Once +//! the "real" saga starts, it "inherits" the lock from the start saga by +//! performing [the SQL equivalent of a compare-and-swap +//! operation][instance_updater_inherit_lock] with its own UUID. +//! +//! The DAG for the "real" update saga depends on the state read within the +//! lock, and since the lock was never released, that state remains valid for +//! its execution. As the final action of the update saga, the instance record's +//! new runtime state is written back to the database and the lock is released, +//! in a [single atomic operation][instance_updater_unlock]. Should the update +//! saga fail, it will release the inherited lock. And, if the unwinding update +//! saga unwinds into the start saga, that's fine, because a double-unlock is +//! prevented by the saga ID having changed in the "inherit lock" operation. +//! +//! ### Interaction With Other Sagas +//! +//! The instance-updater lock only provides mutual exclusion with regard to +//! *other `instance-update` sagas*. It does *not* prevent modifications to the +//! instance record by other sagas, such as `instance-start`, +//! `instance-migrate`, and `instance-delete`. Instead, mutual exclusion between +//! the `instance-update` saga and `instance-start` and `instance-delete` sagas +//! is ensured by the actual state of the instance record, as discussed above: +//! start and delete sagas can be started only when the instance has no active +//! VMM, and the `instance-update` saga will only run when an instance *does* +//! have an active VMM that has transitioned to a state where it must be +//! unlinked from the instance. The update saga unlinks the VMM from the +//! instance record as its last action, which allows the instance to be a valid +//!
target for a start or delete saga. +//! +//! On the other hand, an `instance-migrate` saga can, potentially, mutate the +//! instance record while an update saga is running, if it attempts to start a +//! migration while an update is still being processed. If the migrate saga +//! starts during an update and completes before the update saga, the update +//! saga writing back an updated instance state to the instance record could +//! result in an [ABA problem]-like issue, where the changes made by the migrate +//! saga are clobbered by the update saga. These issues are instead guarded +//! against by the instance record's state generation number: the update saga +//! determines the generation for the updated instance record by incrementing +//! the generation number observed when the initial state for the update is +//! read. The query that writes back the instance's runtime state fails if the +//! generation number has changed since the state was read at the beginning of +//! the saga, which causes the saga to unwind. An unwinding saga activates the +//! `instance-updater` background task, which may in turn start a new saga if +//! the instance's current state still requires an update. +//! +//! To avoid unnecessarily changing an instance's state generation and +//! invalidating in-progress update sagas, unwinding `instance-start` and +//! `instance-migrate` sagas don't remove the VMMs and migrations they create +//! from the instance's `propolis_id`, `target_propolis_id`, and `migration_id` +//! fields. Instead, they transition the `vmm` records to +//! [`VmmState::SagaUnwound`], which is treated as equivalent to having no VMM +//! in that position by other instances of those sagas. +//! +//! ### Avoiding Missed Updates, or, "The `InstanceRuntimeState` Will Always Get Through" +//! +//! The lock operation we've described above is really more of a "try-lock" +//! operation: if the lock is already held, the saga trying to acquire it just +//! 
ends immediately, rather than waiting for the lock to be released. This begs +//! the question, "what happens if an instance update comes in while the lock is +//! held?" Do we just...leave it on the floor? Wasn't the whole point of this +//! Rube Goldberg mechanism of sagas to *prevent* instance state changes from +//! being missed? +//! +//! We solve this using an ~~even more layers of complexity~~defense-in-depth +//! approach. Together, a number of mechanisms exist to ensure that (a) an +//! instance whose VMM and migration states require an update saga will always +//! have an update saga run eventually, and (b) update sagas are run in as +//! timely a manner as possible. +//! +//! The first of these ~~layers of nonsense~~redundant systems to prevent missed +//! updates is perhaps the simplest one: _avoiding unnecessary update sagas_. +//! The `cpapi_instances_put` API endpoint and instance-watcher background tasks +//! handle changes to VMM and migration states by calling the +//! [`notify_instance_updated`] method, which writes the new states to the +//! database and (potentially) starts an update saga. Naively, this method would +//! *always* start an update saga, but remember that --- as we discussed +//! [above](#background) --- many VMM/migration state changes don't actually +//! require modifying the instance record. For example, if an instance's VMM +//! transitions from [`VmmState::Starting`] to [`VmmState::Running`], that +//! changes the instance's externally-visible effective state, but it does *not* +//! require an instance record update. By not starting an update saga unless one +//! is actually required, we reduce updater lock contention, so that the lock is +//! less likely to be held when VMM and migration states that actually *do* +//! require an update saga are published. The [`update_saga_needed`] function in +//! this module contains the logic for determining whether an update saga is +//! required. +//! +//! 
The second mechanism for ensuring updates are performed in a timely manner +//! is what I'm calling _saga chaining_. When the final action in an +//! instance-update saga writes back the instance record and releases the +//! updater lock, it will then perform a second query to read the instance, VMM, +//! and migration records. If the current state of the instance indicates that +//! another update saga is needed, then the completing saga will execute a new +//! start saga as its final action. +//! +//! The last line of defense is the `instance-updater` background task. This +//! task periodically queries the database to list instances which require +//! update sagas (either their active VMM is `Destroyed` or their active +//! migration has terminated) and are not currently locked by another update +//! saga. A new update saga is started for any such instances found. Because +//! this task runs periodically, it ensures that eventually, an update saga will +//! be started for any instance that requires one.[^3] +//! +//! The background task ensures that sagas are started eventually, but because +//! it only runs occasionally, update sagas started by it may be somewhat +//! delayed. To improve the timeliness of update sagas, we will also explicitly +//! activate the background task at any point where we know that an update saga +//! *should* run but we were not able to run it. If an update saga cannot be +//! started, whether by [`notify_instance_updated`], a `start-instance-update` +//! saga attempting to start its real saga, or an `instance-update` saga +//! chaining into a new one as its last action, the `instance-updater` +//! background task is activated. Similarly, when a `start-instance-update` saga +//! fails to acquire the lock and exits, it activates the background task as +//! well. This ensures that we will attempt the update again. +//! +//! ### On Unwinding +//! +//! Typically, when a Nexus saga unwinds, each node's reverse action undoes any +//! 
changes made by the forward action. The `instance-update` saga, however, is +//! a bit different: most of its nodes don't have reverse actions that undo the +//! action they performed. This is because, unlike `instance-start`, +//! `instance-migrate`, or `instance-delete`, the instance-update saga is +//! **not** attempting to perform a state change for the instance that was +//! requested by a user. Instead, it is attempting to update the +//! database and networking configuration *to match a state change that has +//! already occurred.* +//! +//! Consider the following: if we run an `instance-start` saga, and the instance +//! cannot actually be started, of course we would want the unwinding saga to +//! undo any database changes it has made, because the instance was not actually +//! started. Failing to undo those changes when an `instance-start` saga unwinds +//! would mean the database is left in a state that does not reflect reality, as +//! the instance was not actually started. On the other hand, suppose an +//! instance's active VMM shuts down and we start an `instance-update` saga to +//! move it to the `Destroyed` state. Even if some action along the way fails, the +//! instance is still `Destroyed`; that state transition has *already happened* +//! on the sled, and unwinding the update saga cannot and should not un-destroy +//! the VMM. +//! +//! So, unlike other sagas, we want to leave basically anything we've +//! successfully done in place when unwinding, because even if the update is +//! incomplete, we have still brought Nexus' understanding of the instance +//! *closer* to reality. If there was something we weren't able to do, one of +//! the instance-update-related RPWs[^rpws] will start a new update saga to try +//! it again. Because saga actions are idempotent, attempting to do something +//! that was already successfully performed a second time isn't a problem, and +//! we don't need to undo it. +//! +//! 
The one exception to this is, as [discussed +//! above](#the-instance-updater-lock-or-distributed-raii), unwinding instance +//! update sagas MUST always release the instance-updater lock, so that a +//! subsequent saga can update the instance. Thus, the saga actions which lock +//! the instance have reverse actions that release the updater lock. +//! +//! [`instance_start`]: super::instance_start +//! [`instance_migrate`]: super::instance_migrate +//! [`instance_delete`]: super::instance_delete +//! [instance_updater_lock]: +//! crate::app::db::datastore::DataStore::instance_updater_lock +//! [instance_updater_inherit_lock]: +//! crate::app::db::datastore::DataStore::instance_updater_inherit_lock +//! [instance_updater_unlock]: +//! crate::app::db::datastore::DataStore::instance_updater_unlock +//! [`notify_instance_updated`]: crate::app::Nexus::notify_instance_updated +//! +//! [dist-locking]: +//! https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html +//! [ABA problem]: https://en.wikipedia.org/wiki/ABA_problem +//! +//! [^1]: And, if a process *can* die, well...we can assume it *will*. +//! [^2]: Barring human intervention. +//! [^3]: Even if the Nexus instance that processed the state update died +//! between when it wrote the state to CRDB and when it started the +//! requisite update saga! +//! [^rpws]: Either the `instance-updater` or `abandoned-vmm-reaper` background +//! tasks, as appropriate. 
+ +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::db::datastore::instance; +use crate::app::db::datastore::InstanceGestalt; +use crate::app::db::datastore::VmmStateUpdateResult; +use crate::app::db::lookup::LookupPath; +use crate::app::db::model::ByteCount; +use crate::app::db::model::Generation; +use crate::app::db::model::InstanceRuntimeState; +use crate::app::db::model::InstanceState; +use crate::app::db::model::MigrationState; +use crate::app::db::model::Vmm; +use crate::app::db::model::VmmState; +use crate::app::sagas::declare_saga_actions; +use anyhow::Context; +use chrono::Utc; +use nexus_db_queries::{authn, authz}; +use nexus_types::identity::Resource; +use omicron_common::api::external::Error; +use omicron_common::api::internal::nexus; +use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; +use serde::{Deserialize, Serialize}; +use steno::{ActionError, DagBuilder, Node}; +use uuid::Uuid; + +// The public interface to this saga is actually a smaller saga that starts the +// "real" update saga, which inherits the lock from the start saga. This is +// because the decision of which subsaga(s) to run depends on the state of the +// instance record read from the database *once the lock has been acquired*, +// and the saga DAG for the "real" instance update saga may be constructed only +// after the instance state has been fetched. However, since the instance +// state must be read inside the lock, that *also* needs to happen in a saga, +// so that the lock is always dropped when unwinding. Thus, we have a second, +// smaller saga which starts our real saga, and then the real saga, which +// decides what DAG to build based on the instance fetched by the start saga. +// +// Don't worry, this won't be on the test. 
+mod start; +pub(crate) use self::start::{Params, SagaInstanceUpdate}; + +mod destroyed; + +/// Returns `true` if an `instance-update` saga should be executed as a result +/// of writing the provided [`SledInstanceState`] to the database with the +/// provided [`VmmStateUpdateResult`]. +/// +/// We determine this only after actually updating the database records, +/// because we don't know whether a particular VMM or migration state is +/// *new* or not until we know whether the corresponding database record has +/// actually changed (determined by the generation number). For example, when +/// an instance has migrated into a Propolis process, Propolis will continue +/// to report the inbound migration in the `Completed` state as part of all state +/// updates regarding that instance, but we no longer need to act on it if +/// the migration record has already been updated to reflect that the +/// migration has completed. +/// +/// Once we know what rows have been updated, we can inspect the states +/// written to the DB to determine whether an instance-update saga is +/// required to bring the instance record's state in line with the new +/// VMM/migration states. +pub fn update_saga_needed( + log: &slog::Logger, + instance_id: InstanceUuid, + state: &SledInstanceState, + result: &VmmStateUpdateResult, +) -> bool { + // Currently, an instance-update saga is required if (and only if): + // + // - The instance's active VMM has transitioned to `Destroyed`. We don't + // actually know whether the VMM whose state was updated here was the + // active VMM or not, so we will always attempt to run an instance-update + // saga if the VMM was `Destroyed`. + let vmm_needs_update = result.vmm_updated + && state.vmm_state.state == nexus::VmmState::Destroyed; + // - A migration in to this VMM has transitioned to a terminal state + // (`Failed` or `Completed`). 
+ let migrations = state.migrations(); + let migration_in_needs_update = result.migration_in_updated + && migrations + .migration_in + .map(|migration| migration.state.is_terminal()) + .unwrap_or(false); + // - A migration out from this VMM has transitioned to a terminal state + // (`Failed` or `Completed`). + let migration_out_needs_update = result.migration_out_updated + && migrations + .migration_out + .map(|migration| migration.state.is_terminal()) + .unwrap_or(false); + // If any of the above conditions are true, prepare an instance-update saga + // for this instance. + let needed = vmm_needs_update + || migration_in_needs_update + || migration_out_needs_update; + if needed { + debug!(log, + "new VMM runtime state from sled agent requires an \ + instance-update saga"; + "instance_id" => %instance_id, + "propolis_id" => %state.propolis_id, + "vmm_needs_update" => vmm_needs_update, + "migration_in_needs_update" => migration_in_needs_update, + "migration_out_needs_update" => migration_out_needs_update, + ); + } + needed +} + +/// The set of updates to the instance and its owned resources to perform in +/// response to a VMM/migration state update. +/// +/// Depending on the current state of the instance and its VMM(s) and migration, +/// an update saga may perform a variety of operations. Which operations need to +/// be performed for the current state snapshot of the instance, VMM, and +/// migration records is determined by the [`UpdatesRequired::for_instance`] +/// function. +#[derive(Debug, Deserialize, Serialize)] +struct UpdatesRequired { + /// The new runtime state that must be written back to the database when the + /// saga completes. + new_runtime: InstanceRuntimeState, + + /// If this is [`Some`], the instance's active VMM with this UUID has + /// transitioned to [`VmmState::Destroyed`], and its resources must be + /// cleaned up by a [`destroyed`] subsaga. 
+ destroy_active_vmm: Option, + + /// If this is [`Some`], the instance's migration target VMM with this UUID + /// has transitioned to [`VmmState::Destroyed`], and its resources must be + /// cleaned up by a [`destroyed`] subsaga. + destroy_target_vmm: Option, + + /// If this is [`Some`], the instance no longer has an active VMM, and its + /// virtual provisioning resource records and Oximeter producer should be + /// deallocated. + deprovision: Option, + + /// If this is [`Some`], then a network configuration update must be + /// performed: either updating NAT configuration and V2P mappings when the + /// instance has moved to a new sled, or deleting them if it is no longer + /// incarnated. + network_config: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +enum NetworkConfigUpdate { + Delete, + Update { active_propolis_id: PropolisUuid, new_sled_id: Uuid }, +} + +/// Virtual provisioning counters to release when an instance no longer has a +/// VMM. +#[derive(Debug, Deserialize, Serialize)] +struct Deprovision { + project_id: Uuid, + cpus_diff: i64, + ram_diff: ByteCount, +} + +impl UpdatesRequired { + fn for_instance( + log: &slog::Logger, + snapshot: &InstanceGestalt, + ) -> Option { + let mut new_runtime = snapshot.instance.runtime().clone(); + new_runtime.gen = Generation(new_runtime.gen.next()); + new_runtime.time_updated = Utc::now(); + let instance_id = snapshot.instance.id(); + + let mut update_required = false; + let mut network_config = None; + + // Has the active VMM been destroyed? + let destroy_active_vmm = + snapshot.active_vmm.as_ref().and_then(|active_vmm| { + if active_vmm.runtime.state == VmmState::Destroyed { + let id = PropolisUuid::from_untyped_uuid(active_vmm.id); + // Unlink the active VMM ID. If the active VMM was destroyed + // because a migration out completed, the next block, which + // handles migration updates, will set this to the new VMM's ID, + // instead. 
+ new_runtime.propolis_id = None; + update_required = true; + Some(id) + } else { + None + } + }); + + // Okay, what about the target? + let destroy_target_vmm = + snapshot.target_vmm.as_ref().and_then(|target_vmm| { + if target_vmm.runtime.state == VmmState::Destroyed { + // Unlink the target VMM ID. + new_runtime.dst_propolis_id = None; + update_required = true; + Some(PropolisUuid::from_untyped_uuid(target_vmm.id)) + } else { + None + } + }); + + // If there's an active migration, determine how to update the instance + // record to reflect the current migration state. + if let Some(ref migration) = snapshot.migration { + if migration.either_side_failed() { + // If the migration has failed, clear the instance record's + // migration IDs so that a new migration can begin. + info!( + log, + "instance update (migration failed): clearing migration IDs"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + new_runtime.migration_id = None; + new_runtime.dst_propolis_id = None; + update_required = true; + // If the active VMM was destroyed, the network config must be + // deleted (which was determined above). Otherwise, if the + // migration failed but the active VMM was still there, we must + // still ensure the correct networking configuration + // exists for its current home. + // + // TODO(#3107) This is necessary even if the instance didn't move, + // because registering a migration target on a sled creates OPTE ports + // for its VNICs, and that creates new V2P mappings on that sled that + // place the relevant virtual IPs on the local sled. Once OPTE stops + // creating these mappings, this path only needs to be taken if an + // instance has changed sleds. 
+ if destroy_active_vmm.is_none() { + if let Some(ref active_vmm) = snapshot.active_vmm { + info!( + log, + "instance update (migration failed): pointing network \ + config back at current VMM"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + network_config = + Some(NetworkConfigUpdate::to_vmm(active_vmm)); + } else { + // Otherwise, the active VMM has already been destroyed, + // and the target is reporting a failure because of + // that. Just delete the network config. + } + } + } else if migration.either_side_completed() { + // If either side reports that the migration has completed, set + // the instance record's active Propolis ID to point at the new + // VMM, and update the network configuration to point at that VMM. + if new_runtime.propolis_id != Some(migration.target_propolis_id) + { + info!( + log, + "instance update (migration completed): setting active \ + VMM ID to target and updating network config"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + let new_vmm = snapshot.target_vmm.as_ref().expect( + "if we have gotten here, there must be a target VMM", + ); + debug_assert_eq!(new_vmm.id, migration.target_propolis_id); + new_runtime.propolis_id = + Some(migration.target_propolis_id); + network_config = Some(NetworkConfigUpdate::to_vmm(new_vmm)); + update_required = true; + } + + // Welp, the migration has succeeded, but the target Propolis + // has also gone away. This is functionally equivalent to having + // the active VMM go to `Destroyed`, so now we have no active + // VMM anymore. 
+ if destroy_target_vmm.is_some() { + info!( + log, + "instance update (migration completed): target VMM \ + has gone away, destroying it!"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + new_runtime.propolis_id = None; + update_required = true; + } + + // If the target reports that the migration has completed, + // unlink the migration (allowing a new one to begin). This has + // to wait until the target has reported completion to ensure a + // migration out of the target can't start until the migration + // in has definitely finished. + if migration.target_state == MigrationState::COMPLETED { + info!( + log, + "instance update (migration target completed): \ + clearing migration IDs"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + new_runtime.migration_id = None; + new_runtime.dst_propolis_id = None; + update_required = true; + } + } + } + + // If the *new* state no longer has a `propolis_id` field, that means + // that the active VMM was destroyed without a successful migration out + // (or, we migrated out to a target VMM that was immediately destroyed, + // which could happen if a running VM shut down immediately after + // migrating). In that case, the instance is no longer incarnated on a + // sled, and we must update the state of the world to reflect that. + let deprovision = if new_runtime.propolis_id.is_none() { + // N.B. that this does *not* set `update_required`, because + // `new_runtime.propolis_id` might be `None` just because there was, + // already, no VMM there. `update_required` gets set above if there + // was any actual state change. + + // We no longer have a VMM. 
+ new_runtime.nexus_state = InstanceState::NoVmm; + // If the active VMM was destroyed and the instance has not migrated + // out of it, we must delete the instance's network configuration. + // + // This clobbers a previously-set network config update to a new + // VMM, because if we set one above, we must have subsequently + // discovered that there actually *is* no new VMM anymore! + network_config = Some(NetworkConfigUpdate::Delete); + // The instance's virtual provisioning records must be deallocated, + // as it is no longer consuming any virtual resources. Providing a + // set of virtual provisioning counters to deallocate also indicates + // that the instance's oximeter producer should be cleaned up. + Some(Deprovision { + project_id: snapshot.instance.project_id, + cpus_diff: i64::from(snapshot.instance.ncpus.0 .0), + ram_diff: snapshot.instance.memory, + }) + } else { + None + }; + + if !update_required { + return None; + } + + Some(Self { + new_runtime, + destroy_active_vmm, + destroy_target_vmm, + deprovision, + network_config, + }) + } +} + +impl NetworkConfigUpdate { + fn to_vmm(vmm: &Vmm) -> Self { + Self::Update { + active_propolis_id: PropolisUuid::from_untyped_uuid(vmm.id), + new_sled_id: vmm.sled_id, + } + } +} + +/// Parameters to the "real" instance update saga. +#[derive(Debug, Deserialize, Serialize)] +struct RealParams { + serialized_authn: authn::saga::Serialized, + + authz_instance: authz::Instance, + + update: UpdatesRequired, + + orig_lock: instance::UpdaterLock, +} + +const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; +const INSTANCE_LOCK: &str = "updater_lock"; +const NETWORK_CONFIG_UPDATE: &str = "network_config_update"; + +// instance update saga: actions + +declare_saga_actions! { + instance_update; + + // Become the instance updater. + // + // This action inherits the instance-updater lock from the + // `start-instance-update` saga, which attempts to compare-and-swap in a new + // saga UUID. 
This ensures that only one child update saga is + // actually allowed to proceed, even if the `start-instance-update` saga's + // "fetch_instance_and_start_real_saga" executes multiple times, avoiding + // duplicate work. + // + // Unwinding this action releases the updater lock. In addition, it + // activates the `instance-updater` background task to ensure that a new + // update saga is started in a timely manner, to perform the work that the + // unwinding saga was *supposed* to do. Since this action only succeeds if + // the lock was acquired, and this saga is only started if updates are + // required, having this action activate the background task when unwinding + // avoids unneeded activations when a saga fails just because it couldn't + // get the lock. + BECOME_UPDATER -> "updater_lock" { + + siu_become_updater + - siu_unbecome_updater + } + + // Update network configuration. + UPDATE_NETWORK_CONFIG -> "update_network_config" { + + siu_update_network_config + } + + // Deallocate virtual provisioning resources reserved by the instance, as it + // is no longer running. + RELEASE_VIRTUAL_PROVISIONING -> "release_virtual_provisioning" { + + siu_release_virtual_provisioning + } + + // Unassign the instance's Oximeter producer. + UNASSIGN_OXIMETER_PRODUCER -> "unassign_oximeter_producer" { + + siu_unassign_oximeter_producer + } + + // Write back the new instance record, releasing the instance updater lock, + // and re-fetch the VMM and migration states. If they have changed in a way + // that requires an additional update saga, attempt to execute an additional + // update saga immediately. + // + // Writing back the updated instance runtime state is conditional on both + // the instance updater lock *and* the instance record's state generation + // number. If the state generation has advanced since this update saga + // began, writing the new runtime state will fail, as the update was + // performed based on an initial state that is no longer current. 
In that + // case, this action will fail, causing the saga to unwind, release the + // updater lock, and activate the `instance-updater` background task to + // schedule a new update saga if one is still required. + COMMIT_INSTANCE_UPDATES -> "commit_instance_updates" { + + siu_commit_instance_updates + } + +} + +// instance update saga: definition +struct SagaDoActualInstanceUpdate; + +impl NexusSaga for SagaDoActualInstanceUpdate { + const NAME: &'static str = "instance-update"; + type Params = RealParams; + + fn register_actions(registry: &mut ActionRegistry) { + instance_update_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + // Helper function for constructing a constant node. + fn const_node( + name: impl AsRef, + value: &impl serde::Serialize, + ) -> Result { + let value = serde_json::to_value(value).map_err(|e| { + SagaInitError::SerializeError(name.as_ref().to_string(), e) + })?; + Ok(Node::constant(name, value)) + } + + // Generate a new ID and attempt to inherit the lock from the start saga. + builder.append(Node::action( + INSTANCE_LOCK_ID, + "GenerateInstanceLockId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(become_updater_action()); + + // If a network config update is required, do that. + if let Some(ref update) = params.update.network_config { + builder.append(const_node(NETWORK_CONFIG_UPDATE, update)?); + builder.append(update_network_config_action()); + } + + // If the instance now has no active VMM, release its virtual + // provisioning resources and unassign its Oximeter producer. + if params.update.deprovision.is_some() { + builder.append(release_virtual_provisioning_action()); + builder.append(unassign_oximeter_producer_action()); + } + + // Once we've finished mutating everything owned by the instance, we can + // write back the updated state and release the instance lock. 
+ builder.append(commit_instance_updates_action()); + + // If either VMM linked to this instance has been destroyed, append + // subsagas to clean up the VMMs' resources and mark them as deleted. + // + // Note that we must not mark the VMMs as deleted until *after* we have + // written back the updated instance record. Otherwise, if we mark a VMM + // as deleted while the instance record still references its ID, we will + // have created a state where the instance record contains a "dangling + // pointer" (database version) where the foreign key points to a record + // that no longer exists. Other consumers of the instance record may be + // unpleasantly surprised by this, so we avoid marking these rows as + // deleted until they've been unlinked from the instance by the + // `commit_instance_updates` action. + let mut append_destroyed_vmm_subsaga = + |vmm_id: PropolisUuid, which_vmm: &'static str| { + let params = destroyed::Params { + vmm_id, + instance_id: InstanceUuid::from_untyped_uuid( + params.authz_instance.id(), + ), + serialized_authn: params.serialized_authn.clone(), + }; + let name = format!("destroy_{which_vmm}_vmm"); + + let subsaga = destroyed::SagaDestroyVmm::make_saga_dag( + &params, + DagBuilder::new(steno::SagaName::new(&name)), + )?; + + let params_name = format!("{name}_params"); + builder.append(const_node(&params_name, &params)?); + + let output_name = format!("{which_vmm}_vmm_destroyed"); + builder.append(Node::subsaga( + output_name.as_str(), + subsaga, + &params_name, + )); + + Ok::<(), SagaInitError>(()) + }; + + if let Some(vmm_id) = params.update.destroy_active_vmm { + append_destroyed_vmm_subsaga(vmm_id, "active")?; + } + + if let Some(vmm_id) = params.update.destroy_target_vmm { + append_destroyed_vmm_subsaga(vmm_id, "target")?; + } + + Ok(builder.build()?) + } +} + +async fn siu_become_updater( + sagactx: NexusActionContext, +) -> Result { + let RealParams { + ref serialized_authn, ref authz_instance, orig_lock, .. 
+ } = sagactx.saga_params::()?; + let saga_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let osagactx = sagactx.user_data(); + let log = osagactx.log(); + + debug!( + log, + "instance update: trying to become instance updater..."; + "instance_id" => %authz_instance.id(), + "saga_id" => %saga_id, + "parent_lock" => ?orig_lock, + ); + + let lock = osagactx + .datastore() + .instance_updater_inherit_lock( + &opctx, + &authz_instance, + orig_lock, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + info!( + log, + "instance_update: Now, I am become Updater, the destroyer of VMMs."; + "instance_id" => %authz_instance.id(), + "saga_id" => %saga_id, + ); + + Ok(lock) +} + +async fn siu_unbecome_updater( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + let lock = sagactx.lookup::(INSTANCE_LOCK)?; + + unwind_instance_lock(lock, serialized_authn, authz_instance, &sagactx) + .await; + + // Now that we've released the lock, activate the `instance-updater` + // background task to make sure that a new instance update saga is started + // if the instance still needs to be updated. + sagactx + .user_data() + .nexus() + .background_tasks + .task_instance_updater + .activate(); + + Ok(()) +} + +async fn siu_update_network_config( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let RealParams { ref serialized_authn, ref authz_instance, .. 
} = + sagactx.saga_params::()?; + + let update = + sagactx.lookup::(NETWORK_CONFIG_UPDATE)?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + let osagactx = sagactx.user_data(); + let nexus = osagactx.nexus(); + let log = osagactx.log(); + + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + + match update { + NetworkConfigUpdate::Delete => { + info!( + log, + "instance update: deleting network config"; + "instance_id" => %instance_id, + ); + nexus + .instance_delete_dpd_config(&opctx, authz_instance) + .await + .map_err(ActionError::action_failed)?; + } + NetworkConfigUpdate::Update { active_propolis_id, new_sled_id } => { + info!( + log, + "instance update: ensuring updated instance network config"; + "instance_id" => %instance_id, + "active_propolis_id" => %active_propolis_id, + "new_sled_id" => %new_sled_id, + ); + + let (.., sled) = LookupPath::new(&opctx, osagactx.datastore()) + .sled_id(new_sled_id) + .fetch() + .await + .map_err(ActionError::action_failed)?; + + nexus + .instance_ensure_dpd_config( + &opctx, + instance_id, + &sled.address(), + None, + ) + .await + .map_err(ActionError::action_failed)?; + } + } + + Ok(()) +} + +async fn siu_release_virtual_provisioning( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let RealParams { + ref serialized_authn, ref authz_instance, ref update, .. 
+ } = sagactx.saga_params::()?; + let Some(Deprovision { project_id, cpus_diff, ram_diff }) = + update.deprovision + else { + return Err(ActionError::action_failed( + "a `siu_release_virtual_provisioning` action should never have \ + been added to the DAG if the update does not contain virtual \ + resources to deprovision" + .to_string(), + )); + }; + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + + let log = osagactx.log(); + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + let result = osagactx + .datastore() + .virtual_provisioning_collection_delete_instance( + &opctx, + instance_id, + project_id, + cpus_diff, + ram_diff, + ) + .await; + match result { + Ok(deleted) => { + info!( + log, + "instance update (no VMM): deallocated virtual provisioning \ + resources"; + "instance_id" => %instance_id, + "records_deleted" => ?deleted, + ); + } + // Necessary for idempotency --- the virtual provisioning resources may + // have been deleted already, that's fine. + Err(Error::ObjectNotFound { .. }) => { + info!( + log, + "instance update (no VMM): virtual provisioning record not \ + found; perhaps it has already been deleted?"; + "instance_id" => %instance_id, + ); + } + Err(err) => return Err(ActionError::action_failed(err)), + }; + + Ok(()) +} + +async fn siu_unassign_oximeter_producer( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let RealParams { ref serialized_authn, ref authz_instance, .. 
} = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let log = osagactx.log(); + + info!( + log, + "instance update (no VMM): unassigning oximeter producer"; + "instance_id" => %authz_instance.id(), + ); + crate::app::oximeter::unassign_producer( + osagactx.datastore(), + log, + &opctx, + &authz_instance.id(), + ) + .await + .map_err(ActionError::action_failed) +} + +async fn siu_commit_instance_updates( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let RealParams { serialized_authn, authz_instance, ref update, .. } = + sagactx.saga_params::()?; + let lock = sagactx.lookup::(INSTANCE_LOCK)?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + let log = osagactx.log(); + let nexus = osagactx.nexus(); + + let instance_id = authz_instance.id(); + + debug!( + log, + "instance update: committing new runtime state and unlocking..."; + "instance_id" => %instance_id, + "new_runtime" => ?update.new_runtime, + "lock" => ?lock, + ); + + let did_unlock = osagactx + .datastore() + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &update.new_runtime, + ) + .await + .map_err(ActionError::action_failed)?; + + info!( + log, + "instance update: committed update new runtime state!"; + "instance_id" => %instance_id, + "new_runtime" => ?update.new_runtime, + "did_unlock" => ?did_unlock, + ); + + if update.network_config.is_some() { + // If the update we performed changed networking configuration, activate + // the V2P manager and VPC router RPWs, to ensure that the V2P mapping + // and VPC for this instance are up to date. + // + // We do this here, rather than in the network config update action, so + // that the instance's state in the database reflects the new rather + // than the old state. 
Otherwise, if the networking RPW ran *before* + // writing the new state to CRDB, it will run with the old VMM, rather + // than the new one, and probably do nothing. Then, the networking + // config update would be delayed until the *next* background task + // activation. This way, we ensure that the RPW runs *after* we are in + // the new state. + + nexus.background_tasks.task_v2p_manager.activate(); + nexus.vpc_needed_notify_sleds(); + } + + // Check if the VMM or migration state has changed while the update saga was + // running and whether an additional update saga is now required. If one is + // required, try to start it. + // + // TODO(eliza): it would be nice if we didn't release the lock, determine + // the needed updates, and then start a new start-instance-update saga that + // re-locks the instance --- instead, perhaps we could keep the lock, and + // try to start a new "actual" instance update saga that inherits our lock. + // This way, we could also avoid computing updates required twice. + // But, I'm a bit sketched out by the implications of not committing update + // and dropping the lock in the same operation. This deserves more thought... + if let Err(error) = + chain_update_saga(&sagactx, authz_instance, serialized_authn).await + { + // If starting the new update saga failed, DO NOT unwind this saga and + // undo all the work we've done successfully! Instead, just kick the + // instance-updater background task to try and start a new saga + // eventually, and log a warning. 
+ warn!( + log, + "instance update: failed to start successor saga!"; + "instance_id" => %instance_id, + "error" => %error, + ); + nexus.background_tasks.task_instance_updater.activate(); + } + + Ok(()) +} + +async fn chain_update_saga( + sagactx: &NexusActionContext, + authz_instance: authz::Instance, + serialized_authn: authn::saga::Serialized, +) -> Result<(), anyhow::Error> { + let opctx = + crate::context::op_context_for_saga_action(sagactx, &serialized_authn); + let osagactx = sagactx.user_data(); + let log = osagactx.log(); + + let instance_id = authz_instance.id(); + + // Fetch the state from the database again to see if we should immediately + // run a new saga. + let new_state = osagactx + .datastore() + .instance_fetch_all(&opctx, &authz_instance) + .await + .context("failed to fetch latest snapshot for instance")?; + + if let Some(update) = UpdatesRequired::for_instance(log, &new_state) { + debug!( + log, + "instance update: additional updates required, preparing a \ + successor update saga..."; + "instance_id" => %instance_id, + "update.new_runtime_state" => ?update.new_runtime, + "update.network_config_update" => ?update.network_config, + "update.destroy_active_vmm" => ?update.destroy_active_vmm, + "update.destroy_target_vmm" => ?update.destroy_target_vmm, + "update.deprovision" => ?update.deprovision, + ); + let saga_dag = SagaInstanceUpdate::prepare(&Params { + serialized_authn, + authz_instance, + }) + .context("failed to build new update saga DAG")?; + let saga = osagactx + .nexus() + .sagas + .saga_prepare(saga_dag) + .await + .context("failed to prepare new update saga")?; + saga.start().await.context("failed to start successor update saga")?; + // N.B. that we don't wait for the successor update saga to *complete* + // here. We just want to make sure it starts. + info!( + log, + "instance update: successor update saga started!"; + "instance_id" => %instance_id, + ); + } + + Ok(()) +} + +/// Unlock the instance record while unwinding. 
+/// +/// This is factored out of the actual reverse action, because the `Params` type +/// differs between the start saga and the actual instance update sagas, both of +/// which must unlock the instance in their reverse actions. +async fn unwind_instance_lock( + lock: instance::UpdaterLock, + serialized_authn: &authn::saga::Serialized, + authz_instance: &authz::Instance, + sagactx: &NexusActionContext, +) { + // /!\ EXTREMELY IMPORTANT WARNING /!\ + // + // This comment is a message, and part of a system of messages. Pay + // attention to it! The message is a warning about danger. + // + // The danger is still present in your time, as it was in ours. The danger + // is to the instance record, and it can deadlock. + // + // When unwinding, unlocking an instance MUST succeed at all costs. This is + // of the utmost importance. It's fine for unlocking an instance in a + // forward action to fail, since the reverse action will still unlock the + // instance when the saga is unwound. However, when unwinding, we must + // ensure the instance is unlocked, no matter what. This is because a + // failure to unlock the instance will leave the instance record in a + // PERMANENTLY LOCKED state, since no other update saga will ever be + // able to lock it again. If we can't unlock the instance here, our death + // will ruin the instance record forever and it will only be able to be + // removed by manual operator intervention. That would be...not great. + // + // Therefore, this action will retry the attempt to unlock the instance + // until it either: + // + // - succeeds, and we know the instance is now unlocked. + // - fails *because the instance doesn't exist*, in which case we can die + // happily because it doesn't matter if the instance is actually unlocked. 
+ use dropshot::HttpError; + use futures::{future, TryFutureExt}; + use omicron_common::backoff; + + let osagactx = sagactx.user_data(); + let log = osagactx.log(); + let instance_id = authz_instance.id(); + let opctx = + crate::context::op_context_for_saga_action(sagactx, &serialized_authn); + + debug!( + log, + "instance update: unlocking instance on unwind"; + "instance_id" => %instance_id, + "lock" => ?lock, + ); + + const WARN_DURATION: std::time::Duration = + std::time::Duration::from_secs(20); + + let did_unlock = backoff::retry_notify_ext( + // This is an internal service query to CockroachDB. + backoff::retry_policy_internal_service(), + || { + osagactx + .datastore() + .instance_updater_unlock(&opctx, authz_instance, &lock) + .or_else(|err| future::ready(match err { + // The instance record was not found. It's probably been + // deleted. That's fine, we can now die happily, since we won't + // be leaving the instance permanently locked. + Error::ObjectNotFound { .. } => { + info!( + log, + "instance update: giving up on unlocking instance, \ + as it no longer exists"; + "instance_id" => %instance_id, + "lock" => ?lock, + ); + + Ok(false) + }, + // All other errors should be retried. 
+ _ => Err(backoff::BackoffError::transient(err)), + })) + }, + |error, call_count, total_duration| { + let http_error = HttpError::from(error.clone()); + if http_error.status_code.is_client_error() { + error!( + log, + "instance update: client error while unlocking instance \ + (likely requires operator intervention), retrying anyway"; + "instance_id" => %instance_id, + "lock" => ?lock, + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else if total_duration > WARN_DURATION { + warn!( + log, + "instance update: server error while unlocking instance, \ + retrying"; + "instance_id" => %instance_id, + "lock" => ?lock, + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else { + info!( + log, + "instance update: server error while unlocking instance, \ + retrying"; + "instance_id" => %instance_id, + "lock" => ?lock, + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } + }, + ) + .await + .expect("errors should be retried indefinitely"); + + info!( + log, + "instance update: unlocked instance while unwinding"; + "instance_id" => %instance_id, + "lock" => ?lock, + "did_unlock" => did_unlock, + ); +} + +#[cfg(test)] +mod test { + use super::*; + use crate::app::db::model::Instance; + use crate::app::db::model::VmmRuntimeState; + use crate::app::saga::create_saga_dag; + use crate::app::sagas::test_helpers; + use crate::app::OpContext; + use crate::external_api::params; + use chrono::Utc; + use dropshot::test_util::ClientTestContext; + use nexus_db_queries::db::datastore::InstanceAndActiveVmm; + use nexus_db_queries::db::lookup::LookupPath; + use nexus_test_utils::resource_helpers::{ + create_default_ip_pool, create_project, object_create, + }; + use nexus_test_utils_macros::nexus_test; + use omicron_common::api::internal::nexus::{ + MigrationRuntimeState, MigrationState, Migrations, + }; + use omicron_uuid_kinds::GenericUuid; + use 
omicron_uuid_kinds::PropolisUuid; + use omicron_uuid_kinds::SledUuid; + use std::sync::Arc; + use std::sync::Mutex; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + const PROJECT_NAME: &str = "test-project"; + const INSTANCE_NAME: &str = "test-instance"; + + // Most Nexus sagas have test suites that follow a simple formula: there's + // usually a `test_saga_basic_usage_succeeds` that just makes sure the saga + // basically works, and then a `test_actions_succeed_idempotently` test that + // does the same thing, but runs every action twice. Then, there's usually a + // `test_action_failures_can_unwind` test, and often also a + // `test_action_failures_can_unwind_idempotently` test. + // + // For the instance-update saga, the test suite is a bit more complicated. + // This saga will do a number of different things depending on the ways in + // which the instance's migration and VMM records have changed since the + // last update. Therefore, we want to test all of the possible branches + // through the saga: + // + // 1. active VMM destroyed + // 2. migration source completed + // 3. migration target completed + // 4. migration source VMM completed and was destroyed, + // 5. migration target failed + // 6. 
migration source failed + + async fn setup_test_project(client: &ClientTestContext) -> Uuid { + create_default_ip_pool(&client).await; + let project = create_project(&client, PROJECT_NAME).await; + project.identity.id + } + + async fn create_instance( + client: &ClientTestContext, + ) -> omicron_common::api::external::Instance { + use omicron_common::api::external::{ + ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, + }; + let instances_url = format!("/v1/instances?project={}", PROJECT_NAME); + object_create( + client, + &instances_url, + &params::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: INSTANCE_NAME.parse().unwrap(), + description: format!("instance {:?}", INSTANCE_NAME), + }, + ncpus: InstanceCpuCount(1), + memory: ByteCount::from_gibibytes_u32(1), + hostname: INSTANCE_NAME.parse().unwrap(), + user_data: b"#cloud-config".to_vec(), + ssh_public_keys: Some(Vec::new()), + network_interfaces: + params::InstanceNetworkInterfaceAttachment::None, + external_ips: vec![], + disks: vec![], + start: true, + }, + ) + .await + } + + #[track_caller] + fn assert_instance_unlocked(instance: &Instance) { + assert_eq!( + instance.updater_id, None, + "instance updater lock should have been released" + ) + } + + // Asserts that an instance record is in a consistent state (e.g., that all + // state changes performed by the update saga are either applied atomically, + // or have not been applied). This is particularly important to check when a + // saga unwinds. 
+ #[track_caller] + fn assert_instance_record_is_consistent(instance: &Instance) { + let run_state = instance.runtime(); + match run_state.nexus_state { + InstanceState::Vmm => assert!( + run_state.propolis_id.is_some(), + "if the instance record is in the `Vmm` state, it must have \ + an active VMM\ninstance: {instance:#?}", + ), + state => assert_eq!( + run_state.propolis_id, None, + "if the instance record is in the `{state:?}` state, it must \ + not have an active VMM\ninstance: {instance:#?}", + ), + } + + if run_state.dst_propolis_id.is_some() { + assert!( + run_state.migration_id.is_some(), + "if the instance record has a target VMM ID, then it must \ + also have a migration\ninstance: {instance:#?}", + ); + } + + if run_state.migration_id.is_some() { + assert_eq!( + run_state.nexus_state, + InstanceState::Vmm, + "if an instance is migrating, it must be in the VMM state\n\ + instance: {instance:#?}", + ); + } + } + + async fn after_unwinding( + parent_saga_id: Option, + cptestctx: &ControlPlaneTestContext, + ) { + let state = test_helpers::instance_fetch_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; + let instance = state.instance(); + + // Unlike most other sagas, we actually don't unwind the work performed + // by an update saga, as we would prefer that at least some of it + // succeeds. The only thing that *needs* to be rolled back when an + // instance-update saga fails is that the updater lock *MUST* either + // remain locked by the parent start saga, or have been released so that + // a subsequent saga can run. See the section "on unwinding" in the + // documentation comment at the top of the instance-update module for + // details. + if let Some(parent_saga_id) = parent_saga_id { + if let Some(actual_lock_id) = instance.updater_id { + assert_eq!( + actual_lock_id, parent_saga_id, + "if the instance is locked after unwinding, it must be \ + locked by the `start-instance-update` saga, and not the \ + unwinding child saga!" 
+ ); + } + } else { + assert_instance_unlocked(instance); + } + + // Additionally, we assert that the instance record is in a + // consistent state, ensuring that all changes to the instance record + // are atomic. This is important *because* we won't roll back changes + // to the instance: if we're going to leave them in place, they can't + // be partially applied, even if we unwound partway through the saga. + assert_instance_record_is_consistent(instance); + + // Throw away the instance so that subsequent unwinding + // tests also operate on an instance in the correct + // preconditions to actually run the saga path we mean + // to test. + let instance_id = InstanceUuid::from_untyped_uuid(instance.id()); + // Depending on where we got to in the update saga, the + // sled-agent may or may not actually be willing to stop + // the instance, so just manually update the DB record + // into a state where we can delete it to make sure + // everything is cleaned up for the next run. + cptestctx + .server + .server_context() + .nexus + .datastore() + .instance_update_runtime( + &instance_id, + &InstanceRuntimeState { + time_updated: Utc::now(), + gen: Generation(instance.runtime().gen.0.next()), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, + ) + .await + .unwrap(); + + test_helpers::instance_delete_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; + } + + // === Active VMM destroyed tests === + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + + // Run the instance-update saga. 
+ let nexus = &cptestctx.server.server_context().nexus; + nexus + .sagas + .saga_execute::(params) + .await + .expect("update saga should succeed"); + + // Assert that the saga properly cleaned up the active VMM's resources. + verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + + // Build the saga DAG with the provided test parameters + let real_params = make_real_params( + cptestctx, + &test_helpers::test_opctx(cptestctx), + params, + ) + .await; + let dag = + create_saga_dag::(real_params).unwrap(); + + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + // Assert that the saga properly cleaned up the active VMM's resources. + verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let nexus = &cptestctx.server.server_context().nexus; + let opctx = test_helpers::test_opctx(cptestctx); + // Stupid side channel for passing the expected parent start saga's lock + // ID into the "after unwinding" method, so that it can check that the + // lock is either released or was never acquired. 
+ let parent_saga_id = Arc::new(Mutex::new(None)); + + test_helpers::action_failure_can_unwind::< + SagaDoActualInstanceUpdate, + _, + _, + >( + nexus, + || { + let parent_saga_id = parent_saga_id.clone(); + let opctx = &opctx; + Box::pin(async move { + let (_, start_saga_params) = + setup_active_vmm_destroyed_test(cptestctx).await; + + // Since the unwinding test will test unwinding from each + // individual saga node *in the saga DAG constructed by the + // provided params*, we need to give it the "real saga"'s + // params rather than the start saga's params. Otherwise, + // we're just testing the unwinding behavior of the trivial + // two-node start saga + let real_params = + make_real_params(cptestctx, opctx, start_saga_params) + .await; + *parent_saga_id.lock().unwrap() = + Some(real_params.orig_lock.updater_id); + real_params + }) + }, + || { + let parent_saga_id = parent_saga_id.clone(); + Box::pin(async move { + let parent_saga_id = + parent_saga_id.lock().unwrap().take().expect( + "parent saga's lock ID must have been set by the \ + `before_saga` function; this is a test bug", + ); + after_unwinding(Some(parent_saga_id), cptestctx).await + }) + }, + &cptestctx.logctx.log, + ) + .await; + } + + // === idempotency and unwinding tests for the start saga === + + // We only do these tests with an "active VMM destroyed" precondition, since + // the behavior of the `start-instance-update` saga does *not* depend on the + // specific update to perform, and it seems unnecessary to run the start + // saga's tests against every possible migration outcome combination tested + // below. 
+ + #[nexus_test(server = crate::Server)] + async fn test_start_saga_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + let dag = create_saga_dag::(params).unwrap(); + + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + // Assert that the saga properly cleaned up the active VMM's resources. + verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_start_saga_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let nexus = &cptestctx.server.server_context().nexus; + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + let (_, params) = + setup_active_vmm_destroyed_test(cptestctx).await; + params + }) + }, + // Don't pass a parent saga ID here because the instance MUST be + // unlocked if the whole start saga unwinds. + || Box::pin(after_unwinding(None, cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + + // --- test helpers --- + + async fn setup_active_vmm_destroyed_test( + cptestctx: &ControlPlaneTestContext, + ) -> (InstanceAndActiveVmm, Params) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore().clone(); + + let opctx = test_helpers::test_opctx(cptestctx); + let instance = create_instance(client).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + // Poke the instance to get it into the Running state. + test_helpers::instance_simulate(cptestctx, &instance_id).await; + + let state = test_helpers::instance_fetch(cptestctx, instance_id).await; + // The instance should have an active VMM. 
+ let instance_runtime = state.instance().runtime(); + assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); + assert!(instance_runtime.propolis_id.is_some()); + // Once we destroy the active VMM, we'll assert that the virtual + // provisioning and sled resource records it owns have been deallocated. + // In order to ensure we're actually testing the correct thing, let's + // make sure that those records exist now --- if not, the assertions + // later won't mean anything! + assert!( + !test_helpers::no_virtual_provisioning_resource_records_exist( + cptestctx + ) + .await, + "we can't assert that a destroyed VMM instance update deallocates \ + virtual provisioning records if none exist!", + ); + assert!( + !test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx) + .await, + "we can't assert that a destroyed VMM instance update deallocates \ + virtual provisioning records if none exist!", + ); + assert!( + !test_helpers::no_sled_resource_instance_records_exist(cptestctx) + .await, + "we can't assert that a destroyed VMM instance update deallocates \ + sled resource records if none exist!" + ); + + // Now, destroy the active VMM + let vmm = state.vmm().as_ref().unwrap(); + let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); + datastore + .vmm_update_runtime( + &vmm_id, + &VmmRuntimeState { + time_state_updated: Utc::now(), + gen: Generation(vmm.runtime.gen.0.next()), + state: VmmState::Destroyed, + }, + ) + .await + .unwrap(); + + let (_, _, authz_instance, ..) 
= LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .fetch() + .await + .expect("test instance should be present in datastore"); + let params = Params { + authz_instance, + serialized_authn: authn::saga::Serialized::for_opctx(&opctx), + }; + (state, params) + } + + async fn verify_active_vmm_destroyed( + cptestctx: &ControlPlaneTestContext, + instance_id: Uuid, + ) { + let state = test_helpers::instance_fetch( + cptestctx, + InstanceUuid::from_untyped_uuid(instance_id), + ) + .await; + + // The instance's active VMM has been destroyed, so its state should + // transition to `NoVmm`, and its active VMM ID should be unlinked. The + // virtual provisioning and sled resources allocated to the instance + // should be deallocated. + assert_instance_unlocked(state.instance()); + assert!(state.vmm().is_none()); + let instance_runtime = state.instance().runtime(); + assert_eq!(instance_runtime.nexus_state, InstanceState::NoVmm); + assert!(instance_runtime.propolis_id.is_none()); + assert!( + test_helpers::no_virtual_provisioning_resource_records_exist( + cptestctx + ) + .await + ); + assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); + assert!( + test_helpers::no_sled_resource_instance_records_exist(cptestctx) + .await + ); + } + + // === migration source completed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_completed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + MigrationOutcome::default() + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_completed_actions_succeed_idempotently( + cptestctx: 
&ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_completed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .source(MigrationState::Completed, VmmState::Stopping) + .run_unwinding_test(cptestctx) + .await; + } + + // === migration target completed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_completed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_completed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_completed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .run_unwinding_test(cptestctx) + .await; + } + + // === migration completed and source 
destroyed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_source_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .source(MigrationState::Completed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_source_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .source(MigrationState::Completed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_source_destroyed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .source(MigrationState::Completed, VmmState::Destroyed) + .run_unwinding_test(cptestctx) + .await; + } + + // === migration failed, target not destroyed === + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Failed) + .source(MigrationState::Failed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + 
.run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Failed) + .source(MigrationState::Failed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Failed) + .source(MigrationState::Failed, VmmState::Running) + .run_unwinding_test(cptestctx) + .await; + } + + // === migration failed, migration target destroyed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Running) + 
.setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_destroyed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Running) + .run_unwinding_test(cptestctx) + .await; + } + + // === migration failed, migration source destroyed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_failed_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::InProgress, VmmState::Running) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_failed_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::InProgress, VmmState::Running) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_failed_destroyed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .target(MigrationState::InProgress, VmmState::Running) + .source(MigrationState::Failed, VmmState::Destroyed) + .run_unwinding_test(cptestctx) + .await; + } + + // === migration failed, source and 
target both destroyed === + + #[nexus_test(server = crate::Server)] + async fn test_migration_failed_everyone_died_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_failed_everyone_died_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_failed_everyone_died_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Destroyed) + .run_unwinding_test(cptestctx) + .await; + } + + // === migration completed, but then the target was destroyed === + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_but_target_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Destroyed) + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, 
&other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_but_target_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Destroyed) + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_but_target_destroyed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Destroyed) + .source(MigrationState::Completed, VmmState::Stopping) + .run_unwinding_test(cptestctx) + .await; + } + + #[derive(Clone, Copy, Default)] + struct MigrationOutcome { + source: Option<(MigrationState, VmmState)>, + target: Option<(MigrationState, VmmState)>, + failed: bool, + } + + impl MigrationOutcome { + fn source(self, migration: MigrationState, vmm: VmmState) -> Self { + let failed = self.failed + || migration == MigrationState::Failed + || vmm == VmmState::Failed; + Self { source: Some((migration, vmm)), failed, ..self } + } + + fn target(self, migration: MigrationState, vmm: VmmState) -> Self { + let failed = self.failed + || migration == MigrationState::Failed + || vmm == VmmState::Failed; + Self { target: Some((migration, vmm)), failed, ..self } + } + + async fn setup_test( + self, + cptestctx: &ControlPlaneTestContext, + other_sleds: &[(SledUuid, omicron_sled_agent::sim::Server)], + ) -> MigrationTest { + MigrationTest::setup(self, cptestctx, other_sleds).await + } + + async fn run_unwinding_test( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = 
&cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = + setup_test_project(&cptestctx.external_client).await; + let opctx = test_helpers::test_opctx(&cptestctx); + + // Stupid side channel for passing the expected parent start saga's lock + // ID into the "after unwinding" method, so that it can check that the + // lock is either released or was never acquired. + let parent_saga_id = Arc::new(Mutex::new(None)); + + test_helpers::action_failure_can_unwind::< + SagaDoActualInstanceUpdate, + _, + _, + >( + nexus, + || { + let parent_saga_id = parent_saga_id.clone(); + let other_sleds = &other_sleds; + let opctx = &opctx; + Box::pin(async move { + // Since the unwinding test will test unwinding from each + // individual saga node *in the saga DAG constructed by the + // provided params*, we need to give it the "real saga"'s + // params rather than the start saga's params. Otherwise, + // we're just testing the unwinding behavior of the trivial + // two-node start saga. 
+ let start_saga_params = self + .setup_test(cptestctx, other_sleds) + .await + .start_saga_params(); + let real_params = make_real_params( + cptestctx, + opctx, + start_saga_params, + ) + .await; + *parent_saga_id.lock().unwrap() = + Some(real_params.orig_lock.updater_id); + real_params + }) + }, + || { + let parent_saga_id = parent_saga_id.clone(); + Box::pin(async move { + let parent_saga_id = + parent_saga_id.lock().unwrap().take().expect( + "parent saga's lock ID must have been set by \ + the `before_saga` function; this is a test \ + bug", + ); + after_unwinding(Some(parent_saga_id), cptestctx).await + }) + }, + &cptestctx.logctx.log, + ) + .await; + } + } + + struct MigrationTest { + outcome: MigrationOutcome, + instance_id: InstanceUuid, + initial_state: InstanceGestalt, + authz_instance: authz::Instance, + opctx: OpContext, + } + + impl MigrationTest { + fn target_vmm_id(&self) -> Uuid { + self.initial_state + .target_vmm + .as_ref() + .expect("migrating instance must have a target VMM") + .id + } + + fn src_vmm_id(&self) -> Uuid { + self.initial_state + .active_vmm + .as_ref() + .expect("migrating instance must have a source VMM") + .id + } + + async fn setup( + outcome: MigrationOutcome, + cptestctx: &ControlPlaneTestContext, + other_sleds: &[(SledUuid, omicron_sled_agent::sim::Server)], + ) -> Self { + use crate::app::sagas::instance_migrate; + + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + + let opctx = test_helpers::test_opctx(cptestctx); + let instance = create_instance(client).await; + let instance_id = + InstanceUuid::from_untyped_uuid(instance.identity.id); + + // Poke the instance to get it into the Running state. 
+ let state = + test_helpers::instance_fetch(cptestctx, instance_id).await; + test_helpers::instance_simulate(cptestctx, &instance_id).await; + + let vmm = state.vmm().as_ref().unwrap(); + let dst_sled_id = + test_helpers::select_first_alternate_sled(vmm, other_sleds); + let params = instance_migrate::Params { + serialized_authn: authn::saga::Serialized::for_opctx(&opctx), + instance: state.instance().clone(), + src_vmm: vmm.clone(), + migrate_params: params::InstanceMigrate { + dst_sled_id: dst_sled_id.into_untyped_uuid(), + }, + }; + + nexus + .sagas + .saga_execute::(params) + .await + .expect("Migration saga should succeed"); + + // Poke the destination sled just enough to make it appear to have a VMM. + test_helpers::instance_single_step_on_sled( + cptestctx, + &instance_id, + &dst_sled_id, + ) + .await; + + let (_, _, authz_instance, ..) = + LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .fetch() + .await + .expect("test instance should be present in datastore"); + let initial_state = datastore + .instance_fetch_all(&opctx, &authz_instance) + .await + .expect("test instance should be present in datastore"); + + let this = Self { + authz_instance, + initial_state, + outcome, + opctx, + instance_id, + }; + if let Some((migration_state, vmm_state)) = this.outcome.source { + this.update_src_state(cptestctx, vmm_state, migration_state) + .await; + } + + if let Some((migration_state, vmm_state)) = this.outcome.target { + this.update_target_state(cptestctx, vmm_state, migration_state) + .await; + } + + this + } + + async fn run_saga_basic_usage_succeeds_test( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + // Run the instance-update saga. 
+ let nexus = &cptestctx.server.server_context().nexus; + nexus + .sagas + .saga_execute::(self.start_saga_params()) + .await + .expect("update saga should succeed"); + + // Check the results + self.verify(cptestctx).await; + } + + async fn run_actions_succeed_idempotently_test( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + let params = make_real_params( + cptestctx, + &self.opctx, + self.start_saga_params(), + ) + .await; + + // Build the saga DAG with the provided test parameters + let dag = + create_saga_dag::(params).unwrap(); + + // Run the actions-succeed-idempotently test + test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + // Check the results + self.verify(cptestctx).await; + } + + async fn update_src_state( + &self, + cptestctx: &ControlPlaneTestContext, + vmm_state: VmmState, + migration_state: MigrationState, + ) { + let src_vmm = self + .initial_state + .active_vmm + .as_ref() + .expect("must have an active VMM"); + let vmm_id = PropolisUuid::from_untyped_uuid(src_vmm.id); + let new_runtime = nexus_db_model::VmmRuntimeState { + time_state_updated: Utc::now(), + gen: Generation(src_vmm.runtime.gen.0.next()), + state: vmm_state, + }; + + let migration = self + .initial_state + .migration + .as_ref() + .expect("must have an active migration"); + let migration_out = MigrationRuntimeState { + migration_id: migration.id, + state: migration_state, + gen: migration.source_gen.0.next(), + time_updated: Utc::now(), + }; + let migrations = Migrations { + migration_in: None, + migration_out: Some(&migration_out), + }; + + info!( + cptestctx.logctx.log, + "updating source VMM state..."; + "propolis_id" => %vmm_id, + "new_runtime" => ?new_runtime, + "migration_out" => ?migration_out, + ); + + cptestctx + .server + .server_context() + .nexus + .datastore() + .vmm_and_migration_update_runtime( + &self.opctx, + vmm_id, + &new_runtime, + migrations, + ) + .await + .expect("updating migration source 
state should succeed"); + } + + async fn update_target_state( + &self, + cptestctx: &ControlPlaneTestContext, + vmm_state: VmmState, + migration_state: MigrationState, + ) { + let target_vmm = self + .initial_state + .target_vmm + .as_ref() + .expect("must have a target VMM"); + let vmm_id = PropolisUuid::from_untyped_uuid(target_vmm.id); + let new_runtime = nexus_db_model::VmmRuntimeState { + time_state_updated: Utc::now(), + gen: Generation(target_vmm.runtime.gen.0.next()), + state: vmm_state, + }; + + let migration = self + .initial_state + .migration + .as_ref() + .expect("must have an active migration"); + let migration_in = MigrationRuntimeState { + migration_id: migration.id, + state: migration_state, + gen: migration.target_gen.0.next(), + time_updated: Utc::now(), + }; + let migrations = Migrations { + migration_in: Some(&migration_in), + migration_out: None, + }; + + info!( + cptestctx.logctx.log, + "updating target VMM state..."; + "propolis_id" => %vmm_id, + "new_runtime" => ?new_runtime, + "migration_in" => ?migration_in, + ); + + cptestctx + .server + .server_context() + .nexus + .datastore() + .vmm_and_migration_update_runtime( + &self.opctx, + vmm_id, + &new_runtime, + migrations, + ) + .await + .expect("updating migration target state should succeed"); + } + + fn start_saga_params(&self) -> Params { + Params { + authz_instance: self.authz_instance.clone(), + serialized_authn: authn::saga::Serialized::for_opctx( + &self.opctx, + ), + } + } + + async fn verify(&self, cptestctx: &ControlPlaneTestContext) { + info!( + cptestctx.logctx.log, + "checking update saga results after migration"; + "source_outcome" => ?dbg!(self.outcome.source.as_ref()), + "target_outcome" => ?dbg!(self.outcome.target.as_ref()), + "migration_failed" => dbg!(self.outcome.failed), + ); + + use test_helpers::*; + let state = + test_helpers::instance_fetch(cptestctx, self.instance_id).await; + let instance = state.instance(); + let instance_runtime = instance.runtime(); + + let 
active_vmm_id = instance_runtime.propolis_id; + + assert_instance_unlocked(instance); + assert_instance_record_is_consistent(instance); + + let target_destroyed = self + .outcome + .target + .as_ref() + .map(|(_, state)| state == &VmmState::Destroyed) + .unwrap_or(false); + + if self.outcome.failed { + assert_eq!( + instance_runtime.migration_id, None, + "migration ID must be unset when a migration has failed" + ); + assert_eq!( + instance_runtime.dst_propolis_id, None, + "target VMM ID must be unset when a migration has failed" + ); + } else { + if dbg!(target_destroyed) { + assert_eq!( + active_vmm_id, None, + "if the target VMM was destroyed, it should be unset, \ + even if a migration succeeded", + ); + assert_eq!( + instance_runtime.nexus_state, + InstanceState::NoVmm + ); + } else { + assert_eq!( + active_vmm_id, + Some(self.target_vmm_id()), + "target VMM must be in the active VMM position after \ + migration success", + ); + + assert_eq!( + instance_runtime.nexus_state, + InstanceState::Vmm + ); + } + if self + .outcome + .target + .as_ref() + .map(|(state, _)| state == &MigrationState::Completed) + .unwrap_or(false) + { + assert_eq!( + instance_runtime.dst_propolis_id, None, + "target VMM ID must be unset once target VMM reports success", + ); + assert_eq!( + instance_runtime.migration_id, None, + "migration ID must be unset once target VMM reports success", + ); + } else { + assert_eq!( + instance_runtime.dst_propolis_id, + Some(self.target_vmm_id()), + "target VMM ID must remain set until the target VMM reports success", + ); + assert_eq!( + instance_runtime.migration_id, + self.initial_state.instance.runtime().migration_id, + "migration ID must remain set until target VMM reports success", + ); + } + } + + let src_destroyed = self + .outcome + .source + .as_ref() + .map(|(_, state)| state == &VmmState::Destroyed) + .unwrap_or(false); + assert_eq!( + self.src_resource_records_exist(cptestctx).await, + !src_destroyed, + "source VMM should exist if and 
only if the source hasn't been destroyed", + ); + + assert_eq!( + self.target_resource_records_exist(cptestctx).await, + !target_destroyed, + "target VMM should exist if and only if the target hasn't been destroyed", + ); + + // VThe instance has a VMM if (and only if): + let has_vmm = if self.outcome.failed { + // If the migration failed, the instance should have a VMM if + // and only if the source VMM is still okay. It doesn't matter + // whether the target is still there or not, because we didn't + // migrate to it successfully. + !src_destroyed + } else { + // Otherwise, if the migration succeeded, the instance should be + // on the target VMM, and virtual provisioning records should + // exist as long as the + !target_destroyed + }; + + assert_eq!( + no_virtual_provisioning_resource_records_exist(cptestctx).await, + !has_vmm, + "virtual provisioning resource records must exist as long as \ + the instance has a VMM", + ); + assert_eq!( + no_virtual_provisioning_collection_records_using_instances( + cptestctx + ) + .await, + !has_vmm, + "virtual provisioning collection records must exist as long \ + as the instance has a VMM", + ); + + let instance_state = + if has_vmm { InstanceState::Vmm } else { InstanceState::NoVmm }; + assert_eq!(instance_runtime.nexus_state, instance_state); + } + + async fn src_resource_records_exist( + &self, + cptestctx: &ControlPlaneTestContext, + ) -> bool { + test_helpers::sled_resources_exist_for_vmm( + cptestctx, + PropolisUuid::from_untyped_uuid(self.src_vmm_id()), + ) + .await + } + + async fn target_resource_records_exist( + &self, + cptestctx: &ControlPlaneTestContext, + ) -> bool { + test_helpers::sled_resources_exist_for_vmm( + cptestctx, + PropolisUuid::from_untyped_uuid(self.target_vmm_id()), + ) + .await + } + } + + async fn make_real_params( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + Params { authz_instance, serialized_authn }: Params, + ) -> RealParams { + let nexus = 
&cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let log = &cptestctx.logctx.log; + + let lock_id = Uuid::new_v4(); + let orig_lock = datastore + .instance_updater_lock(opctx, &authz_instance, lock_id) + .await + .expect("must lock instance"); + let state = datastore + .instance_fetch_all(&opctx, &authz_instance) + .await + .expect("instance must exist"); + let update = UpdatesRequired::for_instance(&log, &state) + .expect("the test's precondition should require updates"); + + info!( + log, + "made params for real saga"; + "instance" => ?state.instance, + "active_vmm" => ?state.active_vmm, + "target_vmm" => ?state.target_vmm, + "migration" => ?state.migration, + "update.new_runtime" => ?update.new_runtime, + "update.destroy_active_vmm" => ?update.destroy_active_vmm, + "update.destroy_target_vmm" => ?update.destroy_target_vmm, + "update.deprovision" => ?update.deprovision, + "update.network_config" => ?update.network_config, + ); + + RealParams { authz_instance, serialized_authn, update, orig_lock } + } +} diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs new file mode 100644 index 0000000000..fbd8cbffc2 --- /dev/null +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -0,0 +1,308 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +// instance update start saga + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, RealParams, + SagaDoActualInstanceUpdate, SagaInitError, UpdatesRequired, + ACTION_GENERATE_ID, INSTANCE_LOCK, INSTANCE_LOCK_ID, +}; +use crate::app::saga; +use crate::app::sagas::declare_saga_actions; +use nexus_db_queries::db::datastore::instance; +use nexus_db_queries::{authn, authz}; +use serde::{Deserialize, Serialize}; +use steno::{ActionError, DagBuilder, Node, SagaResultErr}; +use uuid::Uuid; + +/// Parameters to the start instance update saga. +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub(crate) serialized_authn: authn::saga::Serialized, + + pub(crate) authz_instance: authz::Instance, +} + +// instance update saga: actions + +declare_saga_actions! { + start_instance_update; + + // Acquire the "instance updater" lock with this saga's ID if no other saga + // is currently updating the instance. + LOCK_INSTANCE -> "updater_lock" { + + siu_lock_instance + - siu_lock_instance_undo + } + + // Fetch the instance and VMM's state, and start the "real" instance update saga. + // N.B. that this must be performed as a separate action from + // `LOCK_INSTANCE`, so that if the lookup fails, we will still unwind the + // `LOCK_INSTANCE` action and release the lock.
+ FETCH_STATE_AND_START_REAL_SAGA -> "state" { + + siu_fetch_state_and_start_real_saga + } +} + +// instance update saga: definition + +#[derive(Debug)] +pub(crate) struct SagaInstanceUpdate; +impl NexusSaga for SagaInstanceUpdate { + const NAME: &'static str = "start-instance-update"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + start_instance_update_register_actions(registry); + super::SagaDoActualInstanceUpdate::register_actions(registry); + super::destroyed::SagaDestroyVmm::register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + builder.append(Node::action( + INSTANCE_LOCK_ID, + "GenerateInstanceLockId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(lock_instance_action()); + builder.append(fetch_state_and_start_real_saga_action()); + + Ok(builder.build()?) + } +} + +// start instance update saga: action implementations + +async fn siu_lock_instance( + sagactx: NexusActionContext, +) -> Result, ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + info!( + osagactx.log(), + "instance update: attempting to lock instance"; + "instance_id" => %authz_instance.id(), + "saga_id" => %lock_id, + ); + + let locked = osagactx + .datastore() + .instance_updater_lock(&opctx, authz_instance, lock_id) + .await; + match locked { + Ok(lock) => Ok(Some(lock)), + // Don't return an error if we can't take the lock. This saga will + // simply not start the real instance update saga, rather than having to unwind. + Err(instance::UpdaterLockError::AlreadyLocked) => Ok(None), + // Okay, that's a real error. Time to die! 
+ Err(instance::UpdaterLockError::Query(e)) => { + Err(ActionError::action_failed(e)) + } + } +} + +async fn siu_lock_instance_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let Params { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + + // If the instance lock node in the saga context was `None`, that means + // we didn't acquire the lock, and we can die happily without having to + // worry about unlocking the instance. It would be pretty surprising if this + // saga unwound without having acquired the lock, but...whatever. + if let Some(lock) = + sagactx.lookup::>(INSTANCE_LOCK)? + { + super::unwind_instance_lock( + lock, + serialized_authn, + authz_instance, + &sagactx, + ) + .await; + } + + Ok(()) +} + +async fn siu_fetch_state_and_start_real_saga( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let Params { serialized_authn, authz_instance, .. } = + sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let instance_id = authz_instance.id(); + let log = osagactx.log(); + + // Did we get the lock? If so, we can start the next saga, otherwise, just + // exit gracefully. + let Some(orig_lock) = + sagactx.lookup::>(INSTANCE_LOCK)? + else { + info!( + log, + "instance update: instance is already locked! doing nothing..."; + "instance_id" => %instance_id, + "saga_id" => %lock_id, + ); + return Ok(()); + }; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + let datastore = osagactx.datastore(); + let nexus = osagactx.nexus(); + + let state = datastore + .instance_fetch_all(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)?; + + // Determine what updates are required based on the instance's current + // state snapshot. If there are updates to perform, execute the "real" + // update saga. 
Otherwise, if we don't need to do anything else, simply + // release the lock and finish this saga. + if let Some(update) = UpdatesRequired::for_instance(log, &state) { + info!( + log, + "instance update: starting real update saga..."; + "instance_id" => %instance_id, + "current.runtime_state" => ?state.instance.runtime(), + "current.migration" => ?state.migration, + "current.active_vmm" => ?state.active_vmm, + "current.target_vmm" => ?state.target_vmm, + "update.new_runtime_state" => ?update.new_runtime, + "update.network_config_update" => ?update.network_config, + "update.destroy_active_vmm" => ?update.destroy_active_vmm, + "update.destroy_target_vmm" => ?update.destroy_target_vmm, + "update.deprovision" => update.deprovision.is_some(), + ); + // Prepare the child saga. + // + // /!\ WARNING /!\ This is really finicky: whether or not the start saga + // should unwind depends on *whether the child `instance-update` saga + // has advanced far enough to have inherited the lock or not. If the + // child has not inherited the lock, we *must* unwind to ensure the lock + // is dropped. + // + // Note that we *don't* use `SagaExecutor::saga_execute`, which prepares + // the child saga and waits for it to complete. That function wraps all + // the errors returned by this whole process in an external API error, + // which makes it difficult for us to figure out *why* the child saga + // failed, and whether we should unwind or not. + + let dag = + saga::create_saga_dag::(RealParams { + serialized_authn, + authz_instance, + update, + orig_lock, + }) + // If we can't build a DAG for the child saga, we should unwind, so + // that we release the lock. + .map_err(|e| { + nexus.background_tasks.task_instance_updater.activate(); + ActionError::action_failed(e) + })?; + let child_result = nexus + .sagas + .saga_prepare(dag) + .await + // Similarly, if we can't prepare the child saga, we need to unwind + // and release the lock. 
+ .map_err(|e| { + nexus.background_tasks.task_instance_updater.activate(); + ActionError::action_failed(e) + })? + .start() + .await + // And, if we can't start it, we need to unwind. + .map_err(|e| { + nexus.background_tasks.task_instance_updater.activate(); + ActionError::action_failed(e) + })? + .wait_until_stopped() + .await + .into_raw_result(); + match child_result.kind { + Ok(_) => { + debug!( + log, + "instance update: child saga completed successfully"; + "instance_id" => %instance_id, + "child_saga_id" => %child_result.saga_id, + ) + } + // Check if the child saga failed to inherit the updater lock from + // this saga. + Err(SagaResultErr { + error_node_name, + error_source: ActionError::ActionFailed { source_error }, + .. + }) if error_node_name.as_ref() == super::INSTANCE_LOCK => { + if let Ok(instance::UpdaterLockError::AlreadyLocked) = + serde_json::from_value(source_error) + { + // Inheriting the lock failed because the lock was held by another + // saga. If this is the case, that's fine: this action must have + // executed more than once, and created multiple child sagas. No big deal. + return Ok(()); + } else { + // Otherwise, the child saga could not inherit the lock for + // some other reason. That means we MUST unwind to ensure + // the lock is released. + return Err(ActionError::action_failed( + "child saga failed to inherit lock".to_string(), + )); + } + } + Err(error) => { + warn!( + log, + "instance update: child saga failed, unwinding..."; + "instance_id" => %instance_id, + "child_saga_id" => %child_result.saga_id, + "error" => ?error, + ); + + // If the real saga failed, kick the background task. If the real + // saga failed because this action was executed twice and the second + // child saga couldn't lock the instance, that's fine, because the + // background task will only start new sagas for instances whose DB + // state actually *needs* an update.
+ nexus.background_tasks.task_instance_updater.activate(); + return Err(error.error_source); + } + } + } else { + info!( + log, + "instance update: no updates required, releasing lock."; + "instance_id" => %authz_instance.id(), + "current.runtime_state" => ?state.instance.runtime(), + "current.migration" => ?state.migration, + "current.active_vmm" => ?state.active_vmm, + "current.target_vmm" => ?state.target_vmm, + ); + datastore + .instance_updater_unlock(&opctx, &authz_instance, &orig_lock) + .await + .map_err(ActionError::action_failed)?; + } + + Ok(()) +} diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 17f43b4950..0c57a5b2dc 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -33,6 +33,7 @@ pub mod instance_ip_attach; pub mod instance_ip_detach; pub mod instance_migrate; pub mod instance_start; +pub mod instance_update; pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; @@ -156,6 +157,9 @@ fn make_action_registry() -> ActionRegistry { ::register_actions( &mut registry, ); + ::register_actions( + &mut registry, + ); ::register_actions( &mut registry, ); diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 76a82e7491..eeb14091b2 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -2308,6 +2308,20 @@ mod test { PROJECT_NAME, ) .await; + // Wait until the instance has advanced to the `NoVmm` + // state before deleting it. This may not happen + // immediately, as the `Nexus::cpapi_instances_put` API + // endpoint simply writes the new VMM state to the + // database and *starts* an `instance-update` saga, and + // the instance record isn't updated until that saga + // completes. 
+ test_helpers::instance_wait_for_state_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + nexus_db_model::InstanceState::NoVmm, + ) + .await; test_helpers::instance_delete_by_name( cptestctx, INSTANCE_NAME, diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index a5d9d0a843..b9388a1116 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -11,21 +11,31 @@ use crate::{ Nexus, }; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; +use camino::Utf8Path; use diesel::{ BoolExpressionMethods, ExpressionMethods, QueryDsl, SelectableHelper, }; use futures::future::BoxFuture; +use nexus_db_model::InstanceState; use nexus_db_queries::{ authz, context::OpContext, - db::{datastore::InstanceAndActiveVmm, lookup::LookupPath, DataStore}, + db::{ + datastore::{InstanceAndActiveVmm, InstanceGestalt}, + lookup::LookupPath, + DataStore, + }, }; +use nexus_test_interface::NexusServer; +use nexus_test_utils::start_sled_agent; use nexus_types::identity::Resource; +use omicron_common::api::external::Error; use omicron_common::api::external::NameOrId; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; +use omicron_test_utils::dev::poll; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use sled_agent_client::TestInterfaces as _; use slog::{info, warn, Logger}; -use std::{num::NonZeroU32, sync::Arc}; +use std::{num::NonZeroU32, sync::Arc, time::Duration}; use steno::SagaDag; type ControlPlaneTestContext = @@ -136,6 +146,26 @@ pub(crate) async fn instance_simulate( sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; } +pub(crate) async fn instance_single_step_on_sled( + cptestctx: &ControlPlaneTestContext, + instance_id: &InstanceUuid, + sled_id: &SledUuid, +) { + info!( + &cptestctx.logctx.log, + "Single-stepping simulated instance on sled"; + "instance_id" => %instance_id, + "sled_id" => %sled_id, + ); + let nexus = 
&cptestctx.server.server_context().nexus; + let sa = nexus + .sled_client(sled_id) + .await + .expect("sled must exist to simulate a state change"); + + sa.instance_single_step(instance_id.into_untyped_uuid()).await; +} + pub(crate) async fn instance_simulate_by_name( cptestctx: &ControlPlaneTestContext, name: &str, @@ -188,9 +218,169 @@ pub async fn instance_fetch( db_state } +pub async fn instance_fetch_all( + cptestctx: &ControlPlaneTestContext, + instance_id: InstanceUuid, +) -> InstanceGestalt { + let datastore = cptestctx.server.server_context().nexus.datastore().clone(); + let opctx = test_opctx(&cptestctx); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Read) + .await + .expect("test instance should be present in datastore"); + + let db_state = datastore + .instance_fetch_all(&opctx, &authz_instance) + .await + .expect("test instance's info should be fetchable"); + + info!(&cptestctx.logctx.log, "refetched all instance info from db"; + "instance_id" => %instance_id, + "instance" => ?db_state.instance, + "active_vmm" => ?db_state.active_vmm, + "target_vmm" => ?db_state.target_vmm, + "migration" => ?db_state.migration, + ); + + db_state +} +pub async fn instance_fetch_by_name( + cptestctx: &ControlPlaneTestContext, + name: &str, + project_name: &str, +) -> InstanceAndActiveVmm { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(&cptestctx); + let instance_selector = + nexus_types::external_api::params::InstanceSelector { + project: Some(project_name.to_string().try_into().unwrap()), + instance: name.to_string().try_into().unwrap(), + }; + + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector).unwrap(); + let (_, _, authz_instance, ..) 
= instance_lookup.fetch().await.unwrap(); + + let db_state = datastore + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .expect("test instance's info should be fetchable"); + + info!(&cptestctx.logctx.log, "refetched instance info from db"; + "instance_name" => name, + "project_name" => project_name, + "instance_id" => %authz_instance.id(), + "instance_and_vmm" => ?db_state, + ); + + db_state +} + +pub(crate) async fn instance_wait_for_state( + cptestctx: &ControlPlaneTestContext, + instance_id: InstanceUuid, + desired_state: InstanceState, +) -> InstanceAndActiveVmm { + let opctx = test_opctx(&cptestctx); + let datastore = cptestctx.server.server_context().nexus.datastore(); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Read) + .await + .expect("test instance should be present in datastore"); + instance_poll_state(cptestctx, &opctx, authz_instance, desired_state).await +} + +pub async fn instance_wait_for_state_by_name( + cptestctx: &ControlPlaneTestContext, + name: &str, + project_name: &str, + desired_state: InstanceState, +) -> InstanceAndActiveVmm { + let nexus = &cptestctx.server.server_context().nexus; + let opctx = test_opctx(&cptestctx); + let instance_selector = + nexus_types::external_api::params::InstanceSelector { + project: Some(project_name.to_string().try_into().unwrap()), + instance: name.to_string().try_into().unwrap(), + }; + + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector).unwrap(); + let (_, _, authz_instance, ..) 
= instance_lookup.fetch().await.unwrap(); + + instance_poll_state(cptestctx, &opctx, authz_instance, desired_state).await +} + +async fn instance_poll_state( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + authz_instance: authz::Instance, + desired_state: InstanceState, +) -> InstanceAndActiveVmm { + const MAX_WAIT: Duration = Duration::from_secs(120); + + let datastore = cptestctx.server.server_context().nexus.datastore(); + let log = &cptestctx.logctx.log; + let instance_id = authz_instance.id(); + + info!( + log, + "waiting for instance {instance_id} to transition to {desired_state}..."; + "instance_id" => %instance_id, + ); + let result = poll::wait_for_condition( + || async { + let db_state = datastore + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .map_err(poll::CondCheckError::::Failed)?; + + if db_state.instance.runtime().nexus_state == desired_state { + info!( + log, + "instance {instance_id} transitioned to {desired_state}"; + "instance_id" => %instance_id, + "instance" => ?db_state.instance(), + "active_vmm" => ?db_state.vmm(), + ); + Ok(db_state) + } else { + info!( + log, + "instance {instance_id} has not yet transitioned to {desired_state}"; + "instance_id" => %instance_id, + "instance" => ?db_state.instance(), + "active_vmm" => ?db_state.vmm(), + ); + Err(poll::CondCheckError::::NotYet) + } + }, + &Duration::from_secs(1), + &MAX_WAIT, + ) + .await; + + match result { + Ok(i) => i, + Err(e) => panic!( + "instance {instance_id} did not transition to {desired_state} \ + after {MAX_WAIT:?}: {e}" + ), + } +} + pub async fn no_virtual_provisioning_resource_records_exist( cptestctx: &ControlPlaneTestContext, ) -> bool { + count_virtual_provisioning_resource_records(cptestctx).await == 0 +} + +pub async fn count_virtual_provisioning_resource_records( + cptestctx: &ControlPlaneTestContext, +) -> usize { use nexus_db_queries::db::model::VirtualProvisioningResource; use 
nexus_db_queries::db::schema::virtual_provisioning_resource::dsl; @@ -198,7 +388,7 @@ pub async fn no_virtual_provisioning_resource_records_exist( let conn = datastore.pool_connection_for_tests().await.unwrap(); datastore - .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist") + .transaction_retry_wrapper("count_virtual_provisioning_resource_records") .transaction(&conn, |conn| async move { conn .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL) @@ -212,7 +402,7 @@ pub async fn no_virtual_provisioning_resource_records_exist( .get_results_async::(&conn) .await .unwrap() - .is_empty() + .len() ) }).await.unwrap() } @@ -220,6 +410,14 @@ pub async fn no_virtual_provisioning_resource_records_exist( pub async fn no_virtual_provisioning_collection_records_using_instances( cptestctx: &ControlPlaneTestContext, ) -> bool { + count_virtual_provisioning_collection_records_using_instances(cptestctx) + .await + == 0 +} + +pub async fn count_virtual_provisioning_collection_records_using_instances( + cptestctx: &ControlPlaneTestContext, +) -> usize { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; @@ -228,7 +426,7 @@ pub async fn no_virtual_provisioning_collection_records_using_instances( datastore .transaction_retry_wrapper( - "no_virtual_provisioning_collection_records_using_instances", + "count_virtual_provisioning_collection_records_using_instances", ) .transaction(&conn, |conn| async move { conn.batch_execute_async( @@ -244,12 +442,70 @@ pub async fn no_virtual_provisioning_collection_records_using_instances( .get_results_async::(&conn) .await .unwrap() + .len()) + }) + .await + .unwrap() +} + +pub async fn no_sled_resource_instance_records_exist( + cptestctx: &ControlPlaneTestContext, +) -> bool { + use nexus_db_queries::db::model::SledResource; + use nexus_db_queries::db::model::SledResourceKind; + use 
nexus_db_queries::db::schema::sled_resource::dsl; + + let datastore = cptestctx.server.server_context().nexus.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + datastore + .transaction_retry_wrapper("no_sled_resource_instance_records_exist") + .transaction(&conn, |conn| async move { + conn.batch_execute_async( + nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, + ) + .await + .unwrap(); + + Ok(dsl::sled_resource + .filter(dsl::kind.eq(SledResourceKind::Instance)) + .select(SledResource::as_select()) + .get_results_async::(&conn) + .await + .unwrap() .is_empty()) }) .await .unwrap() } +pub async fn sled_resources_exist_for_vmm( + cptestctx: &ControlPlaneTestContext, + vmm_id: PropolisUuid, +) -> bool { + use nexus_db_queries::db::model::SledResource; + use nexus_db_queries::db::model::SledResourceKind; + use nexus_db_queries::db::schema::sled_resource::dsl; + + let datastore = cptestctx.server.server_context().nexus.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + let results = dsl::sled_resource + .filter(dsl::kind.eq(SledResourceKind::Instance)) + .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) + .select(SledResource::as_select()) + .load_async(&*conn) + .await + .unwrap(); + info!( + cptestctx.logctx.log, + "queried sled reservation records for VMM"; + "vmm_id" => %vmm_id, + "results" => ?results, + ); + !results.is_empty() +} + /// Tests that the saga described by `dag` succeeds if each of its nodes is /// repeated. 
/// @@ -532,3 +788,51 @@ pub(crate) async fn assert_no_failed_undo_steps( assert!(saga_node_events.is_empty()); } + +pub(crate) async fn add_sleds( + cptestctx: &ControlPlaneTestContext, + num_sleds: usize, +) -> Vec<(SledUuid, omicron_sled_agent::sim::Server)> { + let mut sas = Vec::with_capacity(num_sleds); + for _ in 0..num_sleds { + let sa_id = SledUuid::new_v4(); + let log = cptestctx.logctx.log.new(o!("sled_id" => sa_id.to_string())); + let addr = cptestctx.server.get_http_server_internal_address().await; + + info!(&cptestctx.logctx.log, "Adding simulated sled"; "sled_id" => %sa_id); + let update_dir = Utf8Path::new("/should/be/unused"); + let sa = start_sled_agent( + log, + addr, + sa_id, + &update_dir, + omicron_sled_agent::sim::SimMode::Explicit, + ) + .await + .unwrap(); + sas.push((sa_id, sa)); + } + + sas +} + +pub(crate) fn select_first_alternate_sled( + db_vmm: &crate::app::db::model::Vmm, + other_sleds: &[(SledUuid, omicron_sled_agent::sim::Server)], +) -> SledUuid { + let default_sled_uuid: SledUuid = + nexus_test_utils::SLED_AGENT_UUID.parse().unwrap(); + if other_sleds.is_empty() { + panic!("need at least one other sled"); + } + + if other_sleds.iter().any(|sled| sled.0 == default_sled_uuid) { + panic!("default test sled agent was in other_sleds"); + } + + if db_vmm.sled_id == default_sled_uuid.into_untyped_uuid() { + other_sleds[0].0 + } else { + default_sled_uuid + } +} diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 28ff712c24..33b626a7fc 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -177,7 +177,7 @@ impl NexusInternalApi for NexusInternalApiImpl { nexus .notify_instance_updated( &opctx, - &InstanceUuid::from_untyped_uuid(path.instance_id), + InstanceUuid::from_untyped_uuid(path.instance_id), &new_state, ) .await?; diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index e231f665fa..8f65a73204 100644 --- 
a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -124,6 +124,19 @@ v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +# The purpose of the `instance-updater` background task is to ensure that update +# sagas are always *eventually* started for instances whose database state has +# changed, even if the update saga was not started by the Nexus replica handling +# an update from sled-agent. This is to ensure that updates are performed even +# in cases where a Nexus crashes or otherwise disappears between when the +# updated VMM and migration state is written to CRDB and when the resulting +# update saga actually starts executing. However, we would prefer update sagas +# to be executed in a timely manner, so for integration tests, we don't want to +# *rely* on the instance-updater background task for running these sagas. +# +# Therefore, disable the background task during tests. +instance_updater.disable = true +instance_updater.period_secs = 60 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index ded4a346fb..234ab5f382 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -4,6 +4,7 @@ //! 
Tests basic disk support in the API +use super::instances::instance_wait_for_state; use super::metrics::{get_latest_silo_metric, query_for_metrics}; use chrono::Utc; use dropshot::test_util::ClientTestContext; @@ -37,6 +38,7 @@ use omicron_common::api::external::Disk; use omicron_common::api::external::DiskState; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Instance; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; use omicron_nexus::app::{MAX_DISK_SIZE_BYTES, MIN_DISK_SIZE_BYTES}; @@ -236,18 +238,15 @@ async fn test_disk_create_attach_detach_delete( // Create an instance to attach the disk. let instance = create_instance(&client, PROJECT_NAME, INSTANCE_NAME).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); // TODO(https://github.com/oxidecomputer/omicron/issues/811): // // Instances must be stopped before disks can be attached - this // is an artificial limitation without hotplug support. - let instance_next = - set_instance_state(&client, INSTANCE_NAME, "stop").await; - instance_simulate( - nexus, - &InstanceUuid::from_untyped_uuid(instance_next.identity.id), - ) - .await; + set_instance_state(&client, INSTANCE_NAME, "stop").await; + instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Verify that there are no disks attached to the instance, and specifically // that our disk is not attached to this instance. 
@@ -395,6 +394,8 @@ async fn test_disk_slot_assignment(cptestctx: &ControlPlaneTestContext) { let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; + let url_instance_disks = get_instance_disks_url(instance.identity.name.as_str()); let listed_disks = disks_list(&client, &url_instance_disks).await; @@ -504,6 +505,7 @@ async fn test_disk_move_between_instances(cptestctx: &ControlPlaneTestContext) { // is an artificial limitation without hotplug support. set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; // Verify that there are no disks attached to the instance, and specifically // that our disk is not attached to this instance. @@ -541,6 +543,8 @@ async fn test_disk_move_between_instances(cptestctx: &ControlPlaneTestContext) { let instance2_id = InstanceUuid::from_untyped_uuid(instance2.identity.id); set_instance_state(&client, "instance2", "stop").await; instance_simulate(nexus, &instance2_id).await; + instance_wait_for_state(&client, instance2_id, InstanceState::Stopped) + .await; let url_instance2_attach_disk = get_disk_attach_url(&instance2.identity.id.into()); diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index 2789318855..0940c8675b 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -9,6 +9,7 @@ use std::net::Ipv4Addr; use crate::integration_tests::instances::fetch_instance_external_ips; use crate::integration_tests::instances::instance_simulate; +use crate::integration_tests::instances::instance_wait_for_state; use dropshot::test_util::ClientTestContext; use dropshot::HttpErrorResponseBody; use http::Method; 
@@ -47,6 +48,7 @@ use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::IdentityMetadataUpdateParams; use omicron_common::api::external::Instance; use omicron_common::api::external::InstanceCpuCount; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; use omicron_uuid_kinds::GenericUuid; @@ -696,6 +698,7 @@ async fn test_floating_ip_create_attachment( .unwrap(); instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; NexusRequest::object_delete( &client, diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 9c965ccf8a..2e41fac3a4 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -421,8 +421,9 @@ async fn test_instances_create_reboot_halt( let instance = instance_next; instance_simulate(nexus, &instance_id).await; - let instance_next = instance_get(&client, &instance_url).await; - assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + let instance_next = + instance_wait_for_state(client, instance_id, InstanceState::Stopped) + .await; assert!( instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated @@ -516,8 +517,9 @@ async fn test_instances_create_reboot_halt( // assert_eq!(error.message, "cannot reboot instance in state \"stopping\""); let instance = instance_next; instance_simulate(nexus, &instance_id).await; - let instance_next = instance_get(&client, &instance_url).await; - assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + let instance_next = + instance_wait_for_state(client, instance_id, InstanceState::Stopped) + .await; assert!( instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated @@ -629,8 +631,7 @@ async fn test_instance_start_creates_networking_state( 
instance_simulate(nexus, &instance_id).await; instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Forcibly clear the instance's V2P mappings to simulate what happens when // the control plane comes up when an instance is stopped. @@ -837,18 +838,56 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(migration.target_state, MigrationState::Pending.into()); assert_eq!(migration.source_state, MigrationState::Pending.into()); - // Explicitly simulate the migration action on the target. Simulated - // migrations always succeed. The state transition on the target is - // sufficient to move the instance back into a Running state (strictly - // speaking no further updates from the source are required if the target - // successfully takes over). - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; - // Ensure that both sled agents report that the migration has completed. - instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + // Simulate the migration. We will use `instance_single_step_on_sled` to + // single-step both sled-agents through the migration state machine and + // ensure that the migration state looks nice at each step. + instance_simulate_migration_source( + cptestctx, + nexus, + original_sled, + instance_id, + migration_id, + ) + .await; + + // Move source to "migrating". 
+ instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; + instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) .await; + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.source_state, MigrationState::InProgress.into()); + assert_eq!(migration.target_state, MigrationState::Pending.into()); let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Running); + assert_eq!(instance.runtime.run_state, InstanceState::Migrating); + + // Move target to "migrating". + instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + .await; + instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + .await; + + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.source_state, MigrationState::InProgress.into()); + assert_eq!(migration.target_state, MigrationState::InProgress.into()); + let instance = instance_get(&client, &instance_url).await; + assert_eq!(instance.runtime.run_state, InstanceState::Migrating); + + // Move the source to "completed" + instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; + + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.source_state, MigrationState::Completed.into()); + assert_eq!(migration.target_state, MigrationState::InProgress.into()); + let instance = dbg!(instance_get(&client, &instance_url).await); + assert_eq!(instance.runtime.run_state, InstanceState::Migrating); + + // Move the target to "completed". 
+ instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + + instance_wait_for_state(&client, instance_id, InstanceState::Running).await; let current_sled = nexus .instance_sled_id(&instance_id) @@ -973,9 +1012,40 @@ async fn test_instance_migrate_v2p_and_routes( .parsed_body::() .unwrap(); + let migration_id = { + let datastore = apictx.nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance.identity.id) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await + .unwrap(); + datastore + .instance_refetch(&opctx, &authz_instance) + .await + .unwrap() + .runtime_state + .migration_id + .expect("since we've started a migration, the instance record must have a migration id!") + }; + + // Tell both sled-agents to pretend to do the migration. + instance_simulate_migration_source( + cptestctx, + nexus, + original_sled_id, + instance_id, + migration_id, + ) + .await; + instance_simulate_on_sled(cptestctx, nexus, original_sled_id, instance_id) + .await; instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Running); + instance_wait_for_state(&client, instance_id, InstanceState::Running).await; + let current_sled = nexus .instance_sled_id(&instance_id) .await @@ -1186,9 +1256,7 @@ async fn test_instance_metrics(cptestctx: &ControlPlaneTestContext) { instance_post(&client, instance_name, InstanceOp::Stop).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); instance_simulate(nexus, &instance_id).await; - let instance = - instance_get(&client, &get_instance_url(&instance_name)).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; let 
virtual_provisioning_collection = datastore .virtual_provisioning_collection_get(&opctx, project_id) @@ -1328,14 +1396,54 @@ async fn test_instance_metrics_with_migration( .parsed_body::() .unwrap(); + let migration_id = { + let datastore = apictx.nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance.identity.id) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await + .unwrap(); + datastore + .instance_refetch(&opctx, &authz_instance) + .await + .unwrap() + .runtime_state + .migration_id + .expect("since we've started a migration, the instance record must have a migration id!") + }; + + // Wait for the instance to be in the `Migrating` state. Otherwise, the + // subsequent `instance_wait_for_state(..., Running)` may see the `Running` + // state from the *old* VMM, rather than waiting for the migration to + // complete. + instance_simulate_migration_source( + cptestctx, + nexus, + original_sled, + instance_id, + migration_id, + ) + .await; + instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; + instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + .await; + instance_wait_for_state(&client, instance_id, InstanceState::Migrating) + .await; + check_provisioning_state(4, 1).await; // Complete migration on the target. Simulated migrations always succeed. // After this the instance should be running and should continue to appear // to be provisioned. 
+ instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Running); + instance_wait_for_state(&client, instance_id, InstanceState::Running).await; check_provisioning_state(4, 1).await; @@ -1347,9 +1455,7 @@ async fn test_instance_metrics_with_migration( // logical states of instances ignoring migration). instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = - instance_get(&client, &get_instance_url(&instance_name)).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; check_provisioning_state(0, 0).await; } @@ -1449,8 +1555,7 @@ async fn test_instances_delete_fails_when_running_succeeds_when_stopped( // Stop the instance instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; // Now deletion should succeed. 
NexusRequest::object_delete(&client, &instance_url) @@ -2051,6 +2156,7 @@ async fn test_instance_create_delete_network_interface( let instance = instance_post(client, instance_name, InstanceOp::Stop).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Verify we can now make the requests again let mut interfaces = Vec::with_capacity(2); @@ -2120,6 +2226,7 @@ async fn test_instance_create_delete_network_interface( // Stop the instance and verify we can delete the interface instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // We should not be able to delete the primary interface, while the // secondary still exists @@ -2258,6 +2365,7 @@ async fn test_instance_update_network_interfaces( let instance = instance_post(client, instance_name, InstanceOp::Stop).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Create the first interface on the instance. let primary_iface = NexusRequest::objects_post( @@ -2318,6 +2426,8 @@ async fn test_instance_update_network_interfaces( // Stop the instance again, and now verify that the update works. instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; + let updated_primary_iface = NexusRequest::object_put( client, &format!("/v1/network-interfaces/{}", primary_iface.identity.id), @@ -2451,6 +2561,7 @@ async fn test_instance_update_network_interfaces( // Stop the instance again. 
instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Verify that we can set the secondary as the new primary, and that nothing // else changes about the NICs. @@ -3231,8 +3342,7 @@ async fn test_disks_detached_when_instance_destroyed( instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; NexusRequest::object_delete(&client, &instance_url) .authn_as(AuthnMode::PrivilegedUser) @@ -3750,6 +3860,8 @@ async fn test_cannot_provision_instance_beyond_cpu_capacity( instance_simulate(nexus, &instance_id).await; instances[1] = instance_post(client, configs[1].0, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; + expect_instance_start_ok(client, configs[2].0).await; } @@ -3857,6 +3969,8 @@ async fn test_cannot_provision_instance_beyond_ram_capacity( instance_simulate(nexus, &instance_id).await; instances[1] = instance_post(client, configs[1].0, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; + expect_instance_start_ok(client, configs[2].0).await; } @@ -3979,8 +4093,9 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) { let instance = instance_next; instance_simulate(nexus, &instance_id).await; - let instance_next = instance_get(&client, &instance_url).await; - assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + let instance_next = + instance_wait_for_state(&client, instance_id, InstanceState::Stopped) + .await; assert!( 
instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated @@ -4146,12 +4261,10 @@ async fn stop_and_delete_instance( let client = &cptestctx.external_client; let instance = instance_post(&client, instance_name, InstanceOp::Stop).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); let nexus = &cptestctx.server.server_context().nexus; - instance_simulate( - nexus, - &InstanceUuid::from_untyped_uuid(instance.identity.id), - ) - .await; + instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; let url = format!("/v1/instances/{}?project={}", instance_name, PROJECT_NAME); object_delete(client, &url).await; @@ -4577,6 +4690,13 @@ async fn test_instance_create_in_silo(cptestctx: &ControlPlaneTestContext) { .expect("Failed to stop the instance"); instance_simulate_with_opctx(nexus, &instance_id, &opctx).await; + instance_wait_for_state_as( + client, + AuthnMode::SiloUser(user_id), + instance_id, + InstanceState::Stopped, + ) + .await; // Delete the instance NexusRequest::object_delete(client, &instance_url) @@ -4664,6 +4784,7 @@ async fn test_instance_v2p_mappings(cptestctx: &ControlPlaneTestContext) { instance_simulate(nexus, &instance_id).await; instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; let instance_url = get_instance_url(instance_name); NexusRequest::object_delete(client, &instance_url) @@ -4730,6 +4851,73 @@ pub enum InstanceOp { Reboot, } +pub async fn instance_wait_for_state( + client: &ClientTestContext, + instance_id: InstanceUuid, + state: omicron_common::api::external::InstanceState, +) -> Instance { + instance_wait_for_state_as( + client, + AuthnMode::PrivilegedUser, + instance_id, + state, + ) + .await +} + +/// Like [`instance_wait_for_state`], but with an [`AuthnMode`] parameter for +/// the
instance lookup requests. +pub async fn instance_wait_for_state_as( + client: &ClientTestContext, + authn_as: AuthnMode, + instance_id: InstanceUuid, + state: omicron_common::api::external::InstanceState, +) -> Instance { + const MAX_WAIT: Duration = Duration::from_secs(120); + + slog::info!( + &client.client_log, + "waiting for instance {instance_id} to transition to {state}..."; + ); + let url = format!("/v1/instances/{instance_id}"); + let result = wait_for_condition( + || async { + let instance: Instance = NexusRequest::object_get(client, &url) + .authn_as(authn_as.clone()) + .execute() + .await? + .parsed_body()?; + if instance.runtime.run_state == state { + Ok(instance) + } else { + slog::info!( + &client.client_log, + "instance {instance_id} has not transitioned to {state}"; + "instance_id" => %instance.identity.id, + "instance_runtime_state" => ?instance.runtime, + ); + Err(CondCheckError::::NotYet) + } + }, + &Duration::from_secs(1), + &MAX_WAIT, + ) + .await; + match result { + Ok(instance) => { + slog::info!( + &client.client_log, + "instance {instance_id} has transitioned to {state}" + ); + instance + } + Err(e) => panic!( + "instance {instance_id} did not transition to {state:?} \ + after {MAX_WAIT:?}: {e}" + ), + } +} + pub async fn instance_post( client: &ClientTestContext, instance_name: &str, @@ -4896,6 +5084,22 @@ pub async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { sa.instance_finish_transition(id.into_untyped_uuid()).await; } +/// Simulate one step of an ongoing instance state transition. To do this, we +/// have to look up the instance, then get the sled agent associated with that +/// instance, and then tell it to finish simulating whatever async transition is +/// going on. 
+async fn instance_single_step_on_sled( + cptestctx: &ControlPlaneTestContext, + nexus: &Arc, + sled_id: SledUuid, + instance_id: InstanceUuid, +) { + info!(&cptestctx.logctx.log, "Single-stepping simulated instance on sled"; + "instance_id" => %instance_id, "sled_id" => %sled_id); + let sa = nexus.sled_client(&sled_id).await.unwrap(); + sa.instance_single_step(instance_id.into_untyped_uuid()).await; +} + pub async fn instance_simulate_with_opctx( nexus: &Arc, id: &InstanceUuid, @@ -4923,3 +5127,30 @@ async fn instance_simulate_on_sled( let sa = nexus.sled_client(&sled_id).await.unwrap(); sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; } + +/// Simulates a migration source for the provided instance ID, sled ID, and +/// migration ID. +async fn instance_simulate_migration_source( + cptestctx: &ControlPlaneTestContext, + nexus: &Arc, + sled_id: SledUuid, + instance_id: InstanceUuid, + migration_id: Uuid, +) { + info!( + &cptestctx.logctx.log, + "Simulating migration source sled"; + "instance_id" => %instance_id, + "sled_id" => %sled_id, + "migration_id" => %migration_id, + ); + let sa = nexus.sled_client(&sled_id).await.unwrap(); + sa.instance_simulate_migration_source( + instance_id.into_untyped_uuid(), + sled_agent_client::SimulateMigrationSource { + migration_id, + result: sled_agent_client::SimulatedMigrationResult::Success, + }, + ) + .await; +} diff --git a/nexus/tests/integration_tests/ip_pools.rs b/nexus/tests/integration_tests/ip_pools.rs index d044eb735c..e872cc6fe3 100644 --- a/nexus/tests/integration_tests/ip_pools.rs +++ b/nexus/tests/integration_tests/ip_pools.rs @@ -6,6 +6,7 @@ use std::net::Ipv4Addr; +use crate::integration_tests::instances::instance_wait_for_state; use dropshot::test_util::ClientTestContext; use dropshot::HttpErrorResponseBody; use dropshot::ResultsPage; @@ -54,6 +55,7 @@ use nexus_types::external_api::views::SiloIpPool; use nexus_types::identity::Resource; use omicron_common::address::Ipv6Range; use 
omicron_common::api::external::IdentityMetadataUpdateParams; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::LookupType; use omicron_common::api::external::NameOrId; use omicron_common::api::external::SimpleIdentity; @@ -1348,6 +1350,7 @@ async fn test_ip_range_delete_with_allocated_external_ip_fails( .unwrap() .expect("running instance should be on a sled"); sa.instance_finish_transition(instance.identity.id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Delete the instance NexusRequest::object_delete(client, &instance_url) diff --git a/nexus/tests/integration_tests/pantry.rs b/nexus/tests/integration_tests/pantry.rs index 29e590b1a9..d77ad49db6 100644 --- a/nexus/tests/integration_tests/pantry.rs +++ b/nexus/tests/integration_tests/pantry.rs @@ -4,6 +4,7 @@ //! Tests Nexus' interactions with Crucible's pantry +use crate::integration_tests::instances::instance_wait_for_state; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; @@ -24,6 +25,7 @@ use omicron_common::api::external::Disk; use omicron_common::api::external::DiskState; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Instance; +use omicron_common::api::external::InstanceState; use omicron_nexus::Nexus; use omicron_nexus::TestInterfaces as _; use omicron_uuid_kinds::GenericUuid; @@ -157,6 +159,7 @@ async fn create_instance_and_attach_disk( // is an artificial limitation without hotplug support. 
set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; let url_instance_attach_disk = get_disk_attach_url(instance.identity.name.as_str()); diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index 987e8146de..91e0136960 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -25,6 +25,7 @@ use nexus_test_utils::resource_helpers::create_default_ip_pool; use nexus_test_utils::resource_helpers::create_disk; use nexus_test_utils::resource_helpers::create_project; use nexus_test_utils::resource_helpers::object_create; +use nexus_test_utils::SLED_AGENT_UUID; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::views; @@ -1650,3 +1651,64 @@ async fn test_region_allocation_for_snapshot( assert_eq!(allocated_regions.len(), 2); } + +#[nexus_test] +async fn test_snapshot_expunge(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create three 10 GiB zpools, each with one dataset. 
+ let _disk_test = DiskTest::new(&cptestctx).await; + + // Assert default is still 10 GiB + assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + + // Create a disk, then a snapshot of that disk + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); + + let snapshot: views::Snapshot = object_create( + client, + &snapshots_url, + &params::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: "snapshot".parse().unwrap(), + description: String::from("a snapshot"), + }, + disk: disk.identity.name.into(), + }, + ) + .await; + + // Expunge the sled + let int_client = &cptestctx.internal_client; + int_client + .make_request( + Method::POST, + "/sleds/expunge", + Some(params::SledSelector { + sled: SLED_AGENT_UUID.parse().unwrap(), + }), + StatusCode::OK, + ) + .await + .unwrap(); + + // All three region snapshots should be returned + let expunged_region_snapshots = datastore + .find_region_snapshots_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + assert_eq!(expunged_region_snapshots.len(), 3); + + for expunged_region_snapshot in expunged_region_snapshots { + assert_eq!(expunged_region_snapshot.snapshot_id, snapshot.identity.id); + } +} diff --git a/nexus/tests/integration_tests/vpc_subnets.rs b/nexus/tests/integration_tests/vpc_subnets.rs index b12c43aecc..f063c7e9a2 100644 --- a/nexus/tests/integration_tests/vpc_subnets.rs +++ b/nexus/tests/integration_tests/vpc_subnets.rs @@ -4,6 +4,7 @@ use crate::integration_tests::instances::instance_post; use crate::integration_tests::instances::instance_simulate; +use crate::integration_tests::instances::instance_wait_for_state; use crate::integration_tests::instances::InstanceOp; use dropshot::HttpErrorResponseBody; use http::method::Method; @@ -20,6 +21,7 @@ use nexus_test_utils_macros::nexus_test; use
nexus_types::external_api::{params, views::VpcSubnet}; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::IdentityMetadataUpdateParams; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::Ipv6NetExt; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -80,6 +82,7 @@ async fn test_delete_vpc_subnet_with_interfaces_fails( // Stop and then delete the instance instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(&nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; NexusRequest::object_delete(&client, &instance_url) .authn_as(AuthnMode::PrivilegedUser) .execute() diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index a5feff067a..1af3636d0e 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -87,6 +87,10 @@ impl PlanningInput { &self.cockroachdb_settings } + pub fn target_boundary_ntp_zone_count(&self) -> usize { + self.policy.target_boundary_ntp_zone_count + } + pub fn target_nexus_zone_count(&self) -> usize { self.policy.target_nexus_zone_count } @@ -692,6 +696,9 @@ pub struct Policy { /// services (e.g., external DNS, Nexus, boundary NTP) pub service_ip_pool_ranges: Vec, + /// desired total number of deployed Boundary NTP zones + pub target_boundary_ntp_zone_count: usize, + /// desired total number of deployed Nexus zones pub target_nexus_zone_count: usize, @@ -749,6 +756,7 @@ impl PlanningInputBuilder { PlanningInput { policy: Policy { service_ip_pool_ranges: Vec::new(), + target_boundary_ntp_zone_count: 0, target_nexus_zone_count: 0, target_cockroachdb_zone_count: 0, target_cockroachdb_cluster_version: diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index cfbe028f9c..78f606b2f2 100644 --- a/openapi/nexus-internal.json +++ 
b/openapi/nexus-internal.json @@ -2591,139 +2591,8 @@ ] }, "DatasetKind": { - "description": "Describes the purpose of the dataset.", - "oneOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "cockroachdb" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "crucible" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "clickhouse" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "clickhouse_keeper" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "external_dns" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "internal_dns" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "zone_root" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "type": { - "type": "string", - "enum": [ - "zone" - ] - } - }, - "required": [ - "name", - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "debug" - ] - } - }, - "required": [ - "type" - ] - } - ] + "description": "The kind of dataset. 
See the `DatasetKind` enum in omicron-common for possible values.", + "type": "string" }, "DatasetPutRequest": { "description": "Describes a dataset within a pool.", @@ -3306,53 +3175,6 @@ } ] }, - "InstanceRuntimeState": { - "description": "The dynamic runtime properties of an instance: its current VMM ID (if any), migration information (if any), and the instance state to report if there is no active VMM.", - "type": "object", - "properties": { - "dst_propolis_id": { - "nullable": true, - "description": "If a migration is active, the ID of the target VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, - "gen": { - "description": "Generation number for this state.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - }, - "migration_id": { - "nullable": true, - "description": "If a migration is active, the ID of that migration.", - "type": "string", - "format": "uuid" - }, - "propolis_id": { - "nullable": true, - "description": "The instance's currently active VMM ID.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, - "time_updated": { - "description": "Timestamp for this information.", - "type": "string", - "format": "date-time" - } - }, - "required": [ - "gen", - "time_updated" - ] - }, "IpNet": { "x-rust-type": { "crate": "oxnet", @@ -3594,24 +3416,6 @@ "minLength": 5, "maxLength": 17 }, - "MigrationRole": { - "oneOf": [ - { - "description": "This update concerns the source VMM of a migration.", - "type": "string", - "enum": [ - "source" - ] - }, - { - "description": "This update concerns the target VMM of a migration.", - "type": "string", - "enum": [ - "target" - ] - } - ] - }, "MigrationRuntimeState": { "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", "type": "object", @@ -3623,9 +3427,6 @@ "type": "string", "format": "uuid" }, - "role": { - "$ref": 
"#/components/schemas/MigrationRole" - }, "state": { "$ref": "#/components/schemas/MigrationState" }, @@ -3638,7 +3439,6 @@ "required": [ "gen", "migration_id", - "role", "state", "time_updated" ] @@ -4839,17 +4639,18 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - "instance_state": { - "description": "The sled's conception of the state of the instance.", + "migration_in": { + "nullable": true, + "description": "The current state of any inbound migration to this VMM.", "allOf": [ { - "$ref": "#/components/schemas/InstanceRuntimeState" + "$ref": "#/components/schemas/MigrationRuntimeState" } ] }, - "migration_state": { + "migration_out": { "nullable": true, - "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "description": "The state of any outbound migration from this VMM.", "allOf": [ { "$ref": "#/components/schemas/MigrationRuntimeState" @@ -4874,7 +4675,6 @@ } }, "required": [ - "instance_state", "propolis_id", "vmm_state" ] diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 4329a72bff..5f6c331633 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -475,49 +475,6 @@ } } }, - "/instances/{instance_id}/migration-ids": { - "put": { - "operationId": "instance_put_migration_ids", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstancePutMigrationIdsBody" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SledInstanceState" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" 
- }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/instances/{instance_id}/state": { "get": { "operationId": "instance_get_state", @@ -2205,139 +2162,8 @@ ] }, "DatasetKind": { - "description": "Describes the purpose of the dataset.", - "oneOf": [ - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "cockroachdb" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "crucible" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "clickhouse" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "clickhouse_keeper" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "external_dns" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "internal_dns" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "zone_root" - ] - } - }, - "required": [ - "type" - ] - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "type": { - "type": "string", - "enum": [ - "zone" - ] - } - }, - "required": [ - "name", - "type" - ] - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "debug" - ] - } - }, - "required": [ - "type" - ] - } - ] + "description": "The kind of dataset. 
See the `DatasetKind` enum in omicron-common for possible values.", + "type": "string" }, "DatasetManagementStatus": { "description": "Identifies how a single dataset management operation may have succeeded or failed.", @@ -3368,23 +3194,6 @@ "silo_id" ] }, - "InstanceMigrationSourceParams": { - "description": "Instance runtime state to update for a migration.", - "type": "object", - "properties": { - "dst_propolis_id": { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - }, - "migration_id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "dst_propolis_id", - "migration_id" - ] - }, "InstanceMigrationTargetParams": { "description": "Parameters used when directing Propolis to initialize itself via live migration.", "type": "object", @@ -3429,32 +3238,6 @@ "ncpus" ] }, - "InstancePutMigrationIdsBody": { - "description": "The body of a request to set or clear the migration identifiers from a sled agent's instance state records.", - "type": "object", - "properties": { - "migration_params": { - "nullable": true, - "description": "The migration identifiers to set. If `None`, this operation clears the migration IDs.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceMigrationSourceParams" - } - ] - }, - "old_runtime": { - "description": "The last instance runtime state known to this requestor. This request will succeed if either (a) the state generation in the sled agent's runtime state matches the generation in this record, or (b) the sled agent's runtime state matches what would result from applying this request to the caller's runtime state. 
This latter condition provides idempotency.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceRuntimeState" - } - ] - } - }, - "required": [ - "old_runtime" - ] - }, "InstancePutStateBody": { "description": "The body of a request to move a previously-ensured instance into a specific runtime state.", "type": "object", @@ -3875,24 +3658,6 @@ "minLength": 5, "maxLength": 17 }, - "MigrationRole": { - "oneOf": [ - { - "description": "This update concerns the source VMM of a migration.", - "type": "string", - "enum": [ - "source" - ] - }, - { - "description": "This update concerns the target VMM of a migration.", - "type": "string", - "enum": [ - "target" - ] - } - ] - }, "MigrationRuntimeState": { "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", "type": "object", @@ -3904,9 +3669,6 @@ "type": "string", "format": "uuid" }, - "role": { - "$ref": "#/components/schemas/MigrationRole" - }, "state": { "$ref": "#/components/schemas/MigrationState" }, @@ -3919,7 +3681,6 @@ "required": [ "gen", "migration_id", - "role", "state", "time_updated" ] @@ -4990,17 +4751,18 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - "instance_state": { - "description": "The sled's conception of the state of the instance.", + "migration_in": { + "nullable": true, + "description": "The current state of any inbound migration to this VMM.", "allOf": [ { - "$ref": "#/components/schemas/InstanceRuntimeState" + "$ref": "#/components/schemas/MigrationRuntimeState" } ] }, - "migration_state": { + "migration_out": { "nullable": true, - "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "description": "The state of any outbound migration from this VMM.", "allOf": [ { "$ref": "#/components/schemas/MigrationRuntimeState" @@ 
-5025,7 +4787,6 @@ } }, "required": [ - "instance_state", "propolis_id", "vmm_state" ] diff --git a/oximeter/impl/src/schema/codegen.rs b/oximeter/impl/src/schema/codegen.rs index ef686c3cdd..d433441718 100644 --- a/oximeter/impl/src/schema/codegen.rs +++ b/oximeter/impl/src/schema/codegen.rs @@ -559,7 +559,10 @@ impl quote::ToTokens for TimeseriesSchema { let created = quote_creation_time(self.created); let toks = quote! { ::oximeter::schema::TimeseriesSchema { - timeseries_name: ::oximeter::TimeseriesName::try_from(#timeseries_name).unwrap(), + timeseries_name: + <::oximeter::TimeseriesName as ::std::convert::TryFrom<&str>>::try_from( + #timeseries_name + ).unwrap(), description: ::oximeter::schema::TimeseriesDescription { target: String::from(#target_description), metric: String::from(#metric_description), diff --git a/oximeter/oximeter/schema/switch-data-link.toml b/oximeter/oximeter/schema/switch-data-link.toml index fa10759ca9..d6744e8c7f 100644 --- a/oximeter/oximeter/schema/switch-data-link.toml +++ b/oximeter/oximeter/schema/switch-data-link.toml @@ -89,6 +89,15 @@ versions = [ { added_in = 1, fields = [ "port_id", "link_id" ] } ] +[[metrics]] +name = "link_enabled" +description = "Reports whether the link is currently enabled" +units = "none" +datum_type = "bool" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + [[metrics]] name = "link_fsm" description = """\ @@ -164,6 +173,60 @@ versions = [ { added_in = 1, fields = [ "port_id", "link_id" ] } ] +[[metrics]] +name = "pcs_sync_loss" +description = "Total number of times PCS sync was lost on the data link" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "fec_high_symbol_errors" +description = "FEC symbol error threshold exceeded" +units = "none" +datum_type = "bool" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "fec_sync_aligned" +description = 
"All lanes synchronized and aligned" +units = "none" +datum_type = "bool" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "fec_corrected_blocks" +description = "Total number of FEC blocks that were corrected" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "fec_uncorrected_blocks" +description = "Total number of FEC blocks that were uncorrected" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "link_id" ] } +] + +[[metrics]] +name = "fec_symbol_errors" +description = "Total number of FEC symbol errors" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "port_id", "lane", "link_id" ] } +] + [fields.rack_id] type = "uuid" description = "ID of the rack the link's switch is in" @@ -204,6 +267,10 @@ description = "Serial number of the switch the link is on" type = "string" description = "Physical switch port the link is on" +[fields.lane] +type = "u8" +description = "Lane (Tx/Rx pair) within a single link" + [fields.link_id] type = "u8" description = "ID of the link within its switch port" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index d70b170d3a..e3dc6ba131 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] # We choose a specific toolchain (rather than "stable") for repeatability. The # intent is to keep this up-to-date with recently-released stable Rust. 
-channel = "1.80.0" +channel = "1.80.1" profile = "default" diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index bb3d3c8fd1..0fccc6dd03 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4055,6 +4055,64 @@ CREATE INDEX IF NOT EXISTS lookup_any_disk_by_volume_id ON omicron.public.disk ( CREATE INDEX IF NOT EXISTS lookup_snapshot_by_destination_volume_id ON omicron.public.snapshot ( destination_volume_id ); +CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_state AS ENUM ( + 'requested', + 'allocating', + 'replacement_done', + 'deleting_old_volume', + 'running', + 'complete' +); + +CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( + id UUID PRIMARY KEY, + + request_time TIMESTAMPTZ NOT NULL, + + old_dataset_id UUID NOT NULL, + old_region_id UUID NOT NULL, + old_snapshot_id UUID NOT NULL, + + old_snapshot_volume_id UUID, + + new_region_id UUID, + + replacement_state omicron.public.region_snapshot_replacement_state NOT NULL, + + operating_saga_id UUID +); + +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_by_state on omicron.public.region_snapshot_replacement (replacement_state); + +CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_step_state AS ENUM ( + 'requested', + 'running', + 'complete', + 'volume_deleted' +); + +CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement_step ( + id UUID PRIMARY KEY, + + request_id UUID NOT NULL, + + request_time TIMESTAMPTZ NOT NULL, + + volume_id UUID NOT NULL, + + old_snapshot_volume_id UUID, + + replacement_state omicron.public.region_snapshot_replacement_step_state NOT NULL, + + operating_saga_id UUID +); + +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_step_by_state + on omicron.public.region_snapshot_replacement_step (replacement_state); + +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_step_by_old_volume_id + on omicron.public.region_snapshot_replacement_step 
(old_snapshot_volume_id); + /* * Metadata for the schema itself. This version number isn't great, as there's * nothing to ensure it gets bumped when it should be, but it's a start. @@ -4192,7 +4250,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '87.0.0', NULL) + (TRUE, NOW(), NOW(), '88.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/snapshot-replacement/up01.sql b/schema/crdb/snapshot-replacement/up01.sql new file mode 100644 index 0000000000..bb2a2af57f --- /dev/null +++ b/schema/crdb/snapshot-replacement/up01.sql @@ -0,0 +1,8 @@ +CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_state AS ENUM ( + 'requested', + 'allocating', + 'replacement_done', + 'deleting_old_volume', + 'running', + 'complete' +); diff --git a/schema/crdb/snapshot-replacement/up02.sql b/schema/crdb/snapshot-replacement/up02.sql new file mode 100644 index 0000000000..61e37c91cc --- /dev/null +++ b/schema/crdb/snapshot-replacement/up02.sql @@ -0,0 +1,17 @@ +CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( + id UUID PRIMARY KEY, + + request_time TIMESTAMPTZ NOT NULL, + + old_dataset_id UUID NOT NULL, + old_region_id UUID NOT NULL, + old_snapshot_id UUID NOT NULL, + + old_snapshot_volume_id UUID, + + new_region_id UUID, + + replacement_state omicron.public.region_snapshot_replacement_state NOT NULL, + + operating_saga_id UUID +); diff --git a/schema/crdb/snapshot-replacement/up03.sql b/schema/crdb/snapshot-replacement/up03.sql new file mode 100644 index 0000000000..db86e66ef4 --- /dev/null +++ b/schema/crdb/snapshot-replacement/up03.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_by_state on omicron.public.region_snapshot_replacement (replacement_state); diff --git a/schema/crdb/snapshot-replacement/up04.sql b/schema/crdb/snapshot-replacement/up04.sql new file mode 100644 index 0000000000..3640aae8c9 --- /dev/null +++ 
b/schema/crdb/snapshot-replacement/up04.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_step_state AS ENUM ( + 'requested', + 'running', + 'complete', + 'volume_deleted' +); diff --git a/schema/crdb/snapshot-replacement/up05.sql b/schema/crdb/snapshot-replacement/up05.sql new file mode 100644 index 0000000000..6afb623239 --- /dev/null +++ b/schema/crdb/snapshot-replacement/up05.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement_step ( + id UUID PRIMARY KEY, + + request_id UUID NOT NULL, + + request_time TIMESTAMPTZ NOT NULL, + + volume_id UUID NOT NULL, + + old_snapshot_volume_id UUID, + + replacement_state omicron.public.region_snapshot_replacement_step_state NOT NULL, + + operating_saga_id UUID +); diff --git a/schema/crdb/snapshot-replacement/up06.sql b/schema/crdb/snapshot-replacement/up06.sql new file mode 100644 index 0000000000..a0701694b0 --- /dev/null +++ b/schema/crdb/snapshot-replacement/up06.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_step_by_state + on omicron.public.region_snapshot_replacement_step (replacement_state); diff --git a/schema/crdb/snapshot-replacement/up07.sql b/schema/crdb/snapshot-replacement/up07.sql new file mode 100644 index 0000000000..cff835be78 --- /dev/null +++ b/schema/crdb/snapshot-replacement/up07.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_step_by_old_volume_id + on omicron.public.region_snapshot_replacement_step (old_snapshot_volume_id); diff --git a/schema/omicron-datasets.json b/schema/omicron-datasets.json index b675432172..8b4bf59ae9 100644 --- a/schema/omicron-datasets.json +++ b/schema/omicron-datasets.json @@ -75,139 +75,8 @@ } }, "DatasetKind": { - "description": "Describes the purpose of the dataset.", - "oneOf": [ - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "cockroachdb" - ] - } - } - 
}, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "crucible" - ] - } - } - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "clickhouse" - ] - } - } - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "clickhouse_keeper" - ] - } - } - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "external_dns" - ] - } - } - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "internal_dns" - ] - } - } - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "zone_root" - ] - } - } - }, - { - "type": "object", - "required": [ - "name", - "type" - ], - "properties": { - "name": { - "type": "string" - }, - "type": { - "type": "string", - "enum": [ - "zone" - ] - } - } - }, - { - "type": "object", - "required": [ - "type" - ], - "properties": { - "type": { - "type": "string", - "enum": [ - "debug" - ] - } - } - } - ] + "description": "The kind of dataset. 
See the `DatasetKind` enum in omicron-common for possible values.", + "type": "string" }, "DatasetName": { "type": "object", diff --git a/sled-agent/src/bin/zone-bundle.rs b/sled-agent/src/bin/zone-bundle.rs index 82433edaf5..e420644b52 100644 --- a/sled-agent/src/bin/zone-bundle.rs +++ b/sled-agent/src/bin/zone-bundle.rs @@ -246,54 +246,25 @@ async fn fetch_underlay_address() -> anyhow::Result { return Ok(Ipv6Addr::LOCALHOST); #[cfg(target_os = "illumos")] { + use illumos_utils::ipadm::Ipadm; + use std::net::IpAddr; const EXPECTED_ADDR_OBJ: &str = "underlay0/sled6"; - let output = Command::new("ipadm") - .arg("show-addr") - .arg("-p") - .arg("-o") - .arg("addr") - .arg(EXPECTED_ADDR_OBJ) - .output() - .await?; - // If we failed because there was no such interface, then fall back to - // localhost. - if !output.status.success() { - match std::str::from_utf8(&output.stderr) { - Err(_) => bail!( - "ipadm command failed unexpectedly, stderr:\n{}", - String::from_utf8_lossy(&output.stderr) + match Ipadm::addrobj_addr(EXPECTED_ADDR_OBJ) { + // If we failed because there was no such interface, then fall back + // to localhost. 
+ Ok(None) => Ok(Ipv6Addr::LOCALHOST), + Ok(Some(addr)) => match addr.addr() { + IpAddr::V6(ipv6) => Ok(ipv6), + IpAddr::V4(ipv4) => bail!( + "Unexpectedly got IPv4 address for {}: {}", + EXPECTED_ADDR_OBJ, + ipv4 ), - Ok(out) => { - if out.contains("Address object not found") { - eprintln!( - "Expected addrobj '{}' not found, using localhost", - EXPECTED_ADDR_OBJ, - ); - return Ok(Ipv6Addr::LOCALHOST); - } else { - bail!( - "ipadm subcommand failed unexpectedly, stderr:\n{}", - String::from_utf8_lossy(&output.stderr), - ); - } - } - } + }, + Err(e) => bail!( + "failed to get address for addrobj {EXPECTED_ADDR_OBJ}: {e}", + ), } - let out = std::str::from_utf8(&output.stdout) - .context("non-UTF8 output in ipadm")?; - let lines: Vec<_> = out.trim().lines().collect(); - anyhow::ensure!( - lines.len() == 1, - "No addresses or more than one address on expected interface '{}'", - EXPECTED_ADDR_OBJ - ); - lines[0] - .trim() - .split_once('/') - .context("expected a /64 subnet")? - .0 - .parse() - .context("invalid IPv6 address") } } diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 0fe2e27698..adbeb9158f 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -4,26 +4,26 @@ //! Describes the states of VM instances. 
-use crate::params::InstanceMigrationSourceParams; use chrono::{DateTime, Utc}; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, MigrationRole, MigrationRuntimeState, MigrationState, - SledInstanceState, VmmRuntimeState, VmmState, + MigrationRuntimeState, MigrationState, SledInstanceState, VmmRuntimeState, + VmmState, }; use omicron_uuid_kinds::PropolisUuid; use propolis_client::types::{ - InstanceState as PropolisApiState, InstanceStateMonitorResponse, - MigrationState as PropolisMigrationState, + InstanceMigrationStatus, InstanceState as PropolisApiState, + InstanceStateMonitorResponse, MigrationState as PropolisMigrationState, }; +use uuid::Uuid; /// The instance and VMM state that sled agent maintains on a per-VMM basis. #[derive(Clone, Debug)] pub struct InstanceStates { - instance: InstanceRuntimeState, vmm: VmmRuntimeState, propolis_id: PropolisUuid, - migration: Option, + migration_in: Option, + migration_out: Option, } /// Newtype to allow conversion from Propolis API states (returned by the @@ -101,9 +101,8 @@ pub(crate) struct ObservedPropolisState { /// The state reported by Propolis's instance state monitor API. pub vmm_state: PropolisInstanceState, - /// Information about whether the state observer queried migration status at - /// all and, if so, what response it got from Propolis. - pub migration_status: ObservedMigrationStatus, + pub migration_in: Option, + pub migration_out: Option, /// The approximate time at which this observation was made. pub time: DateTime, @@ -111,68 +110,43 @@ pub(crate) struct ObservedPropolisState { impl ObservedPropolisState { /// Constructs a Propolis state observation from an instance's current - /// runtime state and an instance state monitor response received from + /// state and an instance state monitor response received from /// Propolis. 
- pub fn new( - instance_runtime: &InstanceRuntimeState, - propolis_state: &InstanceStateMonitorResponse, - ) -> Self { - // If there's no migration currently registered with this sled, report - // the current state and that no migration is currently in progress, - // even if Propolis has some migration data to share. (This case arises - // when Propolis returns state from a previous migration that sled agent - // has already retired.) - // - // N.B. This needs to be read from the instance runtime state and not - // the migration runtime state to ensure that, once a migration in - // completes, the "completed" observation is reported to - // `InstanceStates::apply_propolis_observation` exactly once. - // Otherwise that routine will try to apply the "inbound migration - // complete" instance state transition twice. - let Some(migration_id) = instance_runtime.migration_id else { - return Self { - vmm_state: PropolisInstanceState(propolis_state.state), - migration_status: ObservedMigrationStatus::NoMigration, - time: Utc::now(), - }; - }; - - // Sled agent believes a live migration may be in progress. See if - // either of the Propolis migrations corresponds to it. - let propolis_migration = match ( - &propolis_state.migration.migration_in, - &propolis_state.migration.migration_out, - ) { - (Some(inbound), _) if inbound.id == migration_id => inbound, - (_, Some(outbound)) if outbound.id == migration_id => outbound, - _ => { - // Sled agent believes this instance should be migrating, but - // Propolis isn't reporting a matching migration yet, so assume - // the migration is still pending. 
- return Self { - vmm_state: PropolisInstanceState(propolis_state.state), - migration_status: ObservedMigrationStatus::Pending, - time: Utc::now(), - }; - } - }; - + pub fn new(propolis_state: &InstanceStateMonitorResponse) -> Self { Self { vmm_state: PropolisInstanceState(propolis_state.state), - migration_status: match propolis_migration.state { - PropolisMigrationState::Finish => { - ObservedMigrationStatus::Succeeded - } - PropolisMigrationState::Error => { - ObservedMigrationStatus::Failed - } - _ => ObservedMigrationStatus::InProgress, - }, + migration_in: propolis_state + .migration + .migration_in + .as_ref() + .map(ObservedMigrationState::from), + migration_out: propolis_state + .migration + .migration_out + .as_ref() + .map(ObservedMigrationState::from), time: Utc::now(), } } } +#[derive(Copy, Clone, Debug)] +pub struct ObservedMigrationState { + state: MigrationState, + id: Uuid, +} + +impl From<&'_ InstanceMigrationStatus> for ObservedMigrationState { + fn from(observed: &InstanceMigrationStatus) -> Self { + let state = match observed.state { + PropolisMigrationState::Error => MigrationState::Failed, + PropolisMigrationState::Finish => MigrationState::Completed, + _ => MigrationState::InProgress, + }; + Self { state, id: observed.id } + } +} + /// The set of instance states that sled agent can publish to Nexus. This is /// a subset of the instance states Nexus knows about: the Creating and /// Destroyed states are reserved for Nexus to use for instances that are being @@ -191,20 +165,6 @@ impl From for VmmState { } } -/// The possible roles a VMM can have vis-a-vis an instance. -#[derive(Clone, Copy, Debug, PartialEq)] -enum PropolisRole { - /// The VMM is its instance's current active VMM. - Active, - - /// The VMM is its instance's migration target VMM. - MigrationTarget, - - /// The instance does not refer to this VMM (but it may have done so in the - /// past). - Retired, -} - /// Action to be taken on behalf of state transition. 
#[derive(Clone, Copy, Debug, PartialEq)] pub enum Action { @@ -214,30 +174,20 @@ pub enum Action { impl InstanceStates { pub fn new( - instance: InstanceRuntimeState, vmm: VmmRuntimeState, propolis_id: PropolisUuid, + migration_id: Option, ) -> Self { - let migration = instance.migration_id.map(|migration_id| { - let dst_propolis_id = instance.dst_propolis_id.expect("if an instance has a migration ID, it should also have a target VMM ID"); - let role = if dst_propolis_id == propolis_id { - MigrationRole::Target - } else { - MigrationRole::Source - }; - MigrationRuntimeState { + // If this instance is created with a migration ID, we are the intended + // target of a migration in. Set that up now. + let migration_in = + migration_id.map(|migration_id| MigrationRuntimeState { migration_id, - state: MigrationState::InProgress, - role, + state: MigrationState::Pending, gen: Generation::new(), time_updated: Utc::now(), - } - }); - InstanceStates { instance, vmm, propolis_id, migration } - } - - pub fn instance(&self) -> &InstanceRuntimeState { - &self.instance + }); + InstanceStates { vmm, propolis_id, migration_in, migration_out: None } } pub fn vmm(&self) -> &VmmRuntimeState { @@ -248,8 +198,12 @@ impl InstanceStates { self.propolis_id } - pub(crate) fn migration(&self) -> Option<&MigrationRuntimeState> { - self.migration.as_ref() + pub fn migration_in(&self) -> Option<&MigrationRuntimeState> { + self.migration_in.as_ref() + } + + pub fn migration_out(&self) -> Option<&MigrationRuntimeState> { + self.migration_out.as_ref() } /// Creates a `SledInstanceState` structure containing the entirety of this @@ -257,28 +211,10 @@ impl InstanceStates { /// use the `instance` or `vmm` accessors instead. 
pub fn sled_instance_state(&self) -> SledInstanceState { SledInstanceState { - instance_state: self.instance.clone(), vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, - migration_state: self.migration.clone(), - } - } - - fn transition_migration( - &mut self, - state: MigrationState, - time_updated: DateTime, - ) { - let migration = self.migration.as_mut().expect( - "an ObservedMigrationState should only be constructed when the \ - VMM has an active migration", - ); - // Don't generate spurious state updates if the migration is already in - // the state we're transitioning to. - if migration.state != state { - migration.state = state; - migration.time_updated = time_updated; - migration.gen = migration.gen.next(); + migration_in: self.migration_in.clone(), + migration_out: self.migration_out.clone(), } } @@ -288,6 +224,52 @@ impl InstanceStates { &mut self, observed: &ObservedPropolisState, ) -> Option { + fn transition_migration( + current: &mut Option, + ObservedMigrationState { id, state }: ObservedMigrationState, + now: DateTime, + ) { + if let Some(ref mut m) = current { + // Don't generate spurious state updates if the migration is already in + // the state we're transitioning to. + if m.migration_id == id && m.state == state { + return; + } + m.state = state; + if m.migration_id == id { + m.gen = m.gen.next(); + } else { + m.migration_id = id; + m.gen = Generation::new().next(); + } + m.time_updated = now; + } else { + *current = Some(MigrationRuntimeState { + migration_id: id, + // We are creating a new migration record, but the state + // will not be `Pending`, because we've actually gotten a + // migration observation from Propolis. Therefore, we have + // to advance the initial generation once to be ahead of + // what the generation in the database is when Nexus creates + // the initial migration record at generation 1. 
+ gen: Generation::new().next(), + state, + time_updated: now, + }); + } + } + + fn destroy_migration( + migration: &mut MigrationRuntimeState, + now: DateTime, + ) { + if !migration.state.is_terminal() { + migration.gen = migration.gen.next(); + migration.time_updated = now; + migration.state = MigrationState::Failed; + } + } + let vmm_gone = matches!( observed.vmm_state.0, PropolisApiState::Destroyed | PropolisApiState::Failed @@ -303,78 +285,11 @@ impl InstanceStates { // Update the instance record to reflect the result of any completed // migration. - match observed.migration_status { - ObservedMigrationStatus::Succeeded => { - self.transition_migration( - MigrationState::Completed, - observed.time, - ); - match self.propolis_role() { - // This is a successful migration out. Point the instance to the - // target VMM, but don't clear migration IDs; let the target do - // that so that the instance will continue to appear to be - // migrating until it is safe to migrate again. - PropolisRole::Active => { - self.switch_propolis_id_to_target(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Retired); - } - - // This is a successful migration in. Point the instance to the - // target VMM and clear migration IDs so that another migration - // in can begin. Propolis will continue reporting that this - // migration was successful, but because its ID has been - // discarded the observed migration status will change from - // Succeeded to NoMigration. - // - // Note that these calls increment the instance's generation - // number twice. This is by design and allows the target's - // migration-ID-clearing update to overtake the source's update. 
- PropolisRole::MigrationTarget => { - self.switch_propolis_id_to_target(observed.time); - self.clear_migration_ids(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Active); - } - - // This is a migration source that previously reported success - // and removed itself from the active Propolis position. Don't - // touch the instance. - PropolisRole::Retired => {} - } - } - ObservedMigrationStatus::Failed => { - self.transition_migration( - MigrationState::Failed, - observed.time, - ); - - match self.propolis_role() { - // This is a failed migration out. CLear migration IDs so that - // Nexus can try again. - PropolisRole::Active => { - self.clear_migration_ids(observed.time); - } - - // This is a failed migration in. Leave the migration IDs alone - // so that the migration won't appear to have concluded until - // the source is ready to start a new one. - PropolisRole::MigrationTarget => {} - - // This VMM was part of a failed migration and was subsequently - // removed from the instance record entirely. There's nothing to - // update. - PropolisRole::Retired => {} - } - } - ObservedMigrationStatus::InProgress => { - self.transition_migration( - MigrationState::InProgress, - observed.time, - ); - } - ObservedMigrationStatus::NoMigration - | ObservedMigrationStatus::Pending => {} + if let Some(m) = observed.migration_in { + transition_migration(&mut self.migration_in, m, observed.time); + } + if let Some(m) = observed.migration_out { + transition_migration(&mut self.migration_out, m, observed.time); } // If this Propolis has exited, tear down its zone. If it was in the @@ -389,19 +304,13 @@ impl InstanceStates { // been transferred to the target, and what was once an active VMM // is now retired.) 
if vmm_gone { - if self.propolis_role() == PropolisRole::Active { - self.clear_migration_ids(observed.time); - self.retire_active_propolis(observed.time); - } // If there's an active migration and the VMM is suddenly gone, // that should constitute a migration failure! - if let Some(MigrationState::Pending | MigrationState::InProgress) = - self.migration.as_ref().map(|m| m.state) - { - self.transition_migration( - MigrationState::Failed, - observed.time, - ); + if let Some(ref mut m) = self.migration_in { + destroy_migration(m, observed.time); + } + if let Some(ref mut m) = self.migration_out { + destroy_migration(m, observed.time); } Some(Action::Destroy) } else { @@ -409,54 +318,6 @@ impl InstanceStates { } } - /// Yields the role that this structure's VMM has given the structure's - /// current instance state. - fn propolis_role(&self) -> PropolisRole { - if let Some(active_id) = self.instance.propolis_id { - if active_id == self.propolis_id { - return PropolisRole::Active; - } - } - - if let Some(dst_id) = self.instance.dst_propolis_id { - if dst_id == self.propolis_id { - return PropolisRole::MigrationTarget; - } - } - - PropolisRole::Retired - } - - /// Sets the no-VMM fallback state of the current instance to reflect the - /// state of its terminated VMM and clears the instance's current Propolis - /// ID. Note that this routine does not touch any migration IDs. - /// - /// This should only be called by the state block for an active VMM and only - /// when that VMM is in a terminal state (Destroyed or Failed). - fn retire_active_propolis(&mut self, now: DateTime) { - assert!(self.propolis_role() == PropolisRole::Active); - - self.instance.propolis_id = None; - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; - } - - /// Moves the instance's destination Propolis ID into the current active - /// position and updates the generation number, but does not clear the - /// destination ID or the active migration ID. 
This promotes a migration - /// target VMM into the active position without actually allowing a new - /// migration to begin. - /// - /// This routine should only be called when - /// `instance.dst_propolis_id.is_some()`. - fn switch_propolis_id_to_target(&mut self, now: DateTime) { - assert!(self.instance.dst_propolis_id.is_some()); - - self.instance.propolis_id = self.instance.dst_propolis_id; - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; - } - /// Forcibly transitions this instance's VMM into the specified `next` /// state and updates its generation number. pub(crate) fn transition_vmm( @@ -495,135 +356,29 @@ impl InstanceStates { let fake_observed = ObservedPropolisState { vmm_state, - migration_status: if self.instance.migration_id.is_some() { - ObservedMigrationStatus::Failed - } else { - ObservedMigrationStatus::NoMigration - }, + // We don't actually need to populate these, because observing a + // `Destroyed` instance state will fail any in progress migrations anyway. + migration_in: None, + migration_out: None, time: Utc::now(), }; self.apply_propolis_observation(&fake_observed); } - - /// Sets or clears this instance's migration IDs and advances its Propolis - /// generation number. 
- pub(crate) fn set_migration_ids( - &mut self, - ids: &Option, - now: DateTime, - ) { - if let Some(InstanceMigrationSourceParams { - migration_id, - dst_propolis_id, - }) = *ids - { - self.instance.migration_id = Some(migration_id); - self.instance.dst_propolis_id = Some(dst_propolis_id); - let role = if dst_propolis_id == self.propolis_id { - MigrationRole::Target - } else { - MigrationRole::Source - }; - self.migration = Some(MigrationRuntimeState { - migration_id, - state: MigrationState::Pending, - role, - gen: Generation::new(), - time_updated: now, - }) - } else { - self.instance.migration_id = None; - self.instance.dst_propolis_id = None; - self.migration = None; - } - - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; - } - - /// Unconditionally clears the instance's migration IDs and advances its - /// Propolis generation. Not public; used internally to conclude migrations. - fn clear_migration_ids(&mut self, now: DateTime) { - self.instance.migration_id = None; - self.instance.dst_propolis_id = None; - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; - } - - /// Returns true if the migration IDs in this instance are already set as they - /// would be on a successful transition from the migration IDs in - /// `old_runtime` to the ones in `migration_ids`. - pub(crate) fn migration_ids_already_set( - &self, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> bool { - // For the old and new records to match, the new record's Propolis - // generation must immediately succeed the old record's. - // - // This is an equality check to try to avoid the following A-B-A - // problem: - // - // 1. Instance starts on sled 1. - // 2. Parallel sagas start, one to migrate the instance to sled 2 - // and one to migrate the instance to sled 3. - // 3. The "migrate to sled 2" saga completes. - // 4. A new migration starts that migrates the instance back to sled 1. - // 5. 
The "migrate to sled 3" saga attempts to set its migration - // ID. - // - // A simple less-than check allows the migration to sled 3 to proceed - // even though the most-recently-expressed intent to migrate put the - // instance on sled 1. - if old_runtime.gen.next() != self.instance.gen { - return false; - } - - match (self.instance.migration_id, migration_ids) { - // If the migration ID is already set, and this is a request to set - // IDs, the records match if the relevant IDs match. - (Some(current_migration_id), Some(ids)) => { - let current_dst_id = self.instance.dst_propolis_id.expect( - "migration ID and destination ID must be set together", - ); - - current_migration_id == ids.migration_id - && current_dst_id == ids.dst_propolis_id - } - // If the migration ID is already cleared, and this is a request to - // clear IDs, the records match. - (None, None) => { - assert!(self.instance.dst_propolis_id.is_none()); - true - } - _ => false, - } - } } #[cfg(test)] mod test { use super::*; - use crate::params::InstanceMigrationSourceParams; - use chrono::Utc; use omicron_common::api::external::Generation; - use omicron_common::api::internal::nexus::InstanceRuntimeState; use propolis_client::types::InstanceState as Observed; use uuid::Uuid; fn make_instance() -> InstanceStates { let propolis_id = PropolisUuid::new_v4(); let now = Utc::now(); - let instance = InstanceRuntimeState { - propolis_id: Some(propolis_id), - dst_propolis_id: None, - migration_id: None, - gen: Generation::new(), - time_updated: now, - }; let vmm = VmmRuntimeState { state: VmmState::Starting, @@ -631,19 +386,16 @@ mod test { time_updated: now, }; - InstanceStates::new(instance, vmm, propolis_id) + InstanceStates::new(vmm, propolis_id, None) } fn make_migration_source_instance() -> InstanceStates { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; let migration_id = Uuid::new_v4(); - state.instance.migration_id = Some(migration_id); - state.instance.dst_propolis_id = 
Some(PropolisUuid::new_v4()); - state.migration = Some(MigrationRuntimeState { + state.migration_out = Some(MigrationRuntimeState { migration_id, state: MigrationState::InProgress, - role: MigrationRole::Source, // advance the generation once, since we are starting out in the // `InProgress` state. gen: Generation::new().next(), @@ -654,22 +406,16 @@ mod test { } fn make_migration_target_instance() -> InstanceStates { - let mut state = make_instance(); - state.vmm.state = VmmState::Migrating; - let migration_id = Uuid::new_v4(); - state.instance.migration_id = Some(migration_id); - state.propolis_id = PropolisUuid::new_v4(); - state.instance.dst_propolis_id = Some(state.propolis_id); - state.migration = Some(MigrationRuntimeState { - migration_id, - state: MigrationState::InProgress, - role: MigrationRole::Target, - // advance the generation once, since we are starting out in the - // `InProgress` state. - gen: Generation::new().next(), - time_updated: Utc::now(), - }); - state + let propolis_id = PropolisUuid::new_v4(); + let now = Utc::now(); + + let vmm = VmmRuntimeState { + state: VmmState::Migrating, + gen: Generation::new(), + time_updated: now, + }; + + InstanceStates::new(vmm, propolis_id, Some(Uuid::new_v4())) } fn make_observed_state( @@ -677,7 +423,8 @@ mod test { ) -> ObservedPropolisState { ObservedPropolisState { vmm_state: propolis_state, - migration_status: ObservedMigrationStatus::NoMigration, + migration_in: None, + migration_out: None, time: Utc::now(), } } @@ -689,36 +436,6 @@ mod test { prev: &InstanceStates, next: &InstanceStates, ) { - // The predicate under test below is "if an interesting field changed, - // then the generation number changed." Testing the contrapositive is a - // little nicer because the assertion that trips identifies exactly - // which field changed without updating the generation number. - // - // The else branch tests the converse to make sure the generation number - // does not update unexpectedly. 
While this won't cause an important - // state update to be dropped, it can interfere with updates from other - // sleds that expect their own attempts to advance the generation number - // to cause new state to be recorded. - if prev.instance.gen == next.instance.gen { - assert_eq!(prev.instance.propolis_id, next.instance.propolis_id); - assert_eq!( - prev.instance.dst_propolis_id, - next.instance.dst_propolis_id - ); - assert_eq!(prev.instance.migration_id, next.instance.migration_id); - } else { - assert!( - (prev.instance.propolis_id != next.instance.propolis_id) - || (prev.instance.dst_propolis_id - != next.instance.dst_propolis_id) - || (prev.instance.migration_id - != next.instance.migration_id), - "prev: {:?}, next: {:?}", - prev, - next - ); - } - // Propolis is free to publish no-op VMM state updates (e.g. when an // in-progress migration's state changes but the migration is not yet // complete), so don't test the converse here. @@ -731,60 +448,63 @@ mod test { fn propolis_terminal_states_request_destroy_action() { for state in [Observed::Destroyed, Observed::Failed] { let mut instance_state = make_instance(); - let original_instance_state = instance_state.clone(); let requested_action = instance_state .apply_propolis_observation(&make_observed_state(state.into())); assert!(matches!(requested_action, Some(Action::Destroy))); - assert!( - instance_state.instance.gen - > original_instance_state.instance.gen - ); } } - fn test_termination_fails_in_progress_migration( - mk_instance: impl Fn() -> InstanceStates, - ) { + #[test] + fn source_termination_fails_in_progress_migration() { for state in [Observed::Destroyed, Observed::Failed] { - let mut instance_state = mk_instance(); - let original_migration = instance_state.clone().migration.unwrap(); + let mut instance_state = make_migration_source_instance(); + let original_migration = + instance_state.clone().migration_out.unwrap(); let requested_action = instance_state 
.apply_propolis_observation(&make_observed_state(state.into())); - let migration = - instance_state.migration.expect("state must have a migration"); + let migration = instance_state + .migration_out + .expect("state must have a migration"); assert_eq!(migration.state, MigrationState::Failed); assert!(migration.gen > original_migration.gen); assert!(matches!(requested_action, Some(Action::Destroy))); } } - #[test] - fn source_termination_fails_in_progress_migration() { - test_termination_fails_in_progress_migration( - make_migration_source_instance, - ) - } - #[test] fn target_termination_fails_in_progress_migration() { - test_termination_fails_in_progress_migration( - make_migration_target_instance, - ) + for state in [Observed::Destroyed, Observed::Failed] { + let mut instance_state = make_migration_target_instance(); + let original_migration = + instance_state.clone().migration_in.unwrap(); + let requested_action = instance_state + .apply_propolis_observation(&make_observed_state(state.into())); + + let migration = instance_state + .migration_in + .expect("state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > original_migration.gen); + assert!(matches!(requested_action, Some(Action::Destroy))); + } } #[test] fn destruction_after_migration_out_does_not_transition() { let mut state = make_migration_source_instance(); - assert!(state.instance.dst_propolis_id.is_some()); - assert_ne!(state.instance.propolis_id, state.instance.dst_propolis_id); + let migration_id = state.migration_out.as_ref().unwrap().migration_id; // After a migration succeeds, the source VM appears to stop but reports // that the migration has succeeded. 
let mut observed = ObservedPropolisState { vmm_state: PropolisInstanceState(Observed::Stopping), - migration_status: ObservedMigrationStatus::Succeeded, + migration_out: Some(ObservedMigrationState { + state: MigrationState::Completed, + id: migration_id, + }), + migration_in: None, time: Utc::now(), }; @@ -794,21 +514,14 @@ mod test { let prev = state.clone(); assert!(state.apply_propolis_observation(&observed).is_none()); assert_state_change_has_gen_change(&prev, &state); - assert!(state.instance.gen > prev.instance.gen); - assert_eq!( - state.instance.dst_propolis_id, - prev.instance.dst_propolis_id - ); - assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); - assert!(state.instance.migration_id.is_some()); // The migration state should transition to "completed" let migration = state - .migration + .migration_out .clone() .expect("instance must have a migration state"); let prev_migration = - prev.migration.expect("previous state must have a migration"); + prev.migration_out.expect("previous state must have a migration"); assert_eq!(migration.state, MigrationState::Completed); assert!(migration.gen > prev_migration.gen); let prev_migration = migration; @@ -820,7 +533,6 @@ mod test { observed.vmm_state = PropolisInstanceState(Observed::Stopped); assert!(state.apply_propolis_observation(&observed).is_none()); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); // The Stopped state is translated internally to Stopping to prevent // external viewers from perceiving that the instance is stopped before @@ -830,7 +542,7 @@ mod test { // Now that the migration has completed, it should not transition again. 
let migration = state - .migration + .migration_out .clone() .expect("instance must have a migration state"); assert_eq!(migration.state, MigrationState::Completed); @@ -844,12 +556,19 @@ mod test { Some(Action::Destroy) )); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Destroyed); assert!(state.vmm.gen > prev.vmm.gen); let migration = state - .migration + .migration_out + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert_eq!(migration.gen, prev_migration.gen); + + state.terminate_rudely(false); + let migration = state + .migration_out .clone() .expect("instance must have a migration state"); assert_eq!(migration.state, MigrationState::Completed); @@ -859,12 +578,17 @@ mod test { #[test] fn failure_after_migration_in_does_not_transition() { let mut state = make_migration_target_instance(); + let migration_id = state.migration_in.as_ref().unwrap().migration_id; // Failure to migrate into an instance should mark the VMM as destroyed // but should not change the instance's migration IDs. let observed = ObservedPropolisState { vmm_state: PropolisInstanceState(Observed::Failed), - migration_status: ObservedMigrationStatus::Failed, + migration_in: Some(ObservedMigrationState { + state: MigrationState::Failed, + id: migration_id, + }), + migration_out: None, time: Utc::now(), }; @@ -874,15 +598,14 @@ mod test { Some(Action::Destroy) )); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Failed); assert!(state.vmm.gen > prev.vmm.gen); // The migration state should transition. 
let migration = - state.migration.expect("instance must have a migration state"); + state.migration_in.expect("instance must have a migration state"); let prev_migration = - prev.migration.expect("previous state must have a migration"); + prev.migration_in.expect("previous state must have a migration"); assert_eq!(migration.state, MigrationState::Failed); assert!(migration.gen > prev_migration.gen); } @@ -896,192 +619,19 @@ mod test { #[test] fn rude_terminate_of_migration_target_does_not_transition_instance() { let mut state = make_migration_target_instance(); - assert_eq!(state.propolis_role(), PropolisRole::MigrationTarget); let prev = state.clone(); let mark_failed = false; state.terminate_rudely(mark_failed); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); // The migration state should transition. let migration = - state.migration.expect("instance must have a migration state"); + state.migration_in.expect("instance must have a migration state"); let prev_migration = - prev.migration.expect("previous state must have a migration"); + prev.migration_in.expect("previous state must have a migration"); assert_eq!(migration.state, MigrationState::Failed); assert!(migration.gen > prev_migration.gen); } - - #[test] - fn migration_out_after_migration_in() { - let mut state = make_migration_target_instance(); - let mut observed = ObservedPropolisState { - vmm_state: PropolisInstanceState(Observed::Running), - migration_status: ObservedMigrationStatus::Succeeded, - time: Utc::now(), - }; - - // The transition into the Running state on the migration target should - // take over for the source, updating the Propolis generation. 
- let prev = state.clone(); - assert!(state.apply_propolis_observation(&observed).is_none()); - assert_state_change_has_gen_change(&prev, &state); - assert!(state.instance.migration_id.is_none()); - assert!(state.instance.dst_propolis_id.is_none()); - assert!(state.instance.gen > prev.instance.gen); - assert_eq!(state.vmm.state, VmmState::Running); - assert!(state.vmm.gen > prev.vmm.gen); - - // The migration state should transition to completed. - let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - let prev_migration = - prev.migration.expect("previous state must have a migration"); - assert_eq!(migration.state, MigrationState::Completed); - assert!(migration.gen > prev_migration.gen); - - // Pretend Nexus set some new migration IDs. - let migration_id = Uuid::new_v4(); - let prev = state.clone(); - state.set_migration_ids( - &Some(InstanceMigrationSourceParams { - migration_id, - dst_propolis_id: PropolisUuid::new_v4(), - }), - Utc::now(), - ); - assert_state_change_has_gen_change(&prev, &state); - assert!(state.instance.gen > prev.instance.gen); - assert_eq!(state.vmm.gen, prev.vmm.gen); - - // There should be a new, pending migration state. - let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - assert_eq!(migration.state, MigrationState::Pending); - assert_eq!(migration.migration_id, migration_id); - let prev_migration = migration; - - // Mark that the new migration out is in progress. This doesn't change - // anything in the instance runtime state, but does update the VMM state - // generation. 
- let prev = state.clone(); - observed.vmm_state = PropolisInstanceState(Observed::Migrating); - observed.migration_status = ObservedMigrationStatus::InProgress; - assert!(state.apply_propolis_observation(&observed).is_none()); - assert_state_change_has_gen_change(&prev, &state); - assert_eq!( - state.instance.migration_id.unwrap(), - prev.instance.migration_id.unwrap() - ); - assert_eq!( - state.instance.dst_propolis_id.unwrap(), - prev.instance.dst_propolis_id.unwrap() - ); - assert_eq!(state.vmm.state, VmmState::Migrating); - assert!(state.vmm.gen > prev.vmm.gen); - assert_eq!(state.instance.gen, prev.instance.gen); - - // The migration state should transition to in progress. - let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - assert_eq!(migration.state, MigrationState::InProgress); - assert!(migration.gen > prev_migration.gen); - let prev_migration = migration; - - // Propolis will publish that the migration succeeds before changing any - // state. This should transfer control to the target but should not - // touch the migration ID (that is the new target's job). - let prev = state.clone(); - observed.vmm_state = PropolisInstanceState(Observed::Migrating); - observed.migration_status = ObservedMigrationStatus::Succeeded; - assert!(state.apply_propolis_observation(&observed).is_none()); - assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.vmm.state, VmmState::Migrating); - assert!(state.vmm.gen > prev.vmm.gen); - assert_eq!(state.instance.migration_id, prev.instance.migration_id); - assert_eq!( - state.instance.dst_propolis_id, - prev.instance.dst_propolis_id, - ); - assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); - assert!(state.instance.gen > prev.instance.gen); - - // The migration state should transition to completed. 
- let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - assert_eq!(migration.state, MigrationState::Completed); - assert!(migration.gen > prev_migration.gen); - - // The rest of the destruction sequence is covered by other tests. - } - - #[test] - fn test_migration_ids_already_set() { - let orig_instance = make_instance(); - let mut old_instance = orig_instance.clone(); - let mut new_instance = old_instance.clone(); - - // Advancing the old instance's migration IDs and then asking if the - // new IDs are present should indicate that they are indeed present. - let migration_ids = InstanceMigrationSourceParams { - migration_id: Uuid::new_v4(), - dst_propolis_id: PropolisUuid::new_v4(), - }; - - new_instance.set_migration_ids(&Some(migration_ids), Utc::now()); - assert!(new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - // The IDs aren't already set if the new record has an ID that's - // advanced from the old record by more than one generation. - let mut newer_instance = new_instance.clone(); - newer_instance.instance.gen = newer_instance.instance.gen.next(); - assert!(!newer_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - // They also aren't set if the old generation has somehow equaled or - // surpassed the current generation. - old_instance.instance.gen = old_instance.instance.gen.next(); - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - // If the generation numbers are right, but either requested ID is not - // present in the current instance, the requested IDs aren't set. 
- old_instance = orig_instance; - new_instance.instance.migration_id = Some(Uuid::new_v4()); - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - new_instance.instance.migration_id = Some(migration_ids.migration_id); - new_instance.instance.dst_propolis_id = Some(PropolisUuid::new_v4()); - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - new_instance.instance.migration_id = None; - new_instance.instance.dst_propolis_id = None; - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - } } diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 97671b42e6..11c998bf9e 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -8,9 +8,9 @@ use super::sled_agent::SledAgent; use crate::bootstrap::params::AddSledRequest; use crate::params::{ BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, - InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, TimeSync, - VpcFirewallRulesEnsureBody, ZoneBundleId, ZoneBundleMetadata, Zpool, + InstanceExternalIpBody, InstancePutStateBody, InstancePutStateResponse, + InstanceUnregisterResponse, TimeSync, VpcFirewallRulesEnsureBody, + ZoneBundleId, ZoneBundleMetadata, Zpool, }; use crate::sled_agent::Error as SledAgentError; use crate::zone_bundle; @@ -57,7 +57,6 @@ pub fn api() -> SledApiDescription { api.register(disk_put)?; api.register(cockroachdb_init)?; api.register(instance_issue_disk_snapshot_request)?; - api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; api.register(instance_get_state)?; api.register(instance_put_external_ip)?; @@ -528,28 +527,6 @@ async fn instance_get_state( Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) } -#[endpoint { - method = PUT, - 
path = "/instances/{instance_id}/migration-ids", -}] -async fn instance_put_migration_ids( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_put_migration_ids( - instance_id, - &body_args.old_runtime, - &body_args.migration_params, - ) - .await?, - )) -} - #[endpoint { method = PUT, path = "/instances/{instance_id}/external-ip", diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 7bfe308f94..631f2b83f6 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -16,9 +16,9 @@ use crate::nexus::NexusClientWithResolver; use crate::params::ZoneBundleMetadata; use crate::params::{InstanceExternalIpBody, ZoneBundleCause}; use crate::params::{ - InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams, - InstanceMigrationTargetParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, VpcFirewallRule, + InstanceHardware, InstanceMetadata, InstanceMigrationTargetParams, + InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, VpcFirewallRule, }; use crate::profile::*; use crate::zone_bundle::BundleError; @@ -33,7 +33,7 @@ use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory}; use illumos_utils::svc::wait_for_service; use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, SledInstanceState, VmmRuntimeState, + SledInstanceState, VmmRuntimeState, }; use omicron_common::api::internal::shared::{ NetworkInterface, SledIdentifiers, SourceNatConfig, @@ -228,11 +228,6 @@ enum InstanceRequest { state: crate::params::InstanceStateRequested, tx: oneshot::Sender>, }, - PutMigrationIds { - old_runtime: InstanceRuntimeState, - migration_ids: Option, - tx: oneshot::Sender>, - }, Terminate { mark_failed: bool, tx: 
oneshot::Sender>, @@ -384,10 +379,7 @@ impl InstanceRunner { use InstanceMonitorRequest::*; match request { Some(Update { state, tx }) => { - let observed = ObservedPropolisState::new( - self.state.instance(), - &state, - ); + let observed = ObservedPropolisState::new(&state); let reaction = self.observe_state(&observed).await; self.publish_state_to_nexus().await; @@ -431,15 +423,6 @@ impl InstanceRunner { .map_err(|e| e.into())) .map_err(|_| Error::FailedSendClientClosed) }, - Some(PutMigrationIds{ old_runtime, migration_ids, tx }) => { - tx.send( - self.put_migration_ids( - &old_runtime, - &migration_ids - ).await.map_err(|e| e.into()) - ) - .map_err(|_| Error::FailedSendClientClosed) - }, Some(Terminate { mark_failed, tx }) => { tx.send(Ok(InstanceUnregisterResponse { updated_runtime: Some(self.terminate(mark_failed).await) @@ -504,9 +487,6 @@ impl InstanceRunner { PutState { tx, .. } => { tx.send(Err(Error::Terminating.into())).map_err(|_| ()) } - PutMigrationIds { tx, .. } => { - tx.send(Err(Error::Terminating.into())).map_err(|_| ()) - } Terminate { tx, .. } => { tx.send(Err(Error::Terminating.into())).map_err(|_| ()) } @@ -649,7 +629,6 @@ impl InstanceRunner { self.log, "updated state after observing Propolis state change"; "propolis_id" => %self.state.propolis_id(), - "new_instance_state" => ?self.state.instance(), "new_vmm_state" => ?self.state.vmm() ); @@ -711,10 +690,27 @@ impl InstanceRunner { let migrate = match migrate { Some(params) => { - let migration_id = - self.state.instance().migration_id.ok_or_else(|| { - Error::Migration(anyhow!("Missing Migration UUID")) - })?; + let migration_id = self.state + .migration_in() + // TODO(eliza): This is a bit of an unfortunate dance: the + // initial instance-ensure-registered request is what sends + // the migration ID, but it's the subsequent + // instance-ensure-state request (which we're handling here) + // that includes migration the source VMM's UUID and IP + // address. 
Because the API currently splits the migration + // IDs between the instance-ensure-registered and + // instance-ensure-state requests, we have to stash the + // migration ID in an `Option` and `expect()` it here, + // panicking if we get an instance-ensure-state request with + // a source Propolis ID if the instance wasn't registered + // with a migration in ID. + // + // This is kind of a shame. Eventually, we should consider + // reworking the API ensure-state request contains the + // migration ID, and we don't have to unwrap here. See: + // https://github.com/oxidecomputer/omicron/issues/6073 + .expect("if we have migration target params, we should also have a migration in") + .migration_id; Some(propolis_client::types::InstanceMigrateInitiateRequest { src_addr: params.src_propolis_addr.to_string(), src_uuid: params.src_propolis_id, @@ -969,9 +965,11 @@ pub struct Instance { #[derive(Debug)] pub(crate) struct InstanceInitialState { pub hardware: InstanceHardware, - pub instance_runtime: InstanceRuntimeState, pub vmm_runtime: VmmRuntimeState, pub propolis_addr: SocketAddr, + /// UUID of the migration in to this VMM, if the VMM is being created as the + /// target of an active migration. 
+ pub migration_id: Option, } impl Instance { @@ -1002,13 +1000,14 @@ impl Instance { info!(log, "initializing new Instance"; "instance_id" => %id, "propolis_id" => %propolis_id, + "migration_id" => ?state.migration_id, "state" => ?state); let InstanceInitialState { hardware, - instance_runtime, vmm_runtime, propolis_addr, + migration_id, } = state; let InstanceManagerServices { @@ -1098,11 +1097,7 @@ impl Instance { dhcp_config, requested_disks: hardware.disks, cloud_init_bytes: hardware.cloud_init_bytes, - state: InstanceStates::new( - instance_runtime, - vmm_runtime, - propolis_id, - ), + state: InstanceStates::new(vmm_runtime, propolis_id, migration_id), running_state: None, nexus_client, storage, @@ -1173,23 +1168,6 @@ impl Instance { Ok(()) } - pub async fn put_migration_ids( - &self, - tx: oneshot::Sender>, - old_runtime: InstanceRuntimeState, - migration_ids: Option, - ) -> Result<(), Error> { - self.tx - .send(InstanceRequest::PutMigrationIds { - old_runtime, - migration_ids, - tx, - }) - .await - .map_err(|_| Error::FailedSendChannelClosed)?; - Ok(()) - } - /// Rudely terminates this instance's Propolis (if it has one) and /// immediately transitions the instance to the Destroyed state. pub async fn terminate( @@ -1376,36 +1354,6 @@ impl InstanceRunner { Ok(self.state.sled_instance_state()) } - async fn put_migration_ids( - &mut self, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result { - // Check that the instance's current generation matches the one the - // caller expects to transition from. This helps Nexus ensure that if - // multiple migration sagas launch at Propolis generation N, then only - // one of them will successfully set the instance's migration IDs. - if self.state.instance().gen != old_runtime.gen { - // Allow this transition for idempotency if the instance is - // already in the requested goal state. 
- if self.state.migration_ids_already_set(old_runtime, migration_ids) - { - return Ok(self.state.sled_instance_state()); - } - - return Err(Error::Transition( - omicron_common::api::external::Error::conflict(format!( - "wrong instance state generation: expected {}, got {}", - self.state.instance().gen, - old_runtime.gen - )), - )); - } - - self.state.set_migration_ids(migration_ids, Utc::now()); - Ok(self.state.sled_instance_state()) - } - async fn setup_propolis_inner(&mut self) -> Result { // Create OPTE ports for the instance. We also store the names of all // those ports to notify the metrics task to start collecting statistics @@ -1637,7 +1585,9 @@ mod tests { use omicron_common::api::external::{ ByteCount, Generation, Hostname, InstanceCpuCount, }; - use omicron_common::api::internal::nexus::{InstanceProperties, VmmState}; + use omicron_common::api::internal::nexus::{ + InstanceProperties, InstanceRuntimeState, VmmState, + }; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_common::FileKv; use sled_storage::manager_test_harness::StorageManagerTestHarness; @@ -1819,8 +1769,7 @@ mod tests { let ticket = InstanceTicket::new_without_manager_for_test(id); - let initial_state = - fake_instance_initial_state(propolis_id, propolis_addr); + let initial_state = fake_instance_initial_state(propolis_addr); let (services, rx) = fake_instance_manager_services( log, @@ -1856,7 +1805,6 @@ mod tests { } fn fake_instance_initial_state( - propolis_id: PropolisUuid, propolis_addr: SocketAddr, ) -> InstanceInitialState { let hardware = InstanceHardware { @@ -1886,19 +1834,13 @@ mod tests { InstanceInitialState { hardware, - instance_runtime: InstanceRuntimeState { - propolis_id: Some(propolis_id), - dst_propolis_id: None, - migration_id: None, - gen: Generation::new(), - time_updated: Default::default(), - }, vmm_runtime: VmmRuntimeState { state: VmmState::Starting, gen: Generation::new(), time_updated: Default::default(), }, propolis_addr, + migration_id: 
None, } } @@ -2283,10 +2225,10 @@ mod tests { let propolis_id = PropolisUuid::from_untyped_uuid(PROPOLIS_ID); let InstanceInitialState { hardware, - instance_runtime, vmm_runtime, propolis_addr, - } = fake_instance_initial_state(propolis_id, propolis_addr); + migration_id: _, + } = fake_instance_initial_state(propolis_addr); let metadata = InstanceMetadata { silo_id: Uuid::new_v4(), @@ -2300,6 +2242,14 @@ mod tests { serial: "fake-serial".into(), }; + let instance_runtime = InstanceRuntimeState { + propolis_id: Some(propolis_id), + dst_propolis_id: None, + migration_id: None, + gen: Generation::new(), + time_updated: Default::default(), + }; + mgr.ensure_registered( instance_id, propolis_id, diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index bb9303f5e2..1b2fb204d0 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -12,8 +12,8 @@ use crate::params::InstanceExternalIpBody; use crate::params::InstanceMetadata; use crate::params::ZoneBundleMetadata; use crate::params::{ - InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, + InstanceHardware, InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, }; use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::BundleError; @@ -166,7 +166,7 @@ impl InstanceManager { instance_runtime, vmm_runtime, propolis_addr, - sled_identifiers, + sled_identifiers: Box::new(sled_identifiers), metadata, tx, }) @@ -225,26 +225,6 @@ impl InstanceManager { } } - pub async fn put_migration_ids( - &self, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result { - let (tx, rx) = oneshot::channel(); - self.inner - .tx - .send(InstanceManagerRequest::PutMigrationIds { - instance_id, - old_runtime: old_runtime.clone(), - migration_ids: *migration_ids, - tx, - }) - .await - .map_err(|_| 
Error::FailedSendInstanceManagerClosed)?; - rx.await? - } - pub async fn instance_issue_disk_snapshot_request( &self, instance_id: InstanceUuid, @@ -369,7 +349,12 @@ enum InstanceManagerRequest { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, - sled_identifiers: SledIdentifiers, + // These are boxed because they are, apparently, quite large, and Clippy + // whinges about the overall size of this variant relative to the + // others. Since we will generally send `EnsureRegistered` requests much + // less frequently than most of the others, boxing this seems like a + // reasonable choice... + sled_identifiers: Box, metadata: InstanceMetadata, tx: oneshot::Sender>, }, @@ -382,12 +367,7 @@ enum InstanceManagerRequest { target: InstanceStateRequested, tx: oneshot::Sender>, }, - PutMigrationIds { - instance_id: InstanceUuid, - old_runtime: InstanceRuntimeState, - migration_ids: Option, - tx: oneshot::Sender>, - }, + InstanceIssueDiskSnapshot { instance_id: InstanceUuid, disk_id: Uuid, @@ -505,7 +485,7 @@ impl InstanceManagerRunner { instance_runtime, vmm_runtime, propolis_addr, - sled_identifiers, + *sled_identifiers, metadata ).await).map_err(|_| Error::FailedSendClientClosed) }, @@ -515,9 +495,6 @@ impl InstanceManagerRunner { Some(EnsureState { instance_id, target, tx }) => { self.ensure_state(tx, instance_id, target).await }, - Some(PutMigrationIds { instance_id, old_runtime, migration_ids, tx }) => { - self.put_migration_ids(tx, instance_id, &old_runtime, &migration_ids).await - }, Some(InstanceIssueDiskSnapshot { instance_id, disk_id, snapshot_id, tx }) => { self.instance_issue_disk_snapshot_request(tx, instance_id, disk_id, snapshot_id).await }, @@ -631,7 +608,8 @@ impl InstanceManagerRunner { info!(&self.log, "registering new instance"; "instance_id" => ?instance_id); - let instance_log = self.log.new(o!()); + let instance_log = + self.log.new(o!("instance_id" => format!("{instance_id}"))); let ticket = 
InstanceTicket::new(instance_id, self.terminate_tx.clone()); @@ -647,9 +625,9 @@ impl InstanceManagerRunner { let state = crate::instance::InstanceInitialState { hardware, - instance_runtime, vmm_runtime, propolis_addr, + migration_id: instance_runtime.migration_id, }; let instance = Instance::new( @@ -729,25 +707,6 @@ impl InstanceManagerRunner { Ok(()) } - /// Idempotently attempts to set the instance's migration IDs to the - /// supplied IDs. - async fn put_migration_ids( - &mut self, - tx: oneshot::Sender>, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result<(), Error> { - let (_, instance) = self - .instances - .get(&instance_id) - .ok_or_else(|| Error::NoSuchInstance(instance_id))?; - instance - .put_migration_ids(tx, old_runtime.clone(), *migration_ids) - .await?; - Ok(()) - } - async fn instance_issue_disk_snapshot_request( &self, tx: oneshot::Sender>, diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 29f0b9c9c3..8bf38bde0e 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -208,23 +208,6 @@ pub struct InstanceMigrationSourceParams { pub dst_propolis_id: PropolisUuid, } -/// The body of a request to set or clear the migration identifiers from a -/// sled agent's instance state records. -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct InstancePutMigrationIdsBody { - /// The last instance runtime state known to this requestor. This request - /// will succeed if either (a) the state generation in the sled agent's - /// runtime state matches the generation in this record, or (b) the sled - /// agent's runtime state matches what would result from applying this - /// request to the caller's runtime state. This latter condition provides - /// idempotency. - pub old_runtime: InstanceRuntimeState, - - /// The migration identifiers to set. If `None`, this operation clears the - /// migration IDs. 
- pub migration_params: Option, -} - #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] pub enum DiskType { U2, diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index ec19863bef..d7caf673c9 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -15,9 +15,10 @@ use nexus_sled_agent_shared::inventory::{ }; use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, ReservedRackSubnet, - COCKROACHDB_REDUNDANCY, DENDRITE_PORT, DNS_HTTP_PORT, DNS_PORT, - DNS_REDUNDANCY, MAX_DNS_REDUNDANCY, MGD_PORT, MGS_PORT, NEXUS_REDUNDANCY, - NTP_PORT, NUM_SOURCE_NAT_PORTS, RSS_RESERVED_ADDRESSES, SLED_PREFIX, + BOUNDARY_NTP_REDUNDANCY, COCKROACHDB_REDUNDANCY, DENDRITE_PORT, + DNS_HTTP_PORT, DNS_PORT, DNS_REDUNDANCY, MAX_DNS_REDUNDANCY, MGD_PORT, + MGS_PORT, NEXUS_REDUNDANCY, NTP_PORT, NUM_SOURCE_NAT_PORTS, + RSS_RESERVED_ADDRESSES, SLED_PREFIX, }; use omicron_common::api::external::{Generation, MacAddr, Vni}; use omicron_common::api::internal::shared::{ @@ -49,9 +50,6 @@ use std::num::Wrapping; use thiserror::Error; use uuid::Uuid; -// The number of boundary NTP servers to create from RSS. -const BOUNDARY_NTP_COUNT: usize = 2; - // TODO(https://github.com/oxidecomputer/omicron/issues/732): Remove // when Nexus provisions Oximeter. 
const OXIMETER_COUNT: usize = 1; @@ -735,7 +733,7 @@ impl Plan { let ntp_address = SocketAddrV6::new(address, NTP_PORT, 0, 0); let filesystem_pool = Some(sled.alloc_zpool_from_u2s()?); - let (zone_type, svcname) = if idx < BOUNDARY_NTP_COUNT { + let (zone_type, svcname) = if idx < BOUNDARY_NTP_REDUNDANCY { boundary_ntp_servers .push(Host::for_zone(Zone::Other(id)).fqdn()); let (nic, snat_cfg) = svc_port_builder.next_snat(id)?; diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 3e29a544cb..bb8701771f 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -60,6 +60,8 @@ use illumos_utils::zfs::ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT; use illumos_utils::zone::AddressRequest; use illumos_utils::zpool::{PathInPool, ZpoolName}; use illumos_utils::{execute, PFEXEC}; +use internal_dns::names::BOUNDARY_NTP_DNS_NAME; +use internal_dns::names::DNS_ZONE; use internal_dns::resolver::Resolver; use itertools::Itertools; use nexus_config::{ConfigDropshotWithTls, DeploymentConfig}; @@ -1993,15 +1995,17 @@ impl ServiceManager { .add_property( "boundary", "boolean", - &is_boundary.to_string(), + is_boundary.to_string(), + ) + .add_property( + "boundary_pool", + "astring", + format!("{BOUNDARY_NTP_DNS_NAME}.{DNS_ZONE}"), ); for s in ntp_servers { - chrony_config = chrony_config.add_property( - "server", - "astring", - &s.to_string(), - ); + chrony_config = + chrony_config.add_property("server", "astring", s); } let dns_client_service; diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 8af71ac026..ffb7327ce7 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -422,7 +422,6 @@ mod test { use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::DiskRuntimeState; - use omicron_common::api::internal::nexus::InstanceRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; use 
omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::nexus::VmmState; @@ -433,14 +432,6 @@ mod test { logctx: &LogContext, ) -> (SimObject, Receiver<()>) { let propolis_id = PropolisUuid::new_v4(); - let instance_vmm = InstanceRuntimeState { - propolis_id: Some(propolis_id), - dst_propolis_id: None, - migration_id: None, - gen: Generation::new(), - time_updated: Utc::now(), - }; - let vmm_state = VmmRuntimeState { state: VmmState::Starting, gen: Generation::new(), @@ -448,10 +439,10 @@ mod test { }; let state = SledInstanceState { - instance_state: instance_vmm, vmm_state, propolis_id, - migration_state: None, + migration_in: None, + migration_out: None, }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) @@ -501,14 +492,8 @@ mod test { assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); let rnext = instance.object.current(); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); assert!(rnext.vmm_state.gen > rprev.vmm_state.gen); - assert!( - rnext.instance_state.time_updated - >= rprev.instance_state.time_updated - ); assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); - assert!(rnext.instance_state.propolis_id.is_none()); assert_eq!(rnext.vmm_state.state, VmmState::Destroyed); assert!(rx.try_next().is_err()); @@ -632,7 +617,6 @@ mod test { assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); assert_eq!(rprev.vmm_state.state, VmmState::Stopping); assert_eq!(rnext.vmm_state.state, VmmState::Destroyed); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); logctx.cleanup_successful(); } diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 268e8a9cf1..d042e19814 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -4,11 +4,11 @@ //! 
HTTP entrypoint functions for the sled agent's exposed API +use super::collection::PokeMode; use crate::bootstrap::params::AddSledRequest; use crate::params::{ DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, - InstancePutMigrationIdsBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, + InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, VpcFirewallRulesEnsureBody, }; use dropshot::ApiDescription; @@ -45,7 +45,6 @@ pub fn api() -> SledApiDescription { fn register_endpoints( api: &mut SledApiDescription, ) -> Result<(), ApiDescriptionRegisterError> { - api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; api.register(instance_get_state)?; api.register(instance_register)?; @@ -53,6 +52,8 @@ pub fn api() -> SledApiDescription { api.register(instance_put_external_ip)?; api.register(instance_delete_external_ip)?; api.register(instance_poke_post)?; + api.register(instance_poke_single_step_post)?; + api.register(instance_post_sim_migration_source)?; api.register(disk_put)?; api.register(disk_poke_post)?; api.register(update_artifact)?; @@ -157,28 +158,6 @@ async fn instance_get_state( Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}/migration-ids", -}] -async fn instance_put_migration_ids( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_put_migration_ids( - instance_id, - &body_args.old_runtime, - &body_args.migration_params, - ) - .await?, - )) -} - #[endpoint { method = PUT, path = "/instances/{instance_id}/external-ip", @@ -221,7 +200,37 @@ async fn instance_poke_post( ) -> Result { let sa = rqctx.context(); let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id).await; 
+ sa.instance_poke(instance_id, PokeMode::Drain).await; + Ok(HttpResponseUpdatedNoContent()) +} + +#[endpoint { + method = POST, + path = "/instances/{instance_id}/poke-single-step", +}] +async fn instance_poke_single_step_post( + rqctx: RequestContext>, + path_params: Path, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + sa.instance_poke(instance_id, PokeMode::SingleStep).await; + Ok(HttpResponseUpdatedNoContent()) +} + +#[endpoint { + method = POST, + path = "/instances/{instance_id}/sim-migration-source", +}] +async fn instance_post_sim_migration_source( + rqctx: RequestContext>, + path_params: Path, + body: TypedBody, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + sa.instance_simulate_migration_source(instance_id, body.into_inner()) + .await?; Ok(HttpResponseUpdatedNoContent()) } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index e94b3b4984..8ee0130262 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -8,16 +8,14 @@ use super::simulatable::Simulatable; use crate::common::instance::{ObservedPropolisState, PublishedVmmState}; use crate::nexus::NexusClient; -use crate::params::{InstanceMigrationSourceParams, InstanceStateRequested}; +use crate::params::InstanceStateRequested; use async_trait::async_trait; use chrono::Utc; use nexus_client; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, MigrationRole, SledInstanceState, VmmState, -}; +use omicron_common::api::internal::nexus::{SledInstanceState, VmmState}; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateResponse, InstanceMigrationStatus as PropolisMigrationStatus, @@ -30,6 +28,10 @@ use uuid::Uuid; use crate::common::instance::{Action as InstanceAction, 
InstanceStates}; +pub use sled_agent_client::{ + SimulateMigrationSource, SimulatedMigrationResult, +}; + #[derive(Clone, Debug)] enum MonitorChange { PropolisState(PropolisInstanceState), @@ -79,56 +81,67 @@ impl SimInstanceInner { self.queue.push_back(MonitorChange::MigrateStatus(migrate_status)) } - /// Queue a successful simulated migration. - /// - fn queue_successful_migration(&mut self, role: MigrationRole) { + /// Queue a simulated migration out. + fn queue_migration_out( + &mut self, + migration_id: Uuid, + result: SimulatedMigrationResult, + ) { + let migration_update = |state| PropolisMigrateResponse { + migration_in: None, + migration_out: Some(PropolisMigrationStatus { + id: migration_id, + state, + }), + }; // Propolis transitions to the Migrating state once before // actually starting migration. self.queue_propolis_state(PropolisInstanceState::Migrating); - let migration_id = - self.state.instance().migration_id.unwrap_or_else(|| { - panic!( - "should have migration ID set before getting request to - migrate in (current state: {:?})", - self - ) - }); - - match role { - MigrationRole::Source => { - self.queue_migration_update(PropolisMigrateResponse { - migration_in: None, - migration_out: Some(PropolisMigrationStatus { - id: migration_id, - state: propolis_client::types::MigrationState::Sync, - }), - }); - self.queue_migration_update(PropolisMigrateResponse { - migration_in: None, - migration_out: Some(PropolisMigrationStatus { - id: migration_id, - state: propolis_client::types::MigrationState::Finish, - }), - }); + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Sync, + )); + match result { + SimulatedMigrationResult::Success => { + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Finish, + )); self.queue_graceful_stop(); } - MigrationRole::Target => { - self.queue_migration_update(PropolisMigrateResponse { - migration_in: Some(PropolisMigrationStatus { - id: 
migration_id, - state: propolis_client::types::MigrationState::Sync, - }), - migration_out: None, - }); - self.queue_migration_update(PropolisMigrateResponse { - migration_in: Some(PropolisMigrationStatus { - id: migration_id, - state: propolis_client::types::MigrationState::Finish, - }), - migration_out: None, - }); + SimulatedMigrationResult::Failure => { + todo!("finish this part when we actuall need it...") + } + } + } + + /// Queue a simulated migration in. + fn queue_migration_in( + &mut self, + migration_id: Uuid, + result: SimulatedMigrationResult, + ) { + let migration_update = |state| PropolisMigrateResponse { + migration_in: Some(PropolisMigrationStatus { + id: migration_id, + state, + }), + migration_out: None, + }; + // Propolis transitions to the Migrating state once before + // actually starting migration. + self.queue_propolis_state(PropolisInstanceState::Migrating); + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Sync, + )); + match result { + SimulatedMigrationResult::Success => { + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Finish, + )); self.queue_propolis_state(PropolisInstanceState::Running) } + SimulatedMigrationResult::Failure => { + todo!("finish this part when we actually need it...") + } } } @@ -179,7 +192,20 @@ impl SimInstanceInner { ))); } - self.queue_successful_migration(MigrationRole::Target) + let migration_id = self + .state + .migration_in() + .ok_or_else(|| { + Error::invalid_request( + "can't request migration in for a vmm that wasn't \ + created with a migration ID", + ) + })? 
+ .migration_id; + self.queue_migration_in( + migration_id, + SimulatedMigrationResult::Success, + ); } InstanceStateRequested::Running => { match self.next_resting_state() { @@ -279,7 +305,6 @@ impl SimInstanceInner { } self.state.apply_propolis_observation(&ObservedPropolisState::new( - self.state.instance(), &self.last_response, )) } else { @@ -370,46 +395,6 @@ impl SimInstanceInner { self.destroyed = true; self.state.sled_instance_state() } - - /// Stores a set of migration IDs in the instance's runtime state. - fn put_migration_ids( - &mut self, - old_runtime: &InstanceRuntimeState, - ids: &Option, - ) -> Result { - if self.state.migration_ids_already_set(old_runtime, ids) { - return Ok(self.state.sled_instance_state()); - } - - if self.state.instance().gen != old_runtime.gen { - return Err(Error::invalid_request(format!( - "wrong Propolis ID generation: expected {}, got {}", - self.state.instance().gen, - old_runtime.gen - ))); - } - - self.state.set_migration_ids(ids, Utc::now()); - - // If we set migration IDs and are the migration source, ensure that we - // will perform the correct state transitions to simulate a successful - // migration. - if ids.is_some() { - let role = self - .state - .migration() - .expect( - "we just got a `put_migration_ids` request with `Some` IDs, \ - so we should have a migration" - ) - .role; - if role == MigrationRole::Source { - self.queue_successful_migration(MigrationRole::Source) - } - } - - Ok(self.state.sled_instance_state()) - } } /// A simulation of an Instance created by the external Oxide API. 
@@ -437,13 +422,14 @@ impl SimInstance { self.inner.lock().unwrap().terminate() } - pub async fn put_migration_ids( + pub(crate) fn set_simulated_migration_source( &self, - old_runtime: &InstanceRuntimeState, - ids: &Option, - ) -> Result { - let mut inner = self.inner.lock().unwrap(); - inner.put_migration_ids(old_runtime, ids) + migration: SimulateMigrationSource, + ) { + self.inner + .lock() + .unwrap() + .queue_migration_out(migration.migration_id, migration.result); } } @@ -466,9 +452,9 @@ impl Simulatable for SimInstance { SimInstance { inner: Arc::new(Mutex::new(SimInstanceInner { state: InstanceStates::new( - current.instance_state, current.vmm_state, current.propolis_id, + current.migration_in.map(|m| m.migration_id), ), last_response: InstanceStateMonitorResponse { gen: 1, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 05339c201c..e555276d15 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -7,14 +7,14 @@ use super::collection::{PokeMode, SimCollection}; use super::config::Config; use super::disk::SimDisk; -use super::instance::SimInstance; +use super::instance::{self, SimInstance}; use super::storage::CrucibleData; use super::storage::Storage; use crate::nexus::NexusClient; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, + InstanceMetadata, InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, }; use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; @@ -30,7 +30,7 @@ use omicron_common::api::external::{ ByteCount, DiskState, Error, Generation, ResourceType, }; use omicron_common::api::internal::nexus::{ - DiskRuntimeState, SledInstanceState, + DiskRuntimeState, MigrationRuntimeState, MigrationState, SledInstanceState, }; use omicron_common::api::internal::nexus::{ 
InstanceRuntimeState, VmmRuntimeState, @@ -368,15 +368,24 @@ impl SledAgent { } } + let migration_in = instance_runtime.migration_id.map(|migration_id| { + MigrationRuntimeState { + migration_id, + state: MigrationState::Pending, + gen: Generation::new(), + time_updated: chrono::Utc::now(), + } + }); + let instance_run_time_state = self .instances .sim_ensure( &instance_id.into_untyped_uuid(), SledInstanceState { - instance_state: instance_runtime, vmm_state: vmm_runtime, propolis_id, - migration_state: None, + migration_in, + migration_out: None, }, None, ) @@ -540,6 +549,24 @@ impl SledAgent { Ok(instance.current()) } + pub async fn instance_simulate_migration_source( + &self, + instance_id: InstanceUuid, + migration: instance::SimulateMigrationSource, + ) -> Result<(), HttpError> { + let instance = self + .instances + .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .await + .map_err(|_| { + crate::sled_agent::Error::Instance( + crate::instance_manager::Error::NoSuchInstance(instance_id), + ) + })?; + instance.set_simulated_migration_source(migration); + Ok(()) + } + pub async fn set_instance_ensure_state_error(&self, error: Option) { *self.instance_ensure_state_error.lock().await = error; } @@ -563,20 +590,6 @@ impl SledAgent { Ok(()) } - pub async fn instance_put_migration_ids( - self: &Arc, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result { - let instance = self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) - .await?; - - instance.put_migration_ids(old_runtime, migration_ids).await - } - /// Idempotently ensures that the given API Disk (described by `api_disk`) /// is attached (or not) as specified. This simulates disk attach and /// detach, similar to instance boot and halt. 
@@ -601,8 +614,8 @@ impl SledAgent { self.disks.size().await } - pub async fn instance_poke(&self, id: InstanceUuid) { - self.instances.sim_poke(id.into_untyped_uuid(), PokeMode::Drain).await; + pub async fn instance_poke(&self, id: InstanceUuid, mode: PokeMode) { + self.instances.sim_poke(id.into_untyped_uuid(), mode).await; } pub async fn disk_poke(&self, id: Uuid) { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 1296cfb378..3eb3805784 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -18,9 +18,9 @@ use crate::nexus::{ }; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, OmicronZoneTypeExt, - TimeSync, VpcFirewallRule, ZoneBundleMetadata, Zpool, + InstanceMetadata, InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, OmicronZoneTypeExt, TimeSync, VpcFirewallRule, + ZoneBundleMetadata, Zpool, }; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; @@ -1037,23 +1037,6 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } - /// Idempotently ensures that the instance's runtime state contains the - /// supplied migration IDs, provided that the caller continues to meet the - /// conditions needed to change those IDs. See the doc comments for - /// [`crate::params::InstancePutMigrationIdsBody`]. - pub async fn instance_put_migration_ids( - &self, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result { - self.inner - .instances - .put_migration_ids(instance_id, old_runtime, migration_ids) - .await - .map_err(|e| Error::Instance(e)) - } - /// Idempotently ensures that an instance's OPTE/port state includes the /// specified external IP address. 
/// diff --git a/smf/chrony-setup/manifest.xml b/smf/chrony-setup/manifest.xml index f31f13a2ea..fca5d3f2e0 100644 --- a/smf/chrony-setup/manifest.xml +++ b/smf/chrony-setup/manifest.xml @@ -12,7 +12,7 @@ - + + + + + diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 396e3615b2..c502c20b1b 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -64,6 +64,7 @@ instance_watcher.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +instance_updater.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index df49476eed..30a0243122 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -64,6 +64,7 @@ instance_watcher.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +instance_updater.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. diff --git a/smf/sled-agent/non-gimlet/config-rss.toml b/smf/sled-agent/non-gimlet/config-rss.toml index 071cf496bb..90f5339e84 100644 --- a/smf/sled-agent/non-gimlet/config-rss.toml +++ b/smf/sled-agent/non-gimlet/config-rss.toml @@ -100,7 +100,7 @@ bgp = [] # You can configure multiple uplinks by repeating the following stanza [[rack_network_config.ports]] # Routes associated with this port. -routes = [{nexthop = "192.168.1.1", destination = "0.0.0.0/0"}] +routes = [{nexthop = "192.168.1.199", destination = "0.0.0.0/0"}] # Addresses associated with this port. addresses = [{address = "192.168.1.30/24"}] # Name of the uplink port. This should always be "qsfp0" when using softnpu. 
diff --git a/smf/sled-agent/non-gimlet/config.toml b/smf/sled-agent/non-gimlet/config.toml index 42068de0f6..77ca52a647 100644 --- a/smf/sled-agent/non-gimlet/config.toml +++ b/smf/sled-agent/non-gimlet/config.toml @@ -84,7 +84,7 @@ data_links = ["net0", "net1"] request_body_max_bytes = 2_147_483_648 [log] -level = "debug" +level = "info" mode = "file" path = "/dev/stdout" if_exists = "append" diff --git a/tools/permslip_staging b/tools/permslip_staging index a38bff708e..6f5f925eb0 100644 --- a/tools/permslip_staging +++ b/tools/permslip_staging @@ -1,5 +1,5 @@ c28eaa13638f55100a42916727227242ee02d18cebecb1412d6af5c8aa945b99 manifest-gimlet-v1.0.22.toml 201ff5580bb4b0b01419d7c5e580af9926103e2b6d3024e6b49cee6fab415519 manifest-oxide-rot-1-v1.0.12.toml 6d53bfbfdd6baa3fc150153a003abfac6d4b46c34f61fa7a8ec2af8af19a7d5a manifest-psc-v1.0.21.toml -d608dba3fa5a1fce3592ff3f643319787218b84706134147e5918f5bd1c0345d manifest-sidecar-v1.0.22.toml +26b6096a377edb3d7da50b1b499af104e6195bc7c7c6eb1b2751b32434d7ac9e manifest-sidecar-v1.0.23.toml c0fecaefac7674138337f3bd4ce4ce5b884053dead5ec27b575701471631ea2f manifest-bootleby-v1.3.0.toml diff --git a/wicket/src/cli/command.rs b/wicket/src/cli/command.rs index bae98130b5..899b28971a 100644 --- a/wicket/src/cli/command.rs +++ b/wicket/src/cli/command.rs @@ -10,7 +10,7 @@ use anyhow::Result; use clap::{Args, ColorChoice, Parser, Subcommand}; use super::{ - preflight::PreflightArgs, rack_setup::SetupArgs, + inventory::InventoryArgs, preflight::PreflightArgs, rack_setup::SetupArgs, rack_update::RackUpdateArgs, upload::UploadArgs, }; @@ -49,6 +49,9 @@ impl ShellApp { args.exec(log, wicketd_addr, self.global_opts).await } ShellCommand::Preflight(args) => args.exec(log, wicketd_addr).await, + ShellCommand::Inventory(args) => { + args.exec(log, wicketd_addr, output).await + } } } } @@ -100,4 +103,8 @@ enum ShellCommand { /// Run checks prior to setting up the rack. 
#[command(subcommand)] Preflight(PreflightArgs), + + /// Enumerate rack components + #[command(subcommand)] + Inventory(InventoryArgs), } diff --git a/wicket/src/cli/inventory.rs b/wicket/src/cli/inventory.rs new file mode 100644 index 0000000000..54bfa304c2 --- /dev/null +++ b/wicket/src/cli/inventory.rs @@ -0,0 +1,133 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Support for inventory checks via wicketd. + +use crate::cli::CommandOutput; +use crate::wicketd::create_wicketd_client; +use anyhow::Context; +use anyhow::Result; +use clap::{Subcommand, ValueEnum}; +use owo_colors::OwoColorize; +use sled_hardware_types::Baseboard; +use slog::Logger; +use std::fmt; +use std::net::SocketAddrV6; +use std::time::Duration; +use wicket_common::rack_setup::BootstrapSledDescription; + +const WICKETD_TIMEOUT: Duration = Duration::from_secs(5); + +#[derive(Debug, Subcommand)] +pub(crate) enum InventoryArgs { + /// List state of all bootstrap sleds, as configured with rack-setup + ConfiguredBootstrapSleds { + /// Select output format + #[clap(long, default_value_t = OutputFormat::Table)] + format: OutputFormat, + }, +} + +#[derive(Debug, ValueEnum, Clone)] +pub enum OutputFormat { + /// Print output as operator-readable table + Table, + + /// Print output as json + Json, +} + +impl fmt::Display for OutputFormat { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + OutputFormat::Table => write!(f, "table"), + OutputFormat::Json => write!(f, "json"), + } + } +} + +impl InventoryArgs { + pub(crate) async fn exec( + self, + log: Logger, + wicketd_addr: SocketAddrV6, + mut output: CommandOutput<'_>, + ) -> Result<()> { + let client = create_wicketd_client(&log, wicketd_addr, WICKETD_TIMEOUT); + + match self { + InventoryArgs::ConfiguredBootstrapSleds { format } => { + // We don't use the 
/bootstrap-sleds endpoint, because that + // gets all sleds visible on the bootstrap network. We want + // something subtly different here. + // - We want the status of only sleds we've configured wicket + // to use for setup. /bootstrap-sleds will give us sleds + // we don't want + // - We want the status even if they aren't visible on the + // bootstrap network yet. + // + // In other words, we want the sled information displayed at the + // bottom of the rack setup screen in the TUI, and we get it the + // same way it does. + let conf = client + .get_rss_config() + .await + .context("failed to get rss config")?; + + let bootstrap_sleds = &conf.insensitive.bootstrap_sleds; + match format { + OutputFormat::Json => { + let json_str = + serde_json::to_string_pretty(bootstrap_sleds) + .context("serializing sled data failed")?; + writeln!(output.stdout, "{}", json_str) + .expect("writing to stdout failed"); + } + OutputFormat::Table => { + for sled in bootstrap_sleds { + print_bootstrap_sled_data(sled, &mut output); + } + } + } + + Ok(()) + } + } + } +} + +fn print_bootstrap_sled_data( + desc: &BootstrapSledDescription, + output: &mut CommandOutput<'_>, +) { + let slot = desc.id.slot; + + let identifier = match &desc.baseboard { + Baseboard::Gimlet { identifier, .. } => identifier.clone(), + Baseboard::Pc { identifier, .. } => identifier.clone(), + Baseboard::Unknown => "unknown".to_string(), + }; + + let address = desc.bootstrap_ip; + + // Create status indicators + let status = match address { + None => format!("{}", '⚠'.red()), + Some(_) => format!("{}", '✔'.green()), + }; + + let addr_fmt = match address { + None => "(not available)".to_string(), + Some(addr) => format!("{}", addr), + }; + + // Print out this entry. We say "Cubby" rather than "Slot" here purely + // because the TUI also says "Cubby". 
+ writeln!( + output.stdout, + "{status} Cubby {:02}\t{identifier}\t{addr_fmt}", + slot + ) + .expect("writing to stdout failed"); +} diff --git a/wicket/src/cli/mod.rs b/wicket/src/cli/mod.rs index e63ef467e7..ac406823fe 100644 --- a/wicket/src/cli/mod.rs +++ b/wicket/src/cli/mod.rs @@ -11,6 +11,7 @@ //! support for that. mod command; +mod inventory; mod preflight; mod rack_setup; mod rack_update; diff --git a/wicketd/tests/integration_tests/inventory.rs b/wicketd/tests/integration_tests/inventory.rs index ea696d21c9..ed5ad22d5d 100644 --- a/wicketd/tests/integration_tests/inventory.rs +++ b/wicketd/tests/integration_tests/inventory.rs @@ -9,6 +9,10 @@ use std::time::Duration; use super::setup::WicketdTestContext; use gateway_messages::SpPort; use gateway_test_utils::setup as gateway_setup; +use sled_hardware_types::Baseboard; +use wicket::OutputKind; +use wicket_common::inventory::{SpIdentifier, SpType}; +use wicket_common::rack_setup::BootstrapSledDescription; use wicketd_client::types::{GetInventoryParams, GetInventoryResponse}; #[tokio::test] @@ -45,5 +49,62 @@ async fn test_inventory() { // 4 SPs attached to the inventory. assert_eq!(inventory.sps.len(), 4); + // Test CLI with JSON output + { + let args = + vec!["inventory", "configured-bootstrap-sleds", "--format", "json"]; + let mut stdout = Vec::new(); + let mut stderr = Vec::new(); + let output = OutputKind::Captured { + log: wicketd_testctx.log().clone(), + stdout: &mut stdout, + stderr: &mut stderr, + }; + + wicket::exec_with_args(wicketd_testctx.wicketd_addr, args, output) + .await + .expect("wicket inventory configured-bootstrap-sleds failed"); + + // stdout should contain a JSON object. + let response: Vec = + serde_json::from_slice(&stdout).expect("stdout is valid JSON"); + + // This only tests the case that we get sleds back with no current + // bootstrap IP. This does provide svalue: it check that the command + // exists, accesses data within wicket, and returns it in the schema we + // expect. 
But it does not test the case where a sled does have a + // bootstrap IP. + // + // Unfortunately, that's a difficult thing to test today. Wicket gets + // that information by enumerating the IPs on the bootstrap network and + // reaching out to the bootstrap_agent on them directly to ask them who + // they are. Our testing setup does not have a way to provide such an + // IP, or run a bootstrap_agent on an IP to respond. We should update + // this test when we do have that capabilitiy. + assert_eq!( + response, + vec![ + BootstrapSledDescription { + id: SpIdentifier { type_: SpType::Sled, slot: 0 }, + baseboard: Baseboard::Gimlet { + identifier: "SimGimlet00".to_string(), + model: "i86pc".to_string(), + revision: 0 + }, + bootstrap_ip: None + }, + BootstrapSledDescription { + id: SpIdentifier { type_: SpType::Sled, slot: 1 }, + baseboard: Baseboard::Gimlet { + identifier: "SimGimlet01".to_string(), + model: "i86pc".to_string(), + revision: 0 + }, + bootstrap_ip: None + }, + ] + ); + } + wicketd_testctx.teardown().await; } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index a332133f3f..5f34c76db9 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -41,7 +41,6 @@ crossbeam-utils = { version = "0.8.19" } crossterm = { version = "0.27.0", features = ["event-stream", "serde"] } crypto-common = { version = "0.1.6", default-features = false, features = ["getrandom", "std"] } der = { version = "0.7.9", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } -diesel = { version = "2.1.6", features = ["chrono", "i-implement-a-third-party-backend-and-opt-into-breaking-changes", "network-address", "postgres", "r2d2", "serde_json", "uuid"] } digest = { version = "0.10.7", features = ["mac", "oid", "std"] } either = { version = "1.13.0" } elliptic-curve = { version = "0.13.8", features = ["ecdh", "hazmat", "pem", "std"] } @@ -65,7 +64,6 @@ hmac = { version = "0.12.1", default-features = false, features = 
["reset"] } hyper = { version = "0.14.30", features = ["full"] } indexmap = { version = "2.3.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, features = ["std"] } -ipnetwork = { version = "0.20.0", features = ["schemars"] } itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } lalrpop-util = { version = "0.19.12" } @@ -94,7 +92,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.204", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.205", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } @@ -147,7 +145,6 @@ crossbeam-utils = { version = "0.8.19" } crossterm = { version = "0.27.0", features = ["event-stream", "serde"] } crypto-common = { version = "0.1.6", default-features = false, features = ["getrandom", "std"] } der = { version = "0.7.9", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } -diesel = { version = "2.1.6", features = ["chrono", "i-implement-a-third-party-backend-and-opt-into-breaking-changes", "network-address", "postgres", "r2d2", "serde_json", "uuid"] } digest = { version = "0.10.7", features = ["mac", "oid", "std"] } either = { version = "1.13.0" } elliptic-curve = { version = "0.13.8", features = ["ecdh", "hazmat", "pem", "std"] } @@ -171,7 +168,6 @@ hmac = { version = "0.12.1", default-features = false, features = ["reset"] } hyper = { version = "0.14.30", features = ["full"] } indexmap = { version = "2.3.0", features = ["serde"] } inout = { version = "0.1.3", default-features = false, 
features = ["std"] } -ipnetwork = { version = "0.20.0", features = ["schemars"] } itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12.1" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10.5" } lalrpop-util = { version = "0.19.12" } @@ -200,7 +196,7 @@ ring = { version = "0.17.8", features = ["std"] } schemars = { version = "0.8.21", features = ["bytes", "chrono", "uuid1"] } scopeguard = { version = "1.2.0" } semver = { version = "1.0.23", features = ["serde"] } -serde = { version = "1.0.204", features = ["alloc", "derive", "rc"] } +serde = { version = "1.0.205", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.122", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } diff --git a/zone-setup/src/bin/zone-setup.rs b/zone-setup/src/bin/zone-setup.rs index f335512d83..167adf04bf 100644 --- a/zone-setup/src/bin/zone-setup.rs +++ b/zone-setup/src/bin/zone-setup.rs @@ -104,6 +104,13 @@ struct ChronySetupArgs { /// allowed IPv6 range #[arg(short, long)] allow: Ipv6Net, + /// DNS name for the boundary NTP zone pool + #[arg( + short = 'p', + long, + value_parser = NonEmptyStringValueParser::default(), + )] + boundary_pool: String, } // The default clap parser for `serde_json::Value` is to wrap the argument in a @@ -396,6 +403,9 @@ makestep 1.0 3 leapsecmode slew maxslewrate 2708.333 +# Refresh boundary NTP servers every two minutes instead of every two weeks +refresh 120 + "; let boundary_ntp_tpl = "# @@ -447,6 +457,7 @@ maxslewrate 2708.333 boundary: is_boundary, servers, allow, + boundary_pool, } = args; let mut new_config = @@ -464,10 +475,19 @@ maxslewrate 2708.333 .expect("write to String is infallible"); } } else { + // TODO-cleanup: Remove specific boundary NTP servers after R10 is cut; + // once all racks are setting up the boundary NTP pool we can drop + // individual server lines: + // 
https://github.com/oxidecomputer/omicron/issues/6261 for s in servers { writeln!(&mut new_config, "server {s} iburst minpoll 0 maxpoll 4") .expect("write to String is infallible"); } + writeln!( + &mut new_config, + "pool {boundary_pool} iburst maxdelay 0.1 maxsources 16", + ) + .expect("write to String is infallible"); } // We read the contents from the old configuration file if it existed