From 92da0a96c963967377038a081701e59a99ffd73c Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 9 Oct 2024 18:46:46 +0000
Subject: [PATCH 01/29] Handle region snapshot replacement volume deletes

Volumes can be deleted at any time, but the tasks and sagas that perform
region snapshot replacement did not account for this. This commit adds
checks in a few places for whether a volume has been soft-deleted or
hard-deleted, and bails out of the affected region snapshot replacement
accordingly:

- if a volume that contains the region snapshot being replaced is
  soft-deleted, skip creating a region snapshot replacement step for it

- if the volume of a region snapshot replacement step is deleted after
  the step was created, transition the step directly to the
  VolumeDeleted state

- if the volume of a region snapshot replacement step is deleted during
  the step saga's invocation, skip notifying any Upstairs and let the
  saga transition the request to Complete, after which the associated
  clean up can proceed

An interesting race condition emerged during unit testing: the read-only
region allocated to replace a region snapshot would be swapped into the
snapshot volume, but was susceptible to being deleted by the user, and
therefore could not be swapped into other volumes that have that
snapshot volume as a read-only parent.

Fixing this requires an additional volume that uses the read-only
region, bumping the reference count associated with that region so that
the user cannot delete it before it has replaced every other use of the
region snapshot it was meant to replace. This additional volume lives as
long as the region snapshot replacement itself and therefore needs to be
deleted when the replacement is finished. That required a new region
snapshot replacement finish saga, which in turn required a new
"Completing" state to take the same kind of state-based lock on the
replacement request that the other sagas use.

Testing also revealed scenarios where `find_deleted_volume_regions`
would return volumes for hard-deletion prematurely. The function now
returns a struct instead of a list of tuples, and in that struct,
regions freed for deletion are distinct from volumes freed for deletion.
Volumes are now only returned for hard-deletion when all associated
read/write regions have been (or are about to be) deleted.
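To make the new lifecycle concrete, here is a minimal sketch (illustrative only, not part of the patch) of the request states that the finish saga owns. The variant names mirror `RegionSnapshotReplacementState`; the `forward` and `unwind` helpers and `main` are hypothetical and exist only to show the intended flow.

// Minimal, illustrative model of the states owned by the new region
// snapshot replacement finish saga. Variant names mirror
// `RegionSnapshotReplacementState`; the helpers are hypothetical.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    Running,
    Completing, // new in this change: "lock" held by the finish saga
    Complete,
}

// Forward transitions taken by the finish saga: Running -> Completing
// (take the state-based lock), then Completing -> Complete once the
// new region volume has been deleted.
fn forward(s: State) -> Option<State> {
    match s {
        State::Running => Some(State::Completing),
        State::Completing => Some(State::Complete),
        State::Complete => None,
    }
}

// Unwind transition: if the finish saga fails partway through, the
// request returns to Running so a later activation can retry.
fn unwind(s: State) -> Option<State> {
    match s {
        State::Completing => Some(State::Running),
        _ => None,
    }
}

fn main() {
    let mut s = State::Running;
    while let Some(next) = forward(s) {
        println!("{s:?} -> {next:?}");
        s = next;
    }
    assert_eq!(unwind(State::Completing), Some(State::Running));
}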
Fixes #6353 --- dev-tools/omdb/src/bin/omdb/db.rs | 41 +- dev-tools/omdb/src/bin/omdb/nexus.rs | 23 +- dev-tools/omdb/tests/successes.out | 8 +- .../src/region_snapshot_replacement.rs | 18 +- nexus/db-model/src/schema.rs | 1 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/mod.rs | 11 +- .../datastore/region_snapshot_replacement.rs | 212 ++- nexus/db-queries/src/db/datastore/volume.rs | 113 +- nexus/src/app/background/init.rs | 2 +- .../region_snapshot_replacement_finish.rs | 63 +- .../region_snapshot_replacement_start.rs | 4 +- .../tasks/region_snapshot_replacement_step.rs | 107 +- nexus/src/app/sagas/mod.rs | 4 +- .../region_snapshot_replacement_finish.rs | 211 +++ .../region_snapshot_replacement_start.rs | 111 +- .../sagas/region_snapshot_replacement_step.rs | 14 + nexus/src/app/sagas/volume_delete.rs | 42 +- nexus/test-utils/src/background.rs | 2 +- .../crucible_replacements.rs | 1585 ++++++++++++++--- .../integration_tests/volume_management.rs | 7 +- nexus/types/src/internal_api/background.rs | 4 +- .../up01.sql | 1 + .../up02.sql | 1 + schema/crdb/dbinit.sql | 9 +- 25 files changed, 2234 insertions(+), 363 deletions(-) create mode 100644 nexus/src/app/sagas/region_snapshot_replacement_finish.rs create mode 100644 schema/crdb/add-completing-and-new-region-volume/up01.sql create mode 100644 schema/crdb/add-completing-and-new-region-volume/up02.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 3ceba3fc25..800946a571 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2558,39 +2558,48 @@ async fn cmd_db_region_used_by( async fn cmd_db_region_find_deleted( datastore: &DataStore, ) -> Result<(), anyhow::Error> { - let datasets_regions_volumes = + let freed_crucible_resources = datastore.find_deleted_volume_regions().await?; #[derive(Tabled)] - struct Row { + struct RegionRow { dataset_id: Uuid, region_id: Uuid, - volume_id: String, } - let rows: Vec = datasets_regions_volumes - .into_iter() + #[derive(Tabled)] + struct VolumeRow { + volume_id: Uuid, + } + + let region_rows: Vec = freed_crucible_resources + .datasets_and_regions + .iter() .map(|row| { - let (dataset, region, volume) = row; + let (dataset, region) = row; - Row { - dataset_id: dataset.id(), - region_id: region.id(), - volume_id: if let Some(volume) = volume { - volume.id().to_string() - } else { - String::from("") - }, - } + RegionRow { dataset_id: dataset.id(), region_id: region.id() } }) .collect(); - let table = tabled::Table::new(rows) + let table = tabled::Table::new(region_rows) .with(tabled::settings::Style::psql()) .to_string(); println!("{}", table); + let volume_rows: Vec = freed_crucible_resources + .volumes + .iter() + .map(|volume_id| VolumeRow { volume_id: *volume_id }) + .collect(); + + let volume_table = tabled::Table::new(volume_rows) + .with(tabled::settings::Style::psql()) + .to_string(); + + println!("{}", volume_table); + Ok(()) } diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index cb05bb575b..3e8d70818e 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1655,6 +1655,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total requests completed ok: {}", + status.requests_completed_ok.len(), + ); + for line in &status.requests_completed_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in 
&status.errors { println!(" > {line}"); @@ -1720,6 +1728,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total steps set to volume_deleted ok: {}", + status.step_set_volume_deleted_ok.len(), + ); + for line in &status.step_set_volume_deleted_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); @@ -1831,10 +1847,11 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { Ok(status) => { println!( - " total records transitioned to done: {}", - status.records_set_to_done.len(), + " region snapshot replacement finish sagas started \ + ok: {}", + status.finish_invoked_ok.len() ); - for line in &status.records_set_to_done { + for line in &status.finish_invoked_ok { println!(" > {line}"); } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 6974c0b36b..4890f0052e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -627,7 +627,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -645,6 +645,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -655,6 +656,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" @@ -1070,7 +1072,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -1088,6 +1090,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -1098,6 +1101,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" diff --git a/nexus/db-model/src/region_snapshot_replacement.rs b/nexus/db-model/src/region_snapshot_replacement.rs index 183c9034c0..3267038663 100644 --- a/nexus/db-model/src/region_snapshot_replacement.rs +++ b/nexus/db-model/src/region_snapshot_replacement.rs @@ -25,6 +25,7 @@ impl_enum_type!( ReplacementDone => b"replacement_done" DeletingOldVolume => b"deleting_old_volume" Running => b"running" + Completing => b"completing" Complete => b"complete" ); @@ -43,6 +44,7 @@ impl std::str::FromStr for RegionSnapshotReplacementState { Ok(RegionSnapshotReplacementState::DeletingOldVolume) } "running" => Ok(RegionSnapshotReplacementState::Running), + "completing" => Ok(RegionSnapshotReplacementState::Completing), "complete" => Ok(RegionSnapshotReplacementState::Complete), _ => 
Err(format!("unrecognized value {} for enum", s)), } @@ -77,8 +79,13 @@ impl std::str::FromStr for RegionSnapshotReplacementState { /// v --- /// --- /// Running | -/// | set in region snapshot replacement -/// | | finish background task +/// | +/// | | +/// v | +/// | responsibility of region snapshot +/// Completing | replacement finish saga +/// | +/// | | /// v | /// | /// Complete --- @@ -130,6 +137,12 @@ pub struct RegionSnapshotReplacement { pub replacement_state: RegionSnapshotReplacementState, pub operating_saga_id: Option, + + /// In order for the newly created region not to be deleted inadvertently, + /// an additional reference count bump is required. This volume should live + /// as long as this request so that all necessary replacements can be + /// completed. + pub new_region_volume_id: Option, } impl RegionSnapshotReplacement { @@ -154,6 +167,7 @@ impl RegionSnapshotReplacement { old_snapshot_id, old_snapshot_volume_id: None, new_region_id: None, + new_region_volume_id: None, replacement_state: RegionSnapshotReplacementState::Requested, operating_saga_id: None, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 6ca7171793..4eb6539aef 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1929,6 +1929,7 @@ table! { new_region_id -> Nullable, replacement_state -> crate::RegionSnapshotReplacementStateEnum, operating_saga_id -> Nullable, + new_region_volume_id -> Nullable, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 70450a7776..2e246b30d0 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(114, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(115, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(115, "add-completing-and-new-region-volume"), KnownVersion::new(114, "crucible-ref-count-records"), KnownVersion::new(113, "add-tx-eq"), KnownVersion::new(112, "blueprint-dataset"), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 059d43b8c7..29a1cbac07 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -125,16 +125,7 @@ pub use sled::TransitionError; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; pub use vmm::VmmStateUpdateResult; -pub use volume::read_only_resources_associated_with_volume; -pub use volume::CrucibleResources; -pub use volume::CrucibleTargets; -pub use volume::ExistingTarget; -pub use volume::ReplacementTarget; -pub use volume::VolumeCheckoutReason; -pub use volume::VolumeReplaceResult; -pub use volume::VolumeReplacementParams; -pub use volume::VolumeToDelete; -pub use volume::VolumeWithTarget; +pub use volume::*; // Number of unique datasets required to back a region. // TODO: This should likely turn into a configuration option. 
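The doc comment on `new_region_volume_id` above explains why the extra bookkeeping volume exists. The toy model below is an editorial illustration only (hypothetical names, not code from this patch): the new read-only region stays referenced by the bookkeeping volume until the finish saga removes it, so a user deleting the snapshot volume early cannot drop the region's reference count to zero.

use std::collections::HashMap;

// Toy reference-count model, for illustration only. Keys are read-only
// region ids; values are how many volumes currently reference each one.
#[derive(Default)]
struct RefCounts(HashMap<&'static str, usize>);

impl RefCounts {
    fn volume_created_referencing(&mut self, region: &'static str) {
        *self.0.entry(region).or_insert(0) += 1;
    }

    // Returns true when the region is no longer referenced and may be
    // cleaned up.
    fn volume_deleted_referencing(&mut self, region: &'static str) -> bool {
        let count = self.0.entry(region).or_insert(0);
        *count = count.saturating_sub(1);
        *count == 0
    }
}

fn main() {
    let mut refs = RefCounts::default();

    // The start saga swaps the new read-only region into the snapshot
    // volume *and* creates the extra "new region volume" referencing it.
    refs.volume_created_referencing("new-read-only-region");
    refs.volume_created_referencing("new-read-only-region");

    // A user deletes the snapshot volume early: the region survives,
    // because the bookkeeping volume still references it.
    assert!(!refs.volume_deleted_referencing("new-read-only-region"));

    // The finish saga deletes the bookkeeping volume once every other
    // use of the old region snapshot has been replaced; only then does
    // the region become eligible for cleanup.
    assert!(refs.volume_deleted_referencing("new-read-only-region"));
}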
diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 6498ef3855..546f66be82 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -328,6 +328,7 @@ impl DataStore { region_snapshot_replacement_id: Uuid, operating_saga_id: Uuid, new_region_id: Uuid, + new_region_volume_id: Uuid, old_snapshot_volume_id: Uuid, ) -> Result<(), Error> { use db::schema::region_snapshot_replacement::dsl; @@ -343,6 +344,7 @@ impl DataStore { .eq(RegionSnapshotReplacementState::ReplacementDone), dsl::old_snapshot_volume_id.eq(Some(old_snapshot_volume_id)), dsl::new_region_id.eq(Some(new_region_id)), + dsl::new_region_volume_id.eq(Some(new_region_volume_id)), dsl::operating_saga_id.eq(Option::::None), )) .check_if_exists::( @@ -361,6 +363,8 @@ impl DataStore { && record.replacement_state == RegionSnapshotReplacementState::ReplacementDone && record.new_region_id == Some(new_region_id) + && record.new_region_volume_id + == Some(new_region_volume_id) && record.old_snapshot_volume_id == Some(old_snapshot_volume_id) { @@ -543,15 +547,122 @@ impl DataStore { } } - /// Transition a RegionSnapshotReplacement record from Running to Complete. - /// Also removes the `volume_repair` record that is taking a "lock" on the - /// Volume. Note this doesn't occur from a saga context, and therefore 1) - /// doesn't accept an operating saga id parameter, and 2) checks that - /// operating_saga_id is null for the corresponding record. + /// Transition a RegionSnapshotReplacement record from Running to + /// Completing, setting a unique id at the same time. + pub async fn set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + ) + .filter(dsl::operating_saga_id.is_null()) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + dsl::operating_saga_id.eq(operating_saga_id), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == Some(operating_saga_id) + && record.replacement_state + == RegionSnapshotReplacementState::Completing + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionReplacement record from Completing to Running, + /// clearing the operating saga id. 
+ pub async fn undo_set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == None + && record.replacement_state + == RegionSnapshotReplacementState::Running + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionSnapshotReplacement record from Completing to + /// Complete. Also removes the `volume_repair` record that is taking a + /// "lock" on the Volume. pub async fn set_region_snapshot_replacement_complete( &self, opctx: &OpContext, region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, ) -> Result<(), Error> { type TxnError = TransactionError; @@ -575,11 +686,14 @@ impl DataStore { .filter(dsl::id.eq(region_snapshot_replacement_id)) .filter( dsl::replacement_state - .eq(RegionSnapshotReplacementState::Running), + .eq(RegionSnapshotReplacementState::Completing), ) - .filter(dsl::operating_saga_id.is_null()) - .set((dsl::replacement_state - .eq(RegionSnapshotReplacementState::Complete),)) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete), + dsl::operating_saga_id.eq(Option::::None), + )) .check_if_exists::( region_snapshot_replacement_id, ) @@ -1053,6 +1167,86 @@ impl DataStore { Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), } } + + /// Transition from Requested to VolumeDeleted, and remove the associated + /// `volume_repair` record. This occurs when the associated snapshot's + /// volume is deleted. Note this doesn't occur from a saga context, and + /// therefore 1) doesn't accept an operating saga id parameter, and 2) + /// checks that operating_saga_id is null for the corresponding record. + pub async fn set_region_snapshot_replacement_step_volume_deleted_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_step: RegionSnapshotReplacementStep, + ) -> Result<(), Error> { + type TxnError = TransactionError; + + self.pool_connection_authorized(opctx) + .await? 
+ .transaction_async(|conn| async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_step.id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement_step::dsl; + let result = + diesel::update(dsl::region_snapshot_replacement_step) + .filter(dsl::id.eq(region_snapshot_replacement_step.id)) + .filter(dsl::operating_saga_id.is_null()) + .filter(dsl::old_snapshot_volume_id.is_null()) + .filter( + dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::Requested, + ), + ) + .set(dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::VolumeDeleted, + )) + .check_if_exists::( + region_snapshot_replacement_step.id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementStepState::VolumeDeleted + { + Ok(()) + } else { + Err(TxnError::CustomError(Error::conflict( + format!( + "region snapshot replacement step {} set \ + to {:?} (operating saga id {:?})", + region_snapshot_replacement_step.id, + record.replacement_state, + record.operating_saga_id, + ), + ))) + } + } + } + }) + .await + .map_err(|e| match e { + TxnError::CustomError(error) => error, + + TxnError::Database(error) => { + public_error_from_diesel(error, ErrorHandler::Server) + } + }) + } } #[cfg(test)] diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 4cd83fff3f..7a4822acbd 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -57,6 +57,7 @@ use serde::Deserialize; use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; +use std::collections::HashSet; use std::collections::VecDeque; use std::net::AddrParseError; use std::net::SocketAddr; @@ -177,6 +178,23 @@ enum ReplaceSnapshotError { MultipleResourceUsageRecords(String), } +/// Crucible resources freed by previous volume deletes +#[derive(Debug, Serialize, Deserialize)] +pub struct FreedCrucibleResources { + /// Regions that previously could not be deleted (often due to region + /// snaphots) that were freed by a volume delete + pub datasets_and_regions: Vec<(Dataset, Region)>, + + /// Previously soft-deleted volumes that can now be hard-deleted + pub volumes: Vec, +} + +impl FreedCrucibleResources { + pub fn is_empty(&self) -> bool { + self.datasets_and_regions.is_empty() && self.volumes.is_empty() + } +} + impl DataStore { async fn volume_create_in_txn( conn: &async_bb8_diesel::Connection, @@ -1107,12 +1125,12 @@ impl DataStore { .await } - /// Find regions for deleted volumes that do not have associated region - /// snapshots and are not being used by any other non-deleted volumes, and - /// return them for garbage collection + /// Find read/write regions for deleted volumes that do not have associated + /// region snapshots and are not being used by any other non-deleted + /// volumes, and return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option)> { + ) -> LookupResult { let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("find_deleted_volume_regions") .transaction(&conn, |conn| async move { @@ -1124,8 +1142,7 @@ impl DataStore { async fn 
find_deleted_volume_regions_in_txn( conn: &async_bb8_diesel::Connection, - ) -> Result)>, diesel::result::Error> - { + ) -> Result { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; @@ -1171,6 +1188,9 @@ impl DataStore { let mut deleted_regions = Vec::with_capacity(unfiltered_deleted_regions.len()); + let mut volume_set: HashSet = + HashSet::with_capacity(unfiltered_deleted_regions.len()); + for (dataset, region, region_snapshot, volume) in unfiltered_deleted_regions { @@ -1204,10 +1224,61 @@ impl DataStore { continue; } + if let Some(volume) = &volume { + volume_set.insert(volume.id()); + } + deleted_regions.push((dataset, region, volume)); } - Ok(deleted_regions) + let regions_for_deletion: HashSet = + deleted_regions.iter().map(|(_, region, _)| region.id()).collect(); + + let mut volumes = Vec::with_capacity(deleted_regions.len()); + + for volume_id in volume_set { + // Do not return a volume hard-deletion if there are still lingering + // read/write regions, unless all those lingering read/write regions + // will be deleted from the result of returning from this function. + let allocated_rw_regions: HashSet = + Self::get_allocated_regions_query(volume_id) + .get_results_async::<(Dataset, Region)>(conn) + .await? + .into_iter() + .filter_map(|(_, region)| { + if !region.read_only() { + Some(region.id()) + } else { + None + } + }) + .collect(); + + if allocated_rw_regions.is_subset(®ions_for_deletion) { + // If all the allocated rw regions for this volume are in the + // set of regions being returned for deletion, then we can + // hard-delete this volume. Read-only region accounting should + // have already been updated by soft-deleting this volume. + // + // Note: we'll be in this branch if allocated_rw_regions is + // empty. I believe the only time we'll hit this empty case is + // when the volume is fully populated with read-only resources + // (read-only regions and region snapshots). + volumes.push(volume_id); + } else { + // Not all r/w regions allocated to this volume are being + // deleted here, so we can't hard-delete the volume yet. 
+ } + } + + Ok(FreedCrucibleResources { + datasets_and_regions: deleted_regions + .into_iter() + .map(|(d, r, _)| (d, r)) + .collect(), + + volumes, + }) } pub async fn read_only_resources_associated_with_volume( @@ -3668,7 +3739,15 @@ impl DataStore { let mut paginator = Paginator::new(SQL_BATCH_SIZE); let conn = self.pool_connection_authorized(opctx).await?; - let needle = address.to_string(); + let needle = match address { + SocketAddr::V4(_) => { + return Err(Error::internal_error(&format!( + "find_volumes_referencing_socket_addr not ipv6: {address}" + ))); + } + + SocketAddr::V6(addr) => addr, + }; while let Some(p) = paginator.next() { use db::schema::volume::dsl; @@ -3685,7 +3764,23 @@ impl DataStore { paginator = p.found_batch(&haystack, &|r| r.id()); for volume in haystack { - if volume.data().contains(&needle) { + let vcr: VolumeConstructionRequest = + match serde_json::from_str(&volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(Error::internal_error(&format!( + "cannot deserialize volume data for {}: {e}", + volume.id(), + ))); + } + }; + + let rw_reference = region_in_vcr(&vcr, &needle) + .map_err(|e| Error::internal_error(&e.to_string()))?; + let ro_reference = read_only_target_in_vcr(&vcr, &needle) + .map_err(|e| Error::internal_error(&e.to_string()))?; + + if rw_reference || ro_reference { volumes.push(volume); } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index ad39777054..f107f2af71 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -825,7 +825,7 @@ impl BackgroundTasksInitializer { done", period: config.region_snapshot_replacement_finish.period_secs, task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( - datastore, + datastore, sagas, )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index caa2fa7bed..51d3df0862 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -8,9 +8,15 @@ //! Once all related region snapshot replacement steps are done, the region //! snapshot replacement can be completed. 
+use crate::app::authn; use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas; +use crate::app::sagas::region_snapshot_replacement_finish::*; +use crate::app::sagas::NexusSaga; use futures::future::BoxFuture; use futures::FutureExt; +use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus; @@ -19,11 +25,31 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, + sagas: Arc, } impl RegionSnapshotReplacementFinishDetector { - pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore } + pub fn new(datastore: Arc, sagas: Arc) -> Self { + RegionSnapshotReplacementFinishDetector { datastore, sagas } + } + + async fn send_finish_request( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacement, + ) -> Result<(), omicron_common::api::external::Error> { + let params = sagas::region_snapshot_replacement_finish::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + request, + }; + + let saga_dag = SagaRegionSnapshotReplacementFinish::prepare(¶ms)?; + + // We only care that the saga was started, and don't wish to wait for it + // to complete, so use `StartSaga::saga_start`, rather than `saga_run`. + self.sagas.saga_start(saga_dag).await?; + + Ok(()) } async fn transition_requests_to_done( @@ -120,21 +146,23 @@ impl RegionSnapshotReplacementFinishDetector { } }; - // Transition region snapshot replacement to Complete - match self - .datastore - .set_region_snapshot_replacement_complete(opctx, request.id) - .await - { + let request_id = request.id; + + match self.send_finish_request(opctx, request).await { Ok(()) => { - let s = format!("set request {} to done", request.id); + let s = format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" + ); + info!(&log, "{s}"); - status.records_set_to_done.push(s); + status.finish_invoked_ok.push(s); } Err(e) => { let s = format!( - "marking snapshot replacement as done failed: {e}" + "invoking region snapshot replacement finish for \ + {request_id} failed: {e}", ); error!(&log, "{s}"); status.errors.push(s); @@ -185,8 +213,10 @@ mod test { datastore.clone(), ); - let mut task = - RegionSnapshotReplacementFinishDetector::new(datastore.clone()); + let mut task = RegionSnapshotReplacementFinishDetector::new( + datastore.clone(), + nexus.sagas.clone(), + ); // Noop test let result: RegionSnapshotReplacementFinishStatus = @@ -231,6 +261,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -239,6 +270,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -334,8 +366,9 @@ mod test { assert_eq!( result, RegionSnapshotReplacementFinishStatus { - records_set_to_done: vec![format!( - "set request {request_id} to done" + finish_invoked_ok: vec![format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" )], errors: vec![], }, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index bc739ecf27..f568535086 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -23,6 
+23,7 @@ use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -40,7 +41,7 @@ impl RegionSnapshotReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionSnapshotReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_start::Params { serialized_authn, request, @@ -338,6 +339,7 @@ mod test { "region snapshot replacement start invoked ok for \ {request_id}" )], + requests_completed_ok: vec![], errors: vec![], }, ); diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index 29878364e6..1a3f218e3f 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -32,7 +32,7 @@ use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::RegionSnapshotReplacementStep; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::datastore::region_snapshot_replacement; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; @@ -315,6 +315,21 @@ impl RegionSnapshotReplacementFindAffected { // functions execute), an indefinite amount of work would be // created, continually "moving" the snapshot_addr from // temporary volume to temporary volume. + // + // If the volume was soft deleted, then skip making a step for + // it. + + if volume.time_deleted.is_some() { + info!( + log, + "volume was soft-deleted, skipping creating a step for \ + it"; + "request id" => ?request.id, + "volume id" => ?volume.id(), + ); + + continue; + } match self .datastore @@ -326,7 +341,7 @@ impl RegionSnapshotReplacementFindAffected { .await { Ok(insertion_result) => match insertion_result { - region_snapshot_replacement::InsertStepResult::Inserted { step_id } => { + InsertStepResult::Inserted { step_id } => { let s = format!("created {step_id}"); info!( log, @@ -337,7 +352,7 @@ impl RegionSnapshotReplacementFindAffected { status.step_records_created_ok.push(s); } - region_snapshot_replacement::InsertStepResult::AlreadyHandled { .. } => { + InsertStepResult::AlreadyHandled { .. } => { info!( log, "step already exists for volume id"; @@ -345,7 +360,7 @@ impl RegionSnapshotReplacementFindAffected { "volume id" => ?volume.id(), ); } - } + }, Err(e) => { let s = format!("error creating step request: {e}"); @@ -392,13 +407,79 @@ impl RegionSnapshotReplacementFindAffected { }; for request in step_requests { - let request_id = request.id; + let request_step_id = request.id; + + // Check if the volume was deleted _after_ the replacement step was + // created. 
+ + let volume_deleted = + match self.datastore.volume_deleted(request.volume_id).await { + Ok(volume_deleted) => volume_deleted, + + Err(e) => { + let s = format!( + "error checking if volume id {} was \ + deleted: {e}", + request.volume_id, + ); + error!(&log, "{s}"); + + status.errors.push(s); + continue; + } + }; + + if volume_deleted { + // Volume was soft or hard deleted, so proceed with clean up, + // which if this is in state Requested there won't be any + // additional associated state, so transition the record to + // Completed. + + info!( + &log, + "request {} step {} volume {} was soft or hard deleted!", + request.request_id, + request_step_id, + request.volume_id, + ); + + let result = self + .datastore + .set_region_snapshot_replacement_step_volume_deleted_from_requested( + opctx, request, + ) + .await; + + match result { + Ok(()) => { + let s = format!( + "request step {request_step_id} transitioned from \ + requested to volume_deleted" + ); + + info!(&log, "{s}"); + status.step_set_volume_deleted_ok.push(s); + } + + Err(e) => { + let s = format!( + "error transitioning {request_step_id} from \ + requested to complete: {e}" + ); + + error!(&log, "{s}"); + status.errors.push(s); + } + } + + continue; + } match self.send_start_request(opctx, request.clone()).await { Ok(()) => { let s = format!( "region snapshot replacement step saga invoked ok for \ - {request_id}" + {request_step_id}" ); info!( @@ -413,7 +494,7 @@ impl RegionSnapshotReplacementFindAffected { Err(e) => { let s = format!( "invoking region snapshot replacement step saga for \ - {request_id} failed: {e}" + {request_step_id} failed: {e}" ); error!( @@ -598,6 +679,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -606,6 +688,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -746,10 +829,7 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. })); let result = datastore .insert_region_snapshot_replacement_step(&opctx, { @@ -767,10 +847,7 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. 
})); // Activate the task - it should pick the complete steps up and try to // run the region snapshot replacement step garbage collect saga diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 405a972976..d8ba6abbdd 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -39,6 +39,7 @@ pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; pub mod region_replacement_start; +pub mod region_snapshot_replacement_finish; pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; @@ -173,7 +174,8 @@ fn make_action_registry() -> ActionRegistry { region_snapshot_replacement_start::SagaRegionSnapshotReplacementStart, region_snapshot_replacement_garbage_collect::SagaRegionSnapshotReplacementGarbageCollect, region_snapshot_replacement_step::SagaRegionSnapshotReplacementStep, - region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect + region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect, + region_snapshot_replacement_finish::SagaRegionSnapshotReplacementFinish ]; #[cfg(test)] diff --git a/nexus/src/app/sagas/region_snapshot_replacement_finish.rs b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs new file mode 100644 index 0000000000..d992f753d6 --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs @@ -0,0 +1,211 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! After the change to store a "new region volume" in the region snapshot +//! replacement request, that volume requires garbage collection before the +//! region snapshot replacement transitions to Complete. It's this saga's +//! responsibility to ensure that cleanup. This saga handles the following +//! region snapshot replacement request state transitions: +//! +//! ```text +//! Running <-- +//! | +//! | | +//! v | +//! | +//! Completing -- +//! +//! | +//! v +//! +//! Complete +//! ``` +//! +//! The first thing this saga does is set itself as the "operating saga" for the +//! request, and change the state to "Completing". Then, it performs the volume +//! delete sub-saga for the new region volume. Finally, it updates the region +//! snapshot replacement request by clearing the operating saga id and changing +//! the state to "Complete". +//! +//! Any unwind will place the state back into Running. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::sagas::declare_saga_actions; +use crate::app::sagas::volume_delete; +use crate::app::{authn, db}; +use serde::Deserialize; +use serde::Serialize; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement finish saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + pub request: db::model::RegionSnapshotReplacement, +} + +// region snapshot replacement finish saga: actions + +declare_saga_actions! 
{ + region_snapshot_replacement_finish; + SET_SAGA_ID -> "unused_1" { + + rsrfs_set_saga_id + - rsrfs_set_saga_id_undo + } + UPDATE_REQUEST_RECORD -> "unused_4" { + + rsrfs_update_request_record + } +} + +// region snapshot replacement finish saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementFinish; +impl NexusSaga for SagaRegionSnapshotReplacementFinish { + const NAME: &'static str = "region-snapshot-replacement-finish"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_finish_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(Node::action( + "saga_id", + "GenerateSagaId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(set_saga_id_action()); + + if let Some(new_region_volume_id) = params.request.new_region_volume_id + { + let subsaga_params = volume_delete::Params { + serialized_authn: params.serialized_authn.clone(), + volume_id: new_region_volume_id, + }; + + let subsaga_dag = { + let subsaga_builder = steno::DagBuilder::new( + steno::SagaName::new(volume_delete::SagaVolumeDelete::NAME), + ); + volume_delete::SagaVolumeDelete::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? + }; + + builder.append(Node::constant( + "params_for_volume_delete_subsaga", + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + "params_for_volume_delete_subsaga".to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "volume_delete_subsaga_no_result", + subsaga_dag, + "params_for_volume_delete_subsaga", + )); + } + + builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement finish saga: action implementations + +async fn rsrfs_set_saga_id( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Change the request record here to an intermediate "completing" state to + // block out other sagas that will be triggered for the same request. + osagactx + .datastore() + .set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrfs_set_saga_id_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + osagactx + .datastore() + .undo_set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await?; + + Ok(()) +} + +async fn rsrfs_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Update the replacement request record to 'Complete' and clear the + // operating saga id. There is no undo step for this, it should succeed + // idempotently. 
+ datastore + .set_region_snapshot_replacement_complete( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 4e6c3e1e16..8e67b8fc85 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -105,6 +105,10 @@ declare_saga_actions! { + rsrss_new_region_ensure - rsrss_new_region_ensure_undo } + NEW_REGION_VOLUME_CREATE -> "new_region_volume" { + + rsrss_new_region_volume_create + - rsrss_new_region_volume_create_undo + } GET_OLD_SNAPSHOT_VOLUME_ID -> "old_snapshot_volume_id" { + rsrss_get_old_snapshot_volume_id } @@ -149,11 +153,18 @@ impl NexusSaga for SagaRegionSnapshotReplacementStart { ACTION_GENERATE_ID.as_ref(), )); + builder.append(Node::action( + "new_region_volume_id", + "GenerateNewRegionVolumeId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(set_saga_id_action()); builder.append(get_alloc_region_params_action()); builder.append(alloc_new_region_action()); builder.append(find_new_region_action()); builder.append(new_region_ensure_action()); + builder.append(new_region_volume_create_action()); builder.append(get_old_snapshot_volume_id_action()); builder.append(create_fake_volume_action()); builder.append(replace_snapshot_in_volume_action()); @@ -474,6 +485,94 @@ async fn rsrss_new_region_ensure_undo( Ok(()) } +async fn rsrss_new_region_volume_create( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + + let (new_dataset, ensured_region) = sagactx.lookup::<( + db::model::Dataset, + crucible_agent_client::types::Region, + )>( + "ensured_dataset_and_region", + )?; + + let Some(new_dataset_address) = new_dataset.address() else { + return Err(ActionError::action_failed(format!( + "dataset {} does not have an address!", + new_dataset.id(), + ))); + }; + + let new_region_address = SocketAddrV6::new( + *new_dataset_address.ip(), + ensured_region.port_number, + 0, + 0, + ); + + // Create a volume to inflate the reference count of the newly created + // read-only region. If this is not done it's possible that a user could + // delete the snapshot volume _after_ the new read-only region was swapped + // in, removing the last reference to it and causing garbage collection. 
+ + let volume_construction_request = VolumeConstructionRequest::Volume { + id: new_region_volume_id, + block_size: 0, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: new_region_volume_id, + target: vec![new_region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&volume_construction_request) + .map_err(|e| { + ActionError::action_failed(Error::internal_error(&e.to_string())) + })?; + + let volume = db::model::Volume::new(new_region_volume_id, volume_data); + + osagactx + .datastore() + .volume_create(volume) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrss_new_region_volume_create_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + + // Delete the volume. + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + osagactx.datastore().volume_hard_delete(new_region_volume_id).await?; + + Ok(()) +} + async fn rsrss_get_old_snapshot_volume_id( sagactx: NexusActionContext, ) -> Result { @@ -780,6 +879,9 @@ async fn rsrss_update_request_record( let old_region_volume_id = sagactx.lookup::("new_volume_id")?; + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + // Now that the region has been ensured and the construction request has // been updated, update the replacement request record to 'ReplacementDone' // and clear the operating saga id. There is no undo step for this, it @@ -790,6 +892,7 @@ async fn rsrss_update_request_record( params.request.id, saga_id, new_region_id, + new_region_volume_id, old_region_volume_id, ) .await @@ -990,8 +1093,10 @@ pub(crate) mod test { .await .unwrap(); - assert_eq!(volumes.len(), 1); - assert_eq!(volumes[0].id(), db_snapshot.volume_id); + assert!(volumes + .iter() + .map(|v| v.id()) + .any(|vid| vid == db_snapshot.volume_id)); } fn new_test_params( @@ -1154,7 +1259,7 @@ pub(crate) mod test { async fn test_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let PrepareResult { db_disk, snapshot, db_snapshot: _ } = + let PrepareResult { db_disk, snapshot, .. } = prepare_for_test(cptestctx).await; let nexus = &cptestctx.server.server_context().nexus; diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index 66d9426cdd..c5f78f7b09 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -423,6 +423,20 @@ async fn rsrss_notify_upstairs( let params = sagactx.saga_params::()?; let log = sagactx.user_data().log(); + // If the associated volume was deleted, then skip this notification step. + // It's likely there is no Upstairs to talk to, but continue with the saga + // to transition the step request to Complete, and then perform the + // associated clean up. + + let volume_replace_snapshot_result = sagactx + .lookup::("volume_replace_snapshot_result")?; + if matches!( + volume_replace_snapshot_result, + VolumeReplaceResult::ExistingVolumeDeleted + ) { + return Ok(()); + } + // Make an effort to notify a Propolis if one was booted for this volume. 
// This is best effort: if there is a failure, this saga will unwind and be // triggered again for the same request. If there is no Propolis booted for diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index 12d48cb367..f00d804433 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -27,12 +27,9 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; use crate::app::sagas::declare_saga_actions; -use nexus_db_model::Dataset; -use nexus_db_model::Region; -use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; -use nexus_types::identity::Asset; +use nexus_db_queries::db::datastore::FreedCrucibleResources; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -330,8 +327,6 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; - /// Deleting region snapshots in a previous saga node may have freed up regions /// that were deleted in the DB but couldn't be deleted by the Crucible Agent /// because a snapshot existed. Look for those here. These will be a different @@ -417,7 +412,7 @@ type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// another snapshot delete. async fn svd_find_freed_crucible_regions( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); // Find regions freed up for deletion by a previous saga node deleting the @@ -432,11 +427,7 @@ async fn svd_find_freed_crucible_regions( }, )?; - // Don't serialize the whole Volume, as the data field contains key material! - Ok(freed_datasets_regions_and_volumes - .into_iter() - .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) - .collect()) + Ok(freed_datasets_regions_and_volumes) } async fn svd_delete_freed_crucible_regions( @@ -448,9 +439,11 @@ async fn svd_delete_freed_crucible_regions( // Find regions freed up for deletion by a previous saga node deleting the // region snapshots. 
let freed_datasets_regions_and_volumes = - sagactx.lookup::("freed_crucible_regions")?; + sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { + for (dataset, region) in + &freed_datasets_regions_and_volumes.datasets_and_regions + { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -477,18 +470,17 @@ async fn svd_delete_freed_crucible_regions( e, )) })?; + } - // Remove volume DB record - if let Some(volume_id) = volume_id { - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; - } + for volume_id in &freed_datasets_regions_and_volumes.volumes { + osagactx.datastore().volume_hard_delete(*volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; } Ok(()) diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index b0474037a8..23edde8fcb 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -305,7 +305,7 @@ pub async fn run_region_snapshot_replacement_finish( assert!(status.errors.is_empty()); - status.records_set_to_done.len() + status.finish_invoked_ok.len() } /// Run all replacement related background tasks until they aren't doing diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 2f5317e249..fcee0e5607 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -5,9 +5,12 @@ //! Tests related to region and region snapshot replacement use dropshot::test_util::ClientTestContext; +use nexus_client::types::LastResult; use nexus_db_model::PhysicalDiskPolicy; use nexus_db_model::RegionReplacementState; +use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; use nexus_test_utils::background::*; @@ -15,11 +18,22 @@ use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::resource_helpers::create_default_ip_pool; use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; +use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::params; +use nexus_types::external_api::views; +use nexus_types::identity::Asset; +use nexus_types::internal_api::background::*; +use omicron_common::api::external; +use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::GenericUuid; use slog::Logger; +use std::collections::HashSet; +use std::net::SocketAddr; use std::sync::Arc; use uuid::Uuid; @@ -40,12 +54,37 @@ fn get_disk_url(disk_name: &str) -> String { format!("/v1/disks/{disk_name}?project={}", PROJECT_NAME) } +fn get_disks_url() -> String { + format!("/v1/disks?project={}", PROJECT_NAME) +} + +fn get_snapshot_url(snapshot_name: &str) -> String { + 
format!("/v1/snapshots/{snapshot_name}?project={}", PROJECT_NAME) +} + +fn get_snapshots_url() -> String { + format!("/v1/snapshots?project={}", PROJECT_NAME) +} + async fn create_project_and_pool(client: &ClientTestContext) -> Uuid { create_default_ip_pool(client).await; let project = create_project(client, PROJECT_NAME).await; project.identity.id } +async fn collection_list( + client: &ClientTestContext, + list_url: &str, +) -> Vec +where + T: Clone + serde::de::DeserializeOwned, +{ + NexusRequest::iter_collection_authn(client, list_url, "", None) + .await + .expect("failed to list") + .all_items +} + /// Assert that the first part of region replacement does not create a freed /// crucible region (that would be picked up by a volume delete saga) #[nexus_test] @@ -118,280 +157,296 @@ async fn test_region_replacement_does_not_create_freed_region( assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); } -struct RegionReplacementDeletedVolumeTest<'a> { - log: Logger, - datastore: Arc, - disk_test: DiskTest<'a>, - client: ClientTestContext, - internal_client: ClientTestContext, - replacement_request_id: Uuid, -} - -#[derive(Debug)] -struct ExpectedEndState(pub RegionReplacementState); +mod region_replacement { + use super::*; -#[derive(Debug)] -struct ExpectedIntermediateState(pub RegionReplacementState); + #[derive(Debug)] + struct ExpectedEndState(pub RegionReplacementState); -impl<'a> RegionReplacementDeletedVolumeTest<'a> { - pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { - let nexus = &cptestctx.server.server_context().nexus; + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionReplacementState); - // Create four zpools, each with one dataset. This is required for - // region and region snapshot replacement to have somewhere to move the - // data. - let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(cptestctx.first_sled()) - .with_zpool_count(4) - .build() - .await; + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + } - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - let datastore = nexus.datastore().clone(); + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move + // the data. 
+ let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + // Manually create the region replacement request for the first + // allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let replacement_request_id = datastore + .create_region_replacement_request_for_region(&opctx, ®ion) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Requested, + ); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), + replacement_request_id, + } + } - let opctx = OpContext::for_tests( - cptestctx.logctx.log.new(o!()), - datastore.clone(), - ); + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } - // Create a disk - let _project_id = create_project_and_pool(client).await; + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } - let disk = create_disk(&client, PROJECT_NAME, "disk").await; + /// Make sure: + /// + /// - all region replacement related background tasks run to completion + /// - this harness' region replacement request has transitioned to + /// Complete + /// - no Crucible resources are leaked + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. 
- // Manually create the region replacement request for the first - // allocated region of that disk + run_replacement_tasks_to_completion(&self.internal_client).await; - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() - .await - .unwrap(); + // Assert the request is in state Complete - assert_eq!(db_disk.id(), disk.identity.id); + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Complete, + ); - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); + // Assert there are no more Crucible resources - // Assert the request is in state Requested + assert!(self.disk_test.crucible_resources_deleted().await); + } - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, - replacement_request_id, + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), ) .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); - - RegionReplacementDeletedVolumeTest { - log: cptestctx.logctx.log.new(o!()), - datastore, - disk_test, - client: client.clone(), - internal_client: internal_client.clone(), - replacement_request_id, + .expect("request transitioned to expected state"); + + // Assert the request state + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + expected_end_state.0 + ); } - } - - pub fn opctx(&self) -> OpContext { - OpContext::for_tests(self.log.clone(), self.datastore.clone()) - } - - pub async fn delete_the_disk(&self) { - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(&self.client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); - } - /// Make sure: - /// - /// - all region replacement related background tasks run to completion - /// - this harness' region replacement request has transitioned to Complete - /// - no Crucible resources are leaked - pub async fn finish_test(&self) { - // Make sure that all the background tasks can run to completion. + /// Run the "region replacement" task to transition the request to + /// Running. 
+ pub async fn transition_request_to_running(&self) { + // Activate the "region replacement" background task - run_replacement_tasks_to_completion(&self.internal_client).await; + run_region_replacement(&self.internal_client).await; - // Assert the request is in state Complete + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Allocating), ) - .await - .unwrap(); + .await; + } - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); + /// Call the region replacement drive task to attach the associated volume + /// to the simulated pantry, ostensibly for reconciliation + pub async fn attach_request_volume_to_pantry(&self) { + // Run the "region replacement driver" task to attach the associated + // volume to the simulated pantry. - // Assert there are no more Crucible resources + run_region_replacement_driver(&self.internal_client).await; - assert!(self.disk_test.crucible_resources_deleted().await); - } + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. - async fn wait_for_request_state( - &self, - expected_end_state: ExpectedEndState, - expected_intermediate_state: ExpectedIntermediateState, - ) { - wait_for_condition( - || { - let datastore = self.datastore.clone(); - let opctx = self.opctx(); - let replacement_request_id = self.replacement_request_id; - - async move { - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, - replacement_request_id, - ) - .await - .unwrap(); - - let state = region_replacement.replacement_state; - - if state == expected_end_state.0 { - // The saga transitioned the request ok - Ok(()) - } else if state == expected_intermediate_state.0 { - // The saga is still running - Err(CondCheckError::<()>::NotYet) - } else { - // Any other state is not expected - panic!("unexpected state {state:?}!"); - } - } - }, - &std::time::Duration::from_millis(500), - &std::time::Duration::from_secs(60), - ) - .await - .expect("request transitioned to expected state"); - - // Assert the request state - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Driving), ) - .await - .unwrap(); - - assert_eq!(region_replacement.replacement_state, expected_end_state.0); - } - - /// Run the "region replacement" task to transition the request to Running. - pub async fn transition_request_to_running(&self) { - // Activate the "region replacement" background task - - run_region_replacement(&self.internal_client).await; - - // The activation above could only have started the associated saga, so - // wait until the request is in state Running. 
- - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Allocating), - ) - .await; - } - - /// Call the region replacement drive task to attach the associated volume - /// to the simulated pantry, ostensibly for reconciliation - pub async fn attach_request_volume_to_pantry(&self) { - // Run the "region replacement driver" task to attach the associated - // volume to the simulated pantry. + .await; - run_region_replacement_driver(&self.internal_client).await; + // Additionally, assert that the drive saga recorded that it sent + // the attachment request to the simulated pantry - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. + let most_recent_step = self + .datastore + .current_region_replacement_request_step( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap() + .unwrap(); - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + assert!(most_recent_step.pantry_address().is_some()); + } - // Additionally, assert that the drive saga recorded that it sent the - // attachment request to the simulated pantry + /// Manually activate the background attachment for the request volume + pub async fn manually_activate_attached_volume( + &self, + cptestctx: &'a ControlPlaneTestContext, + ) { + let pantry = cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .clone(); + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + pantry + .activate_background_attachment( + region_replacement.volume_id.to_string(), + ) + .await + .unwrap(); + } - let most_recent_step = self - .datastore - .current_region_replacement_request_step( - &self.opctx(), - self.replacement_request_id, - ) - .await - .unwrap() - .unwrap(); + /// Transition request to ReplacementDone via the region replacement + /// drive saga + pub async fn transition_request_to_replacement_done(&self) { + // Run the "region replacement driver" task - assert!(most_recent_step.pantry_address().is_some()); - } + run_region_replacement_driver(&self.internal_client).await; - /// Manually activate the background attachment for the request volume - pub async fn manually_activate_attached_volume( - &self, - cptestctx: &'a ControlPlaneTestContext, - ) { - let pantry = - cptestctx.sled_agent.pantry_server.as_ref().unwrap().pantry.clone(); - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, - ) - .await - .unwrap(); + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. 
- pantry - .activate_background_attachment( - region_replacement.volume_id.to_string(), + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::ReplacementDone), + ExpectedIntermediateState(RegionReplacementState::Driving), ) - .await - .unwrap(); - } - - /// Transition request to ReplacementDone via the region replacement drive - /// saga - pub async fn transition_request_to_replacement_done(&self) { - // Run the "region replacement driver" task - - run_region_replacement_driver(&self.internal_client).await; - - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. - - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::ReplacementDone), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + .await; + } } } @@ -401,7 +456,8 @@ impl<'a> RegionReplacementDeletedVolumeTest<'a> { async fn test_delete_volume_region_replacement_state_requested( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: delete the // disk, then finish the test. @@ -417,7 +473,8 @@ async fn test_delete_volume_region_replacement_state_requested( async fn test_delete_volume_region_replacement_state_running( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -437,7 +494,8 @@ async fn test_delete_volume_region_replacement_state_running( async fn test_delete_volume_region_replacement_state_running_on_pantry( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -459,7 +517,8 @@ async fn test_delete_volume_region_replacement_state_running_on_pantry( async fn test_delete_volume_region_replacement_state_replacement_done( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -481,3 +540,1045 @@ async fn test_delete_volume_region_replacement_state_replacement_done( test_harness.finish_test().await; } + +/// Assert that the problem experienced in issue 6353 is fixed +#[nexus_test] +async fn test_racing_replacements_for_soft_deleted_disk_volume( + cptestctx: &ControlPlaneTestContext, +) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create four zpools, each with one dataset. This is required for region + // and region snapshot replacement to have somewhere to move the data. 
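    // Why four pools rather than three: each disk's regions land on three
    // distinct zpools, so the fourth pool is what gives replacement a spare
    // target to move data onto (this assumes the usual three-way region
    // redundancy).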
+ let sled_id = cptestctx.first_sled(); + let mut disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(4) + .build() + .await; + + // Create a disk, then a snapshot of that disk + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); + + let snapshot: views::Snapshot = object_create( + client, + &snapshots_url, + ¶ms::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: "snapshot".parse().unwrap(), + description: String::from("a snapshot"), + }, + disk: disk.identity.name.into(), + }, + ) + .await; + + // Before deleting the disk, save the DB model + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + // Next, expunge a physical disk that contains a region snapshot (which + // means it'll have the region too) + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let (dataset, region) = &disk_allocated_regions[0]; + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id, + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Only one region snapshot should be been returned by the following call + // due to the allocation policy. + + let expunged_region_snapshots = datastore + .find_region_snapshots_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + assert_eq!(expunged_region_snapshots.len(), 1); + + for expunged_region_snapshot in expunged_region_snapshots { + assert_eq!(expunged_region_snapshot.snapshot_id, snapshot.identity.id); + } + + // Either one or two regions can be returned, depending on if the snapshot + // destination volume was allocated on to the physical disk that was + // expunged. + + let expunged_regions = datastore + .find_regions_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + match expunged_regions.len() { + 1 => { + assert_eq!(expunged_regions[0].id(), region.id()); + } + + 2 => { + assert!(expunged_regions.iter().any(|r| r.id() == region.id())); + + let (.., db_snapshot) = LookupPath::new(&opctx, datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let snapshot_allocated_datasets_and_regions = datastore + .get_allocated_regions(db_snapshot.destination_volume_id) + .await + .unwrap(); + + let snapshot_allocated_regions: Vec = + snapshot_allocated_datasets_and_regions + .into_iter() + .map(|(_, r)| r.id()) + .collect(); + + assert!(expunged_regions.iter().any(|region| { + snapshot_allocated_regions.contains(®ion.id()) + })); + } + + _ => { + panic!("unexpected number of expunged regions!"); + } + } + + // Now, race the region replacement with the region snapshot replacement: + // + // 1) region replacement will allocate a new region and swap it into the + // disk volume. + + let internal_client = &cptestctx.internal_client; + + let _ = + activate_background_task(&internal_client, "region_replacement").await; + + // After that task invocation, there should be one running region + // replacement for the disk's region. 
Filter out the replacement request for + // the snapshot destination volume if it's there. The above background task + // only starts the associated saga, so wait for it to complete. + + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + if region_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + assert_eq!(region_replacements.len(), 1); + + // 2) region snapshot replacement start will replace the region snapshot in + // the snapshot volume + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_start", + ) + .await; + + // After that, there should be one "replacement done" region snapshot + // replacement for the associated region snapshot. The above background task + // only starts the associated saga, so wait for it to complete. + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + if region_snapshot_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + assert_eq!(region_snapshot_replacements.len(), 1); + assert_eq!(region_snapshot_replacements[0].old_dataset_id, dataset.id()); + assert_eq!(region_snapshot_replacements[0].old_region_id, region.id()); + assert_eq!( + region_snapshot_replacements[0].old_snapshot_id, + snapshot.identity.id + ); + assert_eq!( + region_snapshot_replacements[0].replacement_state, + RegionSnapshotReplacementState::ReplacementDone, + ); + + assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); + + // 3) Delete the disk + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // The volume should be soft-deleted now. The region snapshot replacement + // swapped out the region snapshot from the snapshot volume to the temporary + // volume for later deletion, but has not actually deleted that temporary + // volume yet, so the count will not have gone to 0. 
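    // A sketch of the check implied by "the count" above (not something this
    // test adds): the region snapshot's reference count should still be
    // nonzero here. This assumes the `RegionSnapshot` model exposes its
    // `volume_references` counter, and it reuses the test's in-scope
    // `datastore`, `dataset`, `region`, and `snapshot` bindings.
    let region_snapshot = datastore
        .region_snapshot_get(dataset.id(), region.id(), snapshot.identity.id)
        .await
        .unwrap()
        .expect("region snapshot record should still exist");

    assert!(region_snapshot.volume_references > 0);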
+ + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + assert!(volume.unwrap().time_deleted.is_some()); + + // 4) region snapshot replacement garbage collect will delete the temporary + // volume with the stashed reference to the region snapshot, bringing the + // reference count to zero. + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_garbage_collection", + ) + .await; + + // Assert the region snapshot was deleted. + assert!(datastore + .region_snapshot_get(dataset.id(), region.id(), snapshot.identity.id) + .await + .unwrap() + .is_none()); + + // Assert that the disk's volume is still only soft-deleted, because the two + // other associated region snapshots still exist. + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + + // Check on the old region id - it should not be deleted + let maybe_region = + datastore.get_region_optional(region.id()).await.unwrap(); + + eprintln!("old_region_id: {:?}", &maybe_region); + assert!(maybe_region.is_some()); + + // But the new region id will be! + let maybe_region = datastore + .get_region_optional(region_replacements[0].new_region_id.unwrap()) + .await + .unwrap(); + + eprintln!("new region id: {:?}", &maybe_region); + assert!(maybe_region.is_none()); + + // The region_replacement drive task should invoke the drive saga now, which + // will skip over all notification steps and transition the request to + // ReplacementDone + + let last_background_task = + activate_background_task(&internal_client, "region_replacement_driver") + .await; + + assert!(match last_background_task.last { + LastResult::Completed(last_result_completed) => { + match serde_json::from_value::( + last_result_completed.details, + ) { + Err(e) => { + eprintln!("{e}"); + false + } + + Ok(v) => !v.drive_invoked_ok.is_empty(), + } + } + + _ => { + false + } + }); + + // wait for the drive saga to complete here + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let replacement_request_id = region_replacements[0].id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionReplacementState::ReplacementDone { + // The saga transitioned the request ok + Ok(()) + } else if state == RegionReplacementState::Driving { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // After the region snapshot replacement process runs to completion, there + // should be no more crucible resources left. Run the "region snapshot + // replacement step" background task until there's nothing left, then the + // "region snapshot replacement finish", then make sure there are no + // crucible resources left. 
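    // Note on the finish step mentioned above: with this patch the
    // "region_snapshot_replacement_finish" task reports the requests whose
    // finish work it kicked off (`finish_invoked_ok`) rather than records it
    // set to done, which is why the code below checks the request's end
    // state rather than the task's counters. For reference, the reworked
    // status type looks roughly like this (element types assumed to be the
    // same request-description strings used by the other replacement
    // statuses):
    //
    //     pub struct RegionSnapshotReplacementFinishStatus {
    //         pub finish_invoked_ok: Vec<String>,
    //         pub errors: Vec<String>,
    //     }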
+ + let mut count = 0; + loop { + let actions_taken = + run_region_snapshot_replacement_step(&internal_client).await; + + if actions_taken == 0 { + break; + } + + count += 1; + + if count > 20 { + assert!(false); + } + } + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_finish", + ) + .await; + + // Ensure the region snapshot replacement request went to Complete + + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + region_snapshot_replacements[0].id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Complete, + ); + + // Delete the snapshot + + let snapshot_url = get_snapshot_url("snapshot"); + NexusRequest::object_delete(client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // and now there should be no higher level resources left + + let disks_url = get_disks_url(); + assert_eq!( + collection_list::(&client, &disks_url).await.len(), + 0 + ); + + let snapshots_url = get_snapshots_url(); + assert_eq!( + collection_list::(&client, &snapshots_url).await.len(), + 0 + ); + + // Make sure that all the background tasks can run to completion. + + run_replacement_tasks_to_completion(&internal_client).await; + + // The disk volume should be deleted by the snapshot delete: wait until this + // happens + + wait_for_condition( + || { + let datastore = datastore.clone(); + let volume_id = db_disk.volume_id; + + async move { + let volume = datastore.volume_get(volume_id).await.unwrap(); + if volume.is_none() { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("disk volume deleted"); + + // There should be no more crucible resources left. Don't just check for + // `crucible_resources_deleted` here! We have set one of the physical disk + // policies to expunged, so Nexus will not attempt to clean up any resources + // on that physical disk. + + disk_test.remove_zpool(db_zpool.id()).await; + + // Now, assert that all crucible resources are cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} + +mod region_snapshot_replacement { + use super::*; + + #[derive(Debug)] + struct ExpectedEndState(pub RegionSnapshotReplacementState); + + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionSnapshotReplacementState); + + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + snapshot_socket_addr: SocketAddr, + } + + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move + // the data. 
+ let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk, a snapshot of that disk, and a disk from that + // snapshot + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot") + .await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Manually create the region snapshot replacement request for the + // first allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let region_snapshot = datastore + .region_snapshot_get( + region.dataset_id(), + region.id(), + snapshot.identity.id, + ) + .await + .expect("found region snapshot without error") + .unwrap(); + + let replacement_request_id = datastore + .create_region_snapshot_replacement_request( + &opctx, + ®ion_snapshot, + ) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Requested, + ); + + // Assert two volumes reference the snapshot addr + + let snapshot_socket_addr = + region_snapshot.snapshot_addr.parse().unwrap(); + + let volumes = datastore + .find_volumes_referencing_socket_addr( + &opctx, + snapshot_socket_addr, + ) + .await + .unwrap(); + + assert_eq!(volumes.len(), 2); + + // Validate that they are snapshot and disk from snapshot + + let volumes_set: HashSet = + volumes.into_iter().map(|v| v.id()).collect(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + assert!(volumes_set.contains(&db_snapshot.volume_id)); + assert!(volumes_set.contains(&db_disk_from_snapshot.volume_id)); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), + replacement_request_id, + snapshot_socket_addr, + } + } + + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } + + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } + + pub async fn delete_the_snapshot(&self) { + let snapshot_url = get_snapshot_url("snapshot"); + NexusRequest::object_delete(&self.client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + 
.expect("failed to delete snapshot"); + } + + pub async fn delete_the_disk_from_snapshot(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + } + + /// Make sure: + /// + /// - all region snapshot replacement related background tasks run to + /// completion + /// - this harness' region snapshot replacement request has transitioned + /// to Complete + /// - there are no more volumes that reference the request's region + /// snapshot + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. + + run_replacement_tasks_to_completion(&self.internal_client).await; + + // Assert the request is in state Complete + + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Complete, + ); + + // Assert no volumes are referencing the snapshot address + + let volumes = self + .datastore + .find_volumes_referencing_socket_addr( + &self.opctx(), + self.snapshot_socket_addr, + ) + .await + .unwrap(); + + if !volumes.is_empty() { + eprintln!("{:?}", volumes); + } + + assert!(volumes.is_empty()); + } + + /// Assert no Crucible resources are leaked + pub async fn assert_no_crucible_resources_leaked(&self) { + assert!(self.disk_test.crucible_resources_deleted().await); + } + + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // Assert the request state + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + 
region_snapshot_replacement.replacement_state, + expected_end_state.0, + ); + } + + /// Run the "region snapshot replacement" task to transition the request + /// to ReplacementDone. + pub async fn transition_request_to_replacement_done(&self) { + // Activate the "region snapshot replacement start" background task + + run_region_snapshot_replacement_start(&self.internal_client).await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. + + self.wait_for_request_state( + ExpectedEndState( + RegionSnapshotReplacementState::ReplacementDone, + ), + ExpectedIntermediateState( + RegionSnapshotReplacementState::Allocating, + ), + ) + .await; + } + + /// Run the "region snapshot replacement garbage collection" task to + /// transition the request to Running. + pub async fn transition_request_to_running(&self) { + // Activate the "region snapshot replacement garbage collection" + // background task + + run_region_snapshot_replacement_garbage_collection( + &self.internal_client, + ) + .await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. + + self.wait_for_request_state( + ExpectedEndState(RegionSnapshotReplacementState::Running), + ExpectedIntermediateState( + RegionSnapshotReplacementState::DeletingOldVolume, + ), + ) + .await; + } + + /// Manually create a region snapshot replacement step for the disk + /// created from the snapshot + pub async fn create_manual_region_snapshot_replacement_step(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + + let disk_from_snapshot: external::Disk = + NexusRequest::object_get(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&self.opctx(), &self.datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let result = self + .datastore + .create_region_snapshot_replacement_step( + &self.opctx(), + self.replacement_request_id, + db_disk_from_snapshot.volume_id, + ) + .await + .unwrap(); + + match result { + InsertStepResult::Inserted { .. } => {} + + _ => { + assert!(false, "bad result from create_region_snapshot_replacement_step"); + } + } + } + } +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted and still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: delete + // the snapshot, then finish the test. 
+ + test_harness.delete_the_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted, and the snapshot's source disk can be deleted, +/// and the request will still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_2( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - the only thing that will remain is the disk-from-snap that was created + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "ReplacementDone" +/// can have its snapshot deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_replacement_done( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Running" +/// can have its snapshot deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_running( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement step can have its associated +/// volume deleted and still 
transition to VolumeDeleted +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_step( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - manually create a region snapshot replacement step for the disk created + // from the snapshot + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.create_manual_region_snapshot_replacement_step().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk().await; + test_harness.delete_the_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 887afff20f..2eb019334f 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -3763,12 +3763,13 @@ impl TestReadOnlyRegionReferenceUsage { // read-only regions should never be returned by find_deleted_volume_regions pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { - let deleted_volume_regions = + let freed_crucible_resources = self.datastore.find_deleted_volume_regions().await.unwrap(); - assert!(!deleted_volume_regions + assert!(!freed_crucible_resources + .datasets_and_regions .into_iter() - .any(|(_, r, _)| r.id() == self.region.id())); + .any(|(_, r)| r.id() == self.region.id())); } pub async fn create_first_volume_region_in_rop(&self) { diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index cf3d652587..bee2f56c34 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -37,6 +37,7 @@ pub struct LookupRegionPortStatus { pub struct RegionSnapshotReplacementStartStatus { pub requests_created_ok: Vec, pub start_invoked_ok: Vec, + pub requests_completed_ok: Vec, pub errors: Vec, } @@ -55,13 +56,14 @@ pub struct RegionSnapshotReplacementStepStatus { pub step_records_created_ok: Vec, pub step_garbage_collect_invoked_ok: Vec, pub step_invoked_ok: Vec, + pub step_set_volume_deleted_ok: Vec, pub errors: Vec, } /// The status of a `region_snapshot_replacement_finish` background task activation #[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] pub struct RegionSnapshotReplacementFinishStatus { - pub records_set_to_done: Vec, + pub finish_invoked_ok: Vec, pub errors: Vec, } diff --git a/schema/crdb/add-completing-and-new-region-volume/up01.sql b/schema/crdb/add-completing-and-new-region-volume/up01.sql new file mode 100644 index 0000000000..6a973eb3c3 --- /dev/null +++ b/schema/crdb/add-completing-and-new-region-volume/up01.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.region_snapshot_replacement_state ADD VALUE IF NOT EXISTS 'completing' AFTER 'complete'; diff --git a/schema/crdb/add-completing-and-new-region-volume/up02.sql b/schema/crdb/add-completing-and-new-region-volume/up02.sql new file mode 100644 index 0000000000..42c0028ff5 --- /dev/null +++ 
b/schema/crdb/add-completing-and-new-region-volume/up02.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot_replacement ADD COLUMN IF NOT EXISTS new_region_volume_id UUID; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index f689c7e9f7..cb6005b280 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4417,7 +4417,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_state AS EN 'replacement_done', 'deleting_old_volume', 'running', - 'complete' + 'complete', + 'completing' ); CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( @@ -4435,7 +4436,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( replacement_state omicron.public.region_snapshot_replacement_state NOT NULL, - operating_saga_id UUID + operating_saga_id UUID, + + new_region_volume_id UUID ); CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_by_state on omicron.public.region_snapshot_replacement (replacement_state); @@ -4684,7 +4687,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '114.0.0', NULL) + (TRUE, NOW(), NOW(), '115.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From bee3cca180223ca683f70d171d2d4acc1da03946 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 14 Nov 2024 21:36:47 +0000 Subject: [PATCH 02/29] slow CI machines may not have started the sagas after running background task --- .../crucible_replacements.rs | 73 ++++++++++++++++--- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index fcee0e5607..9acbc94bb7 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -166,6 +166,9 @@ mod region_replacement { #[derive(Debug)] struct ExpectedIntermediateState(pub RegionReplacementState); + #[derive(Debug)] + struct ExpectedStartState(pub RegionReplacementState); + pub(super) struct DeletedVolumeTest<'a> { log: Logger, datastore: Arc, @@ -298,6 +301,7 @@ mod region_replacement { &self, expected_end_state: ExpectedEndState, expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, ) { wait_for_condition( || { @@ -322,6 +326,14 @@ mod region_replacement { } else if state == expected_intermediate_state.0 { // The saga is still running Err(CondCheckError::<()>::NotYet) + // If the expected start and end state are the same, + // then it's impossible to determine when the saga + // starts and stops based on the state. 
+ } else if expected_end_state.0 != expected_start_state.0 + && state == expected_start_state.0 + { + // The saga hasn't started yet + Err(CondCheckError::<()>::NotYet) } else { // Any other state is not expected panic!("unexpected state {state:?}!"); @@ -364,6 +376,7 @@ mod region_replacement { self.wait_for_request_state( ExpectedEndState(RegionReplacementState::Running), ExpectedIntermediateState(RegionReplacementState::Allocating), + ExpectedStartState(RegionReplacementState::Requested), ) .await; } @@ -382,21 +395,49 @@ mod region_replacement { self.wait_for_request_state( ExpectedEndState(RegionReplacementState::Running), ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), ) .await; // Additionally, assert that the drive saga recorded that it sent - // the attachment request to the simulated pantry + // the attachment request to the simulated pantry. + // + // If `wait_for_request_state` has the same expected start and end + // state (as it does above), it's possible to exit the function + // having not yet started the saga yet, and this requires an + // additional `wait_for_condition` to wait for the expected recorded + // step. + + let most_recent_step = wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; - let most_recent_step = self - .datastore - .current_region_replacement_request_step( - &self.opctx(), - self.replacement_request_id, - ) - .await - .unwrap() - .unwrap(); + async move { + match datastore + .current_region_replacement_request_step( + &opctx, + replacement_request_id, + ) + .await + .unwrap() + { + Some(step) => Ok(step), + + None => { + // The saga is still running - this can + // happen when + Err(CondCheckError::<()>::NotYet) + } + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("most recent step"); assert!(most_recent_step.pantry_address().is_some()); } @@ -444,6 +485,7 @@ mod region_replacement { self.wait_for_request_state( ExpectedEndState(RegionReplacementState::ReplacementDone), ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), ) .await; } @@ -1023,6 +1065,9 @@ mod region_snapshot_replacement { #[derive(Debug)] struct ExpectedIntermediateState(pub RegionSnapshotReplacementState); + #[derive(Debug)] + struct ExpectedStartState(pub RegionSnapshotReplacementState); + pub(super) struct DeletedVolumeTest<'a> { log: Logger, datastore: Arc, @@ -1289,6 +1334,7 @@ mod region_snapshot_replacement { &self, expected_end_state: ExpectedEndState, expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, ) { wait_for_condition( || { @@ -1313,6 +1359,9 @@ mod region_snapshot_replacement { } else if state == expected_intermediate_state.0 { // The saga is still running Err(CondCheckError::<()>::NotYet) + } else if state == expected_start_state.0 { + // The saga hasn't started yet + Err(CondCheckError::<()>::NotYet) } else { // Any other state is not expected panic!("unexpected state {state:?}!"); @@ -1359,6 +1408,7 @@ mod region_snapshot_replacement { ExpectedIntermediateState( RegionSnapshotReplacementState::Allocating, ), + ExpectedStartState(RegionSnapshotReplacementState::Requested), ) .await; } @@ -1382,6 +1432,9 @@ mod region_snapshot_replacement { ExpectedIntermediateState( RegionSnapshotReplacementState::DeletingOldVolume, ), + 
ExpectedStartState( + RegionSnapshotReplacementState::ReplacementDone, + ), ) .await; } From 309742d871b648b53dcf8b97ed39f0aa8867a135 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 15 Nov 2024 22:40:27 +0000 Subject: [PATCH 03/29] THIS WAS A TREMENDOUS OVERSIGHT Do not allow a volume repair record to be created if the volume does not exist, or was hard-deleted! --- .../src/db/datastore/region_replacement.rs | 37 +++++-- nexus/db-queries/src/db/datastore/volume.rs | 2 +- .../src/db/datastore/volume_repair.rs | 104 +++++++++++++++--- 3 files changed, 115 insertions(+), 28 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index de047d6d0c..e83727ab15 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -21,6 +21,7 @@ use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use crate::db::TransactionError; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -52,24 +53,38 @@ impl DataStore { opctx: &OpContext, request: RegionReplacement, ) -> Result<(), Error> { + let err = OptionalError::new(); self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - use db::schema::region_replacement::dsl; - - Self::volume_repair_insert_query(request.volume_id, request.id) - .execute_async(&conn) + .transaction_async(|conn| { + let err = err.clone(); + async move { + use db::schema::region_replacement::dsl; + + Self::volume_repair_insert_in_txn( + &conn, + err, + request.volume_id, + request.id, + ) .await?; - diesel::insert_into(dsl::region_replacement) - .values(request) - .execute_async(&conn) - .await?; + diesel::insert_into(dsl::region_replacement) + .values(request) + .execute_async(&conn) + .await?; - Ok(()) + Ok(()) + } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_replacement_request_by_id( diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 7a4822acbd..fc37b5e4eb 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -428,7 +428,7 @@ impl DataStore { }) } - async fn volume_get_impl( + pub(super) async fn volume_get_impl( conn: &async_bb8_diesel::Connection, volume_id: Uuid, ) -> Result, diesel::result::Error> { diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 7ea88c8542..6271303c20 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -11,6 +11,8 @@ use crate::db::datastore::RunnableQuery; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::VolumeRepair; +use crate::db::DbConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::result::DatabaseErrorKind; @@ -19,14 +21,44 @@ use omicron_common::api::external::Error; use uuid::Uuid; impl DataStore { - pub(super) fn volume_repair_insert_query( + pub(super) async fn volume_repair_insert_in_txn( + conn: 
&async_bb8_diesel::Connection, + err: OptionalError, volume_id: Uuid, repair_id: Uuid, - ) -> impl RunnableQuery { + ) -> Result<(), diesel::result::Error> { + // Do not allow a volume repair record to be created if the volume does + // not exist, or was hard-deleted! + let maybe_volume = Self::volume_get_impl(conn, volume_id).await?; + + if maybe_volume.is_none() { + return Err(err.bail(Error::invalid_request(format!( + "cannot create record: volume {volume_id} does not exist" + )))); + } + use db::schema::volume_repair::dsl; - diesel::insert_into(dsl::volume_repair) + match diesel::insert_into(dsl::volume_repair) .values(VolumeRepair { volume_id, repair_id }) + .execute_async(conn) + .await + { + Ok(_) => Ok(()), + + Err(e) => match e { + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + ref error_information, + ) if error_information.constraint_name() + == Some("volume_repair_pkey") => + { + Err(err.bail(Error::conflict("volume repair lock"))) + } + + _ => Err(e), + }, + } } pub async fn volume_repair_lock( @@ -36,21 +68,25 @@ impl DataStore { repair_id: Uuid, ) -> Result<(), Error> { let conn = self.pool_connection_authorized(opctx).await?; - Self::volume_repair_insert_query(volume_id, repair_id) - .execute_async(&*conn) + let err = OptionalError::new(); + + self.transaction_retry_wrapper("volume_repair_lock") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, repair_id, + ) + .await + } + }) .await - .map(|_| ()) - .map_err(|e| match e { - DieselError::DatabaseError( - DatabaseErrorKind::UniqueViolation, - ref error_information, - ) if error_information.constraint_name() - == Some("volume_repair_pkey") => - { - Error::conflict("volume repair lock") + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - - _ => public_error_from_diesel(e, ErrorHandler::Server), }) } @@ -102,6 +138,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn volume_lock_conflict_error_returned() { @@ -113,6 +150,20 @@ mod test { let lock_2 = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore.volume_repair_lock(&opctx, volume_id, lock_1).await.unwrap(); let err = datastore @@ -125,4 +176,25 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + /// Assert that you can't take a volume repair lock if the volume does not + /// exist yet! 
+ #[tokio::test] + async fn volume_lock_should_fail_without_volume() { + let logctx = + dev::test_setup_log("volume_lock_should_fail_without_volume"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_1 = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_repair_lock(&opctx, volume_id, lock_1) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } } From c9455ecf50a21424957618716bbcb8de6c97a87d Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 15 Nov 2024 22:44:25 +0000 Subject: [PATCH 04/29] emit VolumeReplaceResult as saga node data serialize VolumeReplaceResult for a debugging breadcrumb --- nexus/src/app/sagas/region_replacement_start.rs | 6 ++---- nexus/src/app/sagas/region_snapshot_replacement_start.rs | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index a71a7498ac..e8e7881283 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -491,7 +491,7 @@ async fn srrs_get_old_region_address( async fn srrs_replace_region_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; @@ -555,8 +555,6 @@ async fn srrs_replace_region_in_volume( .await .map_err(ActionError::action_failed)?; - debug!(log, "replacement returned {:?}", volume_replace_region_result); - match volume_replace_region_result { VolumeReplaceResult::AlreadyHappened | VolumeReplaceResult::Done => { // The replacement was done either by this run of this saga node, or @@ -565,7 +563,7 @@ async fn srrs_replace_region_in_volume( // with the rest of the saga (to properly clean up allocated // resources). - Ok(()) + Ok(volume_replace_region_result) } VolumeReplaceResult::ExistingVolumeDeleted => { diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 8e67b8fc85..96b904f394 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -759,7 +759,7 @@ async fn get_replace_params( async fn rsrss_replace_snapshot_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); @@ -793,7 +793,7 @@ async fn rsrss_replace_snapshot_in_volume( // if the transaction occurred on the non-deleted volume so proceed // with the rest of the saga. - Ok(()) + Ok(volume_replace_snapshot_result) } VolumeReplaceResult::ExistingVolumeDeleted => { @@ -805,7 +805,7 @@ async fn rsrss_replace_snapshot_in_volume( // deleted. If this saga unwound here, that would violate the // property of idempotency. 
- Ok(()) + Ok(volume_replace_snapshot_result) } } } From 4630e03159e3bf5afffa12bc6b1f1d7a8e1a1b9d Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 15 Nov 2024 22:59:39 +0000 Subject: [PATCH 05/29] fix a bunch of tests that were locking non-existent volumes --- .../src/db/datastore/region_replacement.rs | 43 +++++++++++++++++++ .../datastore/region_snapshot_replacement.rs | 15 +++++++ 2 files changed, 58 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index e83727ab15..ace8aca144 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -912,6 +912,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -923,6 +924,20 @@ mod test { let region_2_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionReplacement::new(region_1_id, volume_id); let request_2 = RegionReplacement::new(region_2_id, volume_id); @@ -954,6 +969,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -1046,6 +1075,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::ReplacementDone; diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 546f66be82..c29dee047a 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -1256,6 +1256,7 @@ mod test { use crate::db::model::RegionReplacement; use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -1731,6 +1732,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); datastore From 4c729e658d69eb5700a36a8e5cd1dc145ccfb7d6 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 15 Nov 2024 
23:12:27 +0000 Subject: [PATCH 06/29] fix a bunch more tests that were locking non-existent volumes --- .../background/tasks/region_replacement.rs | 15 +++++ .../tasks/region_replacement_driver.rs | 57 +++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index f86ba8eb8f..3b854a618b 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -314,6 +314,21 @@ mod test { // Add a region replacement request for a fake region let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); let request_id = request.id; diff --git a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index 02db86eab3..d11de04f24 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -257,6 +257,7 @@ mod test { use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -287,6 +288,20 @@ mod test { let new_region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -381,6 +396,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state ReplacementDone. Set the new_region_id to the region created // above. @@ -480,6 +509,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. let request = { @@ -629,6 +672,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. 
let request = { From 1c50f374205a7a1cf5c1b36860d5e5d851741782 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 15 Nov 2024 23:25:28 +0000 Subject: [PATCH 07/29] fix another test that was locking non-existent volumes --- nexus/src/app/sagas/region_replacement_finish.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs index 8ea77f4e97..7259a1623b 100644 --- a/nexus/src/app/sagas/region_replacement_finish.rs +++ b/nexus/src/app/sagas/region_replacement_finish.rs @@ -310,6 +310,20 @@ pub(crate) mod test { operating_saga_id: None, }; + datastore + .volume_create(nexus_db_model::Volume::new( + new_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: new_volume_id, + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_replacement_request(&opctx, request.clone()) .await From 63cb56eac782cfc3d82ee1f35a6338feb9fc6697 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 18 Nov 2024 19:54:09 +0000 Subject: [PATCH 08/29] the TREMENDOUS oversight continues nothing should directly insert into volume_repair dsl anymore --- .../datastore/region_snapshot_replacement.rs | 281 +++++++++++++++--- .../region_snapshot_replacement_finish.rs | 55 +++- ...on_snapshot_replacement_garbage_collect.rs | 41 ++- .../region_snapshot_replacement_start.rs | 39 ++- .../tasks/region_snapshot_replacement_step.rs | 56 +++- ...on_snapshot_replacement_garbage_collect.rs | 18 +- ...apshot_replacement_step_garbage_collect.rs | 18 +- 7 files changed, 440 insertions(+), 68 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index c29dee047a..8129eaf212 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -17,12 +17,12 @@ use crate::db::model::RegionSnapshotReplacement; use crate::db::model::RegionSnapshotReplacementState; use crate::db::model::RegionSnapshotReplacementStep; use crate::db::model::RegionSnapshotReplacementStepState; -use crate::db::model::VolumeRepair; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use crate::db::TransactionError; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -85,32 +85,42 @@ impl DataStore { request: RegionSnapshotReplacement, volume_id: Uuid, ) -> Result<(), Error> { + let err = OptionalError::new(); self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - use db::schema::region_snapshot_replacement::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; - - // An associated volume repair record isn't _strictly_ needed: - // snapshot volumes should never be directly constructed, and - // therefore won't ever have an associated Upstairs that - // receives a volume replacement request. However it's being - // done in an attempt to be overly cautious. 
- - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { volume_id, repair_id: request.id }) - .execute_async(&conn) + .transaction_async(|conn| { + let err = err.clone(); + async move { + use db::schema::region_snapshot_replacement::dsl; + + // An associated volume repair record isn't _strictly_ + // needed: snapshot volumes should never be directly + // constructed, and therefore won't ever have an associated + // Upstairs that receives a volume replacement request. + // However it's being done in an attempt to be overly + // cautious. + + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, request.id, + ) .await?; - diesel::insert_into(dsl::region_snapshot_replacement) - .values(request) - .execute_async(&conn) - .await?; + diesel::insert_into(dsl::region_snapshot_replacement) + .values(request) + .execute_async(&conn) + .await?; - Ok(()) + Ok(()) + } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_request_by_id( @@ -749,6 +759,7 @@ impl DataStore { opctx: &OpContext, request: RegionSnapshotReplacementStep, ) -> Result { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper( @@ -756,10 +767,10 @@ impl DataStore { ) .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_snapshot_replacement_step::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; // Skip inserting this new record if we found another region // snapshot replacement step with this volume in the step's @@ -812,13 +823,13 @@ impl DataStore { // volume replacement: create an associated volume repair // record. - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { - volume_id: request.volume_id, - repair_id: request.id, - }) - .execute_async(&conn) - .await?; + Self::volume_repair_insert_in_txn( + &conn, + err, + request.volume_id, + request.id, + ) + .await?; let request_id = request.id; @@ -831,7 +842,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_step_by_id( @@ -1274,6 +1291,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1320,6 +1351,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1356,6 +1401,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); @@ -1388,11 +1447,25 @@ mod test { // Insert some replacement steps, and make sure counting works + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); let result = datastore .insert_region_snapshot_replacement_step(&opctx, step) @@ -1421,11 +1494,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Running; @@ -1457,11 +1544,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); // VolumeDeleted does not count as "in-progress" step.replacement_state = @@ -1511,6 +1612,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let step = RegionSnapshotReplacementStep::new(Uuid::new_v4(), volume_id); let first_request_id = step.id; @@ -1605,6 +1720,22 @@ mod test { let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = RegionSnapshotReplacement::new( Uuid::new_v4(), Uuid::new_v4(), @@ -1616,9 +1747,7 @@ mod test { datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -1631,8 +1760,24 @@ mod test { .unwrap() .is_empty()); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1642,8 +1787,24 @@ mod test { assert!(matches!(result, InsertStepResult::Inserted { .. })); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1683,6 +1844,34 @@ mod test { let volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore + .volume_create(nexus_db_model::Volume::new( + old_snapshot_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = RegionSnapshotReplacementStep::new(request_id, volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 51d3df0862..e63f0e5bde 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -197,6 +197,7 @@ mod test { use nexus_db_model::RegionSnapshotReplacementStepState; use nexus_db_queries::db::datastore::region_snapshot_replacement; use nexus_test_utils_macros::nexus_test; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -237,11 +238,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -298,14 +313,44 @@ mod test { let operating_saga_id = Uuid::new_v4(); + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_1 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_1.replacement_state = RegionSnapshotReplacementStepState::Complete; step_1.operating_saga_id = Some(operating_saga_id); let step_1_id = step_1.id; + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_2 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_2.replacement_state = RegionSnapshotReplacementStepState::Complete; step_2.operating_saga_id = Some(operating_saga_id); let step_2_id = step_2.id; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index f3b1b68198..c2895eef1f 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -154,6 +154,7 @@ mod test { use nexus_db_model::RegionSnapshotReplacement; use nexus_db_model::RegionSnapshotReplacementState; use nexus_test_utils_macros::nexus_test; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -198,11 +199,25 @@ mod test { let request_1_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -218,11 +233,25 @@ mod test { let request_2_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index f568535086..cb5a4f81a0 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -276,6 +276,7 @@ mod test { use nexus_test_utils_macros::nexus_test; use omicron_common::api::external; use omicron_uuid_kinds::GenericUuid; + use sled_agent_client::types::VolumeConstructionRequest; use std::collections::BTreeMap; use uuid::Uuid; @@ -317,11 +318,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -408,6 +423,22 @@ mod test { .await .unwrap(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .project_ensure_snapshot( &opctx, @@ -427,7 +458,7 @@ mod test { project_id, disk_id: Uuid::new_v4(), - volume_id: Uuid::new_v4(), + volume_id, destination_volume_id: Uuid::new_v4(), gen: Generation::new(), diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index 1a3f218e3f..7ab1167fea 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -655,11 +655,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -813,11 +827,27 @@ mod test { // Now, add some Complete records and make sure the garbage collection // saga is invoked. + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = @@ -831,11 +861,27 @@ mod test { assert!(matches!(result, InsertStepResult::Inserted { .. 
})); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = diff --git a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs index 9ebc6f0271..ebcc57e816 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs @@ -283,11 +283,27 @@ pub(crate) mod test { RegionSnapshotReplacementState::ReplacementDone; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( &opctx, request.clone(), - Uuid::new_v4(), + volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs index b83f917a70..15c6a39651 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs @@ -187,8 +187,24 @@ pub(crate) mod test { .await .unwrap(); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = - RegionSnapshotReplacementStep::new(Uuid::new_v4(), Uuid::new_v4()); + RegionSnapshotReplacementStep::new(Uuid::new_v4(), step_volume_id); request.replacement_state = RegionSnapshotReplacementStepState::Complete; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); From 79f8ad9f63c83f6f646496d78d908b88f6a3646e Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 19 Nov 2024 16:00:30 +0000 Subject: [PATCH 09/29] fix after merge --- .../src/db/datastore/region_snapshot_replacement.rs | 2 +- .../background/tasks/region_snapshot_replacement_finish.rs | 2 +- .../tasks/region_snapshot_replacement_garbage_collect.rs | 2 +- nexus/tests/integration_tests/crucible_replacements.rs | 5 ++++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 6d0b59b18a..1848d010d4 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -1273,8 +1273,8 @@ mod test { use crate::db::model::RegionReplacement; use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; - use sled_agent_client::types::VolumeConstructionRequest; use 
omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index e494cbe98c..61a84c579d 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -197,8 +197,8 @@ mod test { use nexus_db_model::RegionSnapshotReplacementStepState; use nexus_db_queries::db::datastore::region_snapshot_replacement; use nexus_test_utils_macros::nexus_test; - use sled_agent_client::types::VolumeConstructionRequest; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index eb6eb90337..57bbf3741c 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -154,8 +154,8 @@ mod test { use nexus_db_model::RegionSnapshotReplacement; use nexus_db_model::RegionSnapshotReplacementState; use nexus_test_utils_macros::nexus_test; - use sled_agent_client::types::VolumeConstructionRequest; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 9acbc94bb7..3e64a8e58d 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -818,7 +818,10 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume( .unwrap(); assert_eq!(region_snapshot_replacements.len(), 1); - assert_eq!(region_snapshot_replacements[0].old_dataset_id, dataset.id()); + assert_eq!( + region_snapshot_replacements[0].old_dataset_id, + dataset.id().into() + ); assert_eq!(region_snapshot_replacements[0].old_region_id, region.id()); assert_eq!( region_snapshot_replacements[0].old_snapshot_id, From 8b0a6779eff9265051c56d09df072869149d2814 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 22 Nov 2024 16:39:51 +0000 Subject: [PATCH 10/29] account for relocking a volume --- .../src/db/datastore/volume_repair.rs | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 6271303c20..45c54e8887 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -27,6 +27,24 @@ impl DataStore { volume_id: Uuid, repair_id: Uuid, ) -> Result<(), diesel::result::Error> { + use db::schema::volume_repair::dsl; + + // If a lock that matches the arguments exists already, return Ok + // + // Note: if rerunning this function (for example if a saga node was + // rerun), the volume could have existed when this lock was inserted the + // first time, but have been deleted now. 
+ let maybe_lock = dsl::volume_repair + .filter(dsl::repair_id.eq(repair_id)) + .filter(dsl::volume_id.eq(volume_id)) + .first_async::(conn) + .await + .optional()?; + + if maybe_lock.is_some() { + return Ok(()); + } + // Do not allow a volume repair record to be created if the volume does // not exist, or was hard-deleted! let maybe_volume = Self::volume_get_impl(conn, volume_id).await?; @@ -37,7 +55,8 @@ impl DataStore { )))); } - use db::schema::volume_repair::dsl; + // Do not check for soft-deletion here: We may want to request locks for + // soft-deleted volumes. match diesel::insert_into(dsl::volume_repair) .values(VolumeRepair { volume_id, repair_id }) @@ -197,4 +216,34 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn volume_lock_relock_allowed() { + let logctx = dev::test_setup_log("volume_lock_relock_allowed"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + + db.terminate().await; + logctx.cleanup_successful(); + } } From b2a740f6ec039c823e217db6c6d86eec0b85bd2c Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 22 Nov 2024 16:40:24 +0000 Subject: [PATCH 11/29] add comment about validating volume exists --- .../src/db/datastore/region_snapshot_replacement.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 1848d010d4..a0f03f001b 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -98,7 +98,10 @@ impl DataStore { // constructed, and therefore won't ever have an associated // Upstairs that receives a volume replacement request. // However it's being done in an attempt to be overly - // cautious. + // cautious, and it validates that the volume exist: + // otherwise it would be possible to create a region + // snapshot replacement request for a volume that didn't + // exist! 
Self::volume_repair_insert_in_txn( &conn, err, volume_id, request.id, From 18efdd3aa57e443dd6ea275fc6f40564f1554b0b Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 22 Nov 2024 16:40:38 +0000 Subject: [PATCH 12/29] disambiguate between soft and hard delete --- nexus/db-queries/src/db/datastore/volume.rs | 15 +++++++++------ nexus/src/app/sagas/region_replacement_start.rs | 3 ++- .../sagas/region_snapshot_replacement_start.rs | 3 ++- .../app/sagas/region_snapshot_replacement_step.rs | 6 ++++-- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index e7fde81ae2..505533da50 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -2692,8 +2692,11 @@ pub enum VolumeReplaceResult { // this call performed the replacement Done, - // the "existing" volume was deleted - ExistingVolumeDeleted, + // the "existing" volume was soft deleted + ExistingVolumeSoftDeleted, + + // the "existing" volume was hard deleted + ExistingVolumeHardDeleted, } impl DataStore { @@ -2818,14 +2821,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = @@ -3072,14 +3075,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index d02917dd5a..aa9e83c037 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -566,7 +566,8 @@ async fn srrs_replace_region_in_volume( Ok(volume_replace_region_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Unwind the saga here to clean up the resources allocated during // this saga. The associated background task will transition this // request's state to Completed. 
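For readers following the saga changes in this patch: the widened VolumeReplaceResult is what lets each saga tell a live volume apart from one that was soft- or hard-deleted, and the step saga below uses that distinction to decide whether an Upstairs notification is still needed. A minimal, self-contained sketch of that branching follows; the enum here only mirrors the variants this patch adds in nexus/db-queries, and notification_required is a hypothetical helper written for illustration, not code from the repository.

    // Sketch only: mirrors the variants this patch gives VolumeReplaceResult
    // so the soft- vs hard-delete branching can be shown in isolation.
    #[derive(Debug)]
    enum VolumeReplaceResult {
        AlreadyHappened,
        Done,
        ExistingVolumeSoftDeleted,
        ExistingVolumeHardDeleted,
    }

    // Hypothetical helper: should the saga still notify an Upstairs after a
    // replacement attempt? This follows the same rule the step saga applies:
    // a deleted volume (soft or hard) has no Upstairs to talk to, but the
    // saga still proceeds so the request can reach Complete and be cleaned up.
    fn notification_required(result: &VolumeReplaceResult) -> bool {
        match result {
            // Replacement landed in a live volume: a running Upstairs may
            // need to be told about the new target.
            VolumeReplaceResult::Done
            | VolumeReplaceResult::AlreadyHappened => true,

            // The existing volume is gone in either sense: skip the
            // notification step entirely.
            VolumeReplaceResult::ExistingVolumeSoftDeleted
            | VolumeReplaceResult::ExistingVolumeHardDeleted => false,
        }
    }

    fn main() {
        assert!(notification_required(&VolumeReplaceResult::Done));
        assert!(!notification_required(
            &VolumeReplaceResult::ExistingVolumeHardDeleted
        ));
    }

Because the result is also emitted as saga node data (see the earlier "emit VolumeReplaceResult as saga node data" commit), a later node can recover this value with a lookup on the node name and apply exactly this kind of branch, which is what rsrss_notify_upstairs does in the hunk below.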
diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index ffd4d398f4..5fe4e21fde 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -796,7 +796,8 @@ async fn rsrss_replace_snapshot_in_volume( Ok(volume_replace_snapshot_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // If the snapshot volume was deleted, we still want to proceed with // replacing the rest of the uses of the region snapshot. Note this // also covers the case where this saga node runs (performing the diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index 49fa23d953..947becb5a8 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -375,7 +375,8 @@ async fn rsrss_replace_snapshot_in_volume( // with the saga. } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Proceed with the saga but skip the notification step. } } @@ -432,7 +433,8 @@ async fn rsrss_notify_upstairs( .lookup::("volume_replace_snapshot_result")?; if matches!( volume_replace_snapshot_result, - VolumeReplaceResult::ExistingVolumeDeleted + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted ) { return Ok(()); } From 640e6787993f4ec7f8c5e624af108cd7f7b6f174 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 22 Nov 2024 20:03:48 +0000 Subject: [PATCH 13/29] cover the case that the region snapshot finish saga kicks off in the test --- nexus/tests/integration_tests/crucible_replacements.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 3e64a8e58d..839b815d91 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -942,6 +942,10 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume( } else if state == RegionReplacementState::Driving { // The saga is still running Err(CondCheckError::<()>::NotYet) + } else if state == RegionReplacementState::Completing { + // The saga transitioned the request ok, and it's now being + // finished by the region replacement finish saga + Ok(()) } else { // Any other state is not expected panic!("unexpected state {state:?}!"); From 1a60b15f1075bd5021b3cde0ae23a78a80dec822 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Fri, 22 Nov 2024 21:43:31 +0000 Subject: [PATCH 14/29] wait for state to transition to complete --- .../crucible_replacements.rs | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 839b815d91..ac23be9de2 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -988,18 +988,37 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume( // Ensure the region snapshot replacement request went to Complete - let region_snapshot_replacement = datastore - .get_region_snapshot_replacement_request_by_id( - 
&opctx, - region_snapshot_replacements[0].id, - ) - .await - .unwrap(); + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let request_id = region_snapshot_replacements[0].id; - assert_eq!( - region_snapshot_replacement.replacement_state, - RegionSnapshotReplacementState::Complete, - ); + async move { + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = region_snapshot_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + // Any other state is not expected + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); // Delete the snapshot From 9fc46ab18f494981da2378fb99a7f5850c78380e Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 25 Nov 2024 16:28:20 +0000 Subject: [PATCH 15/29] just an optimization --- nexus/src/app/background/tasks/region_replacement.rs | 4 +++- .../app/background/tasks/region_snapshot_replacement_step.rs | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index 3b854a618b..e20a79b7a7 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -173,7 +173,9 @@ impl BackgroundTask for RegionReplacementDetector { // If the replacement request is in the `requested` state and // the request's volume was soft-deleted or hard-deleted, avoid // sending the start request and instead transition the request - // to completed + // to completed. Note the saga will do the right thing if the + // volume is deleted, but this avoids the overhead of starting + // it. let volume_deleted = match self .datastore diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index 221dee7d4b..7967db44e0 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -410,7 +410,9 @@ impl RegionSnapshotReplacementFindAffected { let request_step_id = request.id; // Check if the volume was deleted _after_ the replacement step was - // created. + // created. Avoid launching the region snapshot replacement step + // saga if it was deleted: the saga will do the right thing if it is + // deleted, but this avoids the overhead of starting it. 
let volume_deleted = match self.datastore.volume_deleted(request.volume_id).await { From 2cd7881268c57d99b26c4c5bec7669a37e4d8c96 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 25 Nov 2024 16:54:18 +0000 Subject: [PATCH 16/29] add missing unwind edge --- nexus/db-model/src/region_snapshot_replacement.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nexus/db-model/src/region_snapshot_replacement.rs b/nexus/db-model/src/region_snapshot_replacement.rs index cd1d5c3f10..28627d8379 100644 --- a/nexus/db-model/src/region_snapshot_replacement.rs +++ b/nexus/db-model/src/region_snapshot_replacement.rs @@ -81,12 +81,12 @@ impl std::str::FromStr for RegionSnapshotReplacementState { /// | | /// v --- /// --- -/// Running | -/// | -/// | | -/// v | -/// | responsibility of region snapshot -/// Completing | replacement finish saga +/// Running <-- | +/// | | +/// | | | +/// v | | +/// | | responsibility of region snapshot +/// Completing -- | replacement finish saga /// | /// | | /// v | From a78cabf45137705fa8645a7799ed4afcf310d7cd Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 25 Nov 2024 17:10:36 +0000 Subject: [PATCH 17/29] remove likely --- nexus/src/app/sagas/region_snapshot_replacement_step.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index 947becb5a8..a236fcf62c 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -424,10 +424,9 @@ async fn rsrss_notify_upstairs( let params = sagactx.saga_params::()?; let log = sagactx.user_data().log(); - // If the associated volume was deleted, then skip this notification step. - // It's likely there is no Upstairs to talk to, but continue with the saga - // to transition the step request to Complete, and then perform the - // associated clean up. + // If the associated volume was deleted, then skip this notification step as + // there is no Upstairs to talk to. Continue with the saga to transition the + // step request to Complete, and then perform the associated clean up. let volume_replace_snapshot_result = sagactx .lookup::("volume_replace_snapshot_result")?; From c869cad330e7d10bb7a12e3e6bc097ac6522e013 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 27 Nov 2024 19:25:46 +0000 Subject: [PATCH 18/29] handle if region snapshot is hard deleted while replacement request is in state Requested --- .../datastore/region_snapshot_replacement.rs | 77 +++++++++++++++++++ .../region_snapshot_replacement_start.rs | 60 +++++++++++++++ .../crucible_replacements.rs | 27 +++++++ 3 files changed, 164 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index a0f03f001b..a81299d5d0 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -746,6 +746,83 @@ impl DataStore { }) } + /// Transition a RegionSnapshotReplacement record from Requested to Complete + /// - this is required when the region snapshot is hard-deleted, which means + /// that all volume references are gone and no replacement is required. Also + /// removes the `volume_repair` record that is taking a "lock" on the + /// Volume. 
+ pub async fn set_region_snapshot_replacement_complete_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + ) -> Result<(), Error> { + type TxnError = TransactionError; + + self.pool_connection_authorized(opctx) + .await? + .transaction_async(|conn| async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement::dsl; + + let result = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Requested), + ) + .filter(dsl::operating_saga_id.is_null()) + .filter(dsl::new_region_volume_id.is_null()) + .set((dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete),)) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementState::Complete + { + Ok(()) + } else { + Err(TxnError::CustomError(Error::conflict( + format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ), + ))) + } + } + } + }) + .await + .map_err(|e| match e { + TxnError::CustomError(error) => error, + + TxnError::Database(error) => { + public_error_from_diesel(error, ErrorHandler::Server) + } + }) + } + pub async fn create_region_snapshot_replacement_step( &self, opctx: &OpContext, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 36cf03c2f4..4c33343041 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -186,6 +186,66 @@ impl RegionSnapshotReplacementDetector { for request in requests { let request_id = request.id; + // If the region snapshot is gone, then there are no more references + // in any volume, and the whole region snapshot replacement can be + // fast-tracked to Complete. 
+ + let maybe_region_snapshot = match self + .datastore + .region_snapshot_get( + request.old_dataset_id.into(), + request.old_region_id, + request.old_snapshot_id, + ) + .await + { + Ok(maybe_region_snapshot) => maybe_region_snapshot, + + Err(e) => { + let s = format!("query for region snapshot failed: {e}"); + + error!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.errors.push(s); + return; + } + }; + + if maybe_region_snapshot.is_none() { + match self + .datastore + .set_region_snapshot_replacement_complete_from_requested( + &opctx, request.id, + ) + .await + { + Ok(()) => { + let s = format!( + "region snapshot replacement {request_id} \ + completed ok" + ); + info!(&log, "{s}"); + status.start_invoked_ok.push(s); + } + + Err(e) => { + let s = format!( + "query to set region snapshot request state \ + to complete failed: {e}" + ); + + error!(&log, "{s}"; "request.id" => %request_id); + status.errors.push(s); + continue; + } + } + } + let result = self .send_start_request( authn::saga::Serialized::for_opctx(opctx), diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index ac23be9de2..e58217ce70 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -1563,6 +1563,33 @@ async fn test_delete_volume_region_snapshot_replacement_state_requested_2( test_harness.assert_no_crucible_resources_leaked().await; } +/// Assert that a region snapshot replacement request in state "Requested" can +/// have everything be deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_3( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + /// Assert that a region snapshot replacement request in state "ReplacementDone" /// can have its snapshot deleted, and the request will still transition to /// Complete From 7b93e2e95642a12828a34185cd676e9b92c93c03 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 9 Dec 2024 22:40:31 +0000 Subject: [PATCH 19/29] fmt --- .../datastore/region_snapshot_replacement.rs | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 7df18623b8..fcb0405dde 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -97,45 +97,45 @@ impl DataStore { self.transaction_retry_wrapper( "insert_region_snapshot_replacement_request_with_volume_id", - ) - .transaction(&conn, |conn| { - let request = request.clone(); - let err = err.clone(); - async move { - use 
db::schema::region_snapshot_replacement::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; + ) + .transaction(&conn, |conn| { + let request = request.clone(); + let err = err.clone(); + async move { + use db::schema::region_snapshot_replacement::dsl; + use db::schema::volume_repair::dsl as volume_repair_dsl; - // An associated volume repair record isn't _strictly_ - // needed: snapshot volumes should never be directly - // constructed, and therefore won't ever have an associated - // Upstairs that receives a volume replacement request. - // However it's being done in an attempt to be overly - // cautious, and it validates that the volume exist: - // otherwise it would be possible to create a region - // snapshot replacement request for a volume that didn't - // exist! - - Self::volume_repair_insert_in_txn( - &conn, err, volume_id, request.id, - ) - .await?; + // An associated volume repair record isn't _strictly_ + // needed: snapshot volumes should never be directly + // constructed, and therefore won't ever have an associated + // Upstairs that receives a volume replacement request. + // However it's being done in an attempt to be overly + // cautious, and it validates that the volume exist: + // otherwise it would be possible to create a region + // snapshot replacement request for a volume that didn't + // exist! - diesel::insert_into(dsl::region_snapshot_replacement) - .values(request) - .execute_async(&conn) - .await?; + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, request.id, + ) + .await?; - Ok(()) - } - }) - .await - .map_err(|e| { - if let Some(err) = err.take() { - err - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + diesel::insert_into(dsl::region_snapshot_replacement) + .values(request) + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_request_by_id( From cb1c2fe4464c0e6cce191dc1c791a050d5a2047a Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Mon, 9 Dec 2024 23:04:51 +0000 Subject: [PATCH 20/29] fix compile time errors from merge --- .../src/db/datastore/region_replacement.rs | 1 + .../datastore/region_snapshot_replacement.rs | 84 +++++++++---------- .../crucible_replacements.rs | 2 +- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index ae1692704d..b0b47d6756 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -58,6 +58,7 @@ impl DataStore { self.transaction_retry_wrapper("insert_region_replacement_request") .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_replacement::dsl; diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index fcb0405dde..d4c36eca3f 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -103,7 +103,6 @@ impl DataStore { let err = err.clone(); async move { use db::schema::region_snapshot_replacement::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; // An associated volume repair record isn't _strictly_ // needed: snapshot volumes 
should never be directly
@@ -689,11 +688,15 @@ impl DataStore {
         region_snapshot_replacement_id: Uuid,
         operating_saga_id: Uuid,
     ) -> Result<(), Error> {
-        type TxnError = TransactionError<Error>;
+        let err = OptionalError::new();
+        let conn = self.pool_connection_authorized(opctx).await?;
 
-        self.pool_connection_authorized(opctx)
-            .await?
-            .transaction_async(|conn| async move {
+        self.transaction_retry_wrapper(
+            "set_region_snapshot_replacement_complete",
+        )
+        .transaction(&conn, |conn| {
+            let err = err.clone();
+            async move {
                 use db::schema::volume_repair::dsl as volume_repair_dsl;
 
                 diesel::delete(
@@ -735,27 +738,23 @@ impl DataStore {
                         {
                             Ok(())
                         } else {
-                            Err(TxnError::CustomError(Error::conflict(
-                                format!(
+                            Err(err.bail(Error::conflict(format!(
                                 "region snapshot replacement {} set to {:?} \
-                                (operating saga id {:?})",
+                                 (operating saga id {:?})",
                                 region_snapshot_replacement_id,
                                 record.replacement_state,
                                 record.operating_saga_id,
-                                ),
-                            )))
+                            ))))
                         }
                     }
                 }
-            })
-            .await
-            .map_err(|e| match e {
-                TxnError::CustomError(error) => error,
-
-                TxnError::Database(error) => {
-                    public_error_from_diesel(error, ErrorHandler::Server)
-                }
-            })
+            }
+        })
+        .await
+        .map_err(|e| match err.take() {
+            Some(error) => error,
+            None => public_error_from_diesel(e, ErrorHandler::Server),
+        })
     }
 
     /// Transition a RegionSnapshotReplacement record from Requested to Complete
@@ -797,8 +796,8 @@ impl DataStore {
                     )
                     .filter(dsl::operating_saga_id.is_null())
                    .filter(dsl::new_region_volume_id.is_null())
-                    .set((dsl::replacement_state
-                        .eq(RegionSnapshotReplacementState::Complete)))
+                    .set(dsl::replacement_state
+                        .eq(RegionSnapshotReplacementState::Complete))
                    .check_if_exists::<RegionSnapshotReplacement>(
                        region_snapshot_replacement_id,
                    )
@@ -1292,11 +1291,16 @@ impl DataStore {
         opctx: &OpContext,
         region_snapshot_replacement_step: RegionSnapshotReplacementStep,
     ) -> Result<(), Error> {
-        type TxnError = TransactionError<Error>;
+        let conn = self.pool_connection_authorized(opctx).await?;
+        let err = OptionalError::new();
 
-        self.pool_connection_authorized(opctx)
-            .await?
- .transaction_async(|conn| async move { + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + + async move { use db::schema::volume_repair::dsl as volume_repair_dsl; diesel::delete( @@ -1339,27 +1343,23 @@ impl DataStore { { Ok(()) } else { - Err(TxnError::CustomError(Error::conflict( - format!( - "region snapshot replacement step {} set \ + Err(err.bail(Error::conflict(format!( + "region snapshot replacement step {} set \ to {:?} (operating saga id {:?})", - region_snapshot_replacement_step.id, - record.replacement_state, - record.operating_saga_id, - ), - ))) + region_snapshot_replacement_step.id, + record.replacement_state, + record.operating_saga_id, + )))) } } } - }) - .await - .map_err(|e| match e { - TxnError::CustomError(error) => error, - - TxnError::Database(error) => { - public_error_from_diesel(error, ErrorHandler::Server) - } - }) + } + }) + .await + .map_err(|e| match err.take() { + Some(error) => error, + None => public_error_from_diesel(e, ErrorHandler::Server), + }) } } diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 11f2ecfa26..edf2062e2f 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -652,7 +652,7 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume( datastore .physical_disk_update_policy( &opctx, - db_zpool.physical_disk_id, + db_zpool.physical_disk_id.into(), PhysicalDiskPolicy::Expunged, ) .await From fdf800a747725bcc71cf672430550487ece4a154 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 10 Dec 2024 16:00:52 +0000 Subject: [PATCH 21/29] conflicts are not errors! --- .../background/tasks/region_replacement.rs | 35 ++++++++++++----- .../region_snapshot_replacement_start.rs | 38 +++++++++++++------ .../tasks/region_snapshot_replacement_step.rs | 28 ++++++++++---- 3 files changed, 74 insertions(+), 27 deletions(-) diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index e20a79b7a7..eb01e96f6a 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -23,6 +23,7 @@ use nexus_db_model::RegionReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionReplacementStatus; +use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use serde_json::json; @@ -42,7 +43,7 @@ impl RegionReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_replacement_start::Params { serialized_authn, request, @@ -134,15 +135,31 @@ impl BackgroundTask for RegionReplacementDetector { } Err(e) => { - let s = format!( - "error adding region replacement request for \ - region {} volume id {}: {e}", - region.id(), - region.volume_id(), - ); - error!(&log, "{s}"); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. 
+ } + + _ => { + let s = format!( + "error adding region replacement \ + request for region {} volume id {}: \ + {e}", + region.id(), + region.volume_id(), + ); + error!(&log, "{s}"); + + status.errors.push(s); + } + } - status.errors.push(s); continue; } } diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index be9b115785..4fe64d7662 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -139,17 +139,33 @@ impl RegionSnapshotReplacementDetector { } Err(e) => { - let s = - format!("error creating replacement request: {e}"); - - error!( - &log, - "{s}"; - "snapshot_id" => %region_snapshot.snapshot_id, - "region_id" => %region_snapshot.region_id, - "dataset_id" => %region_snapshot.dataset_id, - ); - status.errors.push(s); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + let s = format!( + "error creating replacement request: {e}" + ); + + error!( + &log, + "{s}"; + "snapshot_id" => %region_snapshot.snapshot_id, + "region_id" => %region_snapshot.region_id, + "dataset_id" => %region_snapshot.dataset_id, + ); + + status.errors.push(s); + } + } } } } diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index 7967db44e0..f481126312 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -36,6 +36,7 @@ use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -53,7 +54,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_step::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), request, @@ -70,7 +71,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let Some(old_snapshot_volume_id) = request.old_snapshot_volume_id else { // This state is illegal! @@ -79,9 +80,7 @@ impl RegionSnapshotReplacementFindAffected { request.id, ); - return Err(omicron_common::api::external::Error::internal_error( - &s, - )); + return Err(Error::internal_error(&s)); }; let params = @@ -364,13 +363,28 @@ impl RegionSnapshotReplacementFindAffected { Err(e) => { let s = format!("error creating step request: {e}"); - error!( + warn!( log, "{s}"; "request id" => ?request.id, "volume id" => ?volume.id(), ); - status.errors.push(s); + + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! 
If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + status.errors.push(s); + } + } } } } From abf522e2da873d58fad874291a5b73b5b04d6e4f Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 10 Dec 2024 16:01:02 +0000 Subject: [PATCH 22/29] add dummy region snapshot --- .../tasks/region_snapshot_replacement_start.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 4fe64d7662..df10b5da92 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -387,10 +387,21 @@ mod test { // Add a region snapshot replacement request for a fake region snapshot + let dataset_id = DatasetUuid::new_v4(); + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::]:12345".to_string(), + ); + + datastore.region_snapshot_create(region_snapshot).await.unwrap(); + let request = RegionSnapshotReplacement::new( - DatasetUuid::new_v4(), // dataset id - Uuid::new_v4(), // region id - Uuid::new_v4(), // snapshot id + dataset_id, region_id, snapshot_id ); let request_id = request.id; From 715c9f7bd9efb3b02a8e99826d69609e3c24c121 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 10 Dec 2024 16:08:02 +0000 Subject: [PATCH 23/29] fmt --- .../tasks/region_snapshot_replacement_start.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index df10b5da92..fbf5488fe8 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -391,19 +391,18 @@ mod test { let region_id = Uuid::new_v4(); let snapshot_id = Uuid::new_v4(); - let region_snapshot = RegionSnapshot::new( + let region_snapshot = RegionSnapshot::new( dataset_id, region_id, snapshot_id, "[::]:12345".to_string(), - ); - - datastore.region_snapshot_create(region_snapshot).await.unwrap(); - - let request = RegionSnapshotReplacement::new( - dataset_id, region_id, snapshot_id ); + datastore.region_snapshot_create(region_snapshot).await.unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + let request_id = request.id; let volume_id = Uuid::new_v4(); From e4d0844a1e36d4bc9ae7875bf90057e222fc8f5d Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 17 Dec 2024 19:36:34 +0000 Subject: [PATCH 24/29] comment for volume_repair_insert_in_txn --- nexus/db-queries/src/db/datastore/volume_repair.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 45c54e8887..598d9d77a2 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -21,6 +21,18 @@ use omicron_common::api::external::Error; use uuid::Uuid; impl DataStore { + /// Insert a volume repair record, taking a "lock" on the volume pointed to + /// by volume id with some repair id. 
+    ///
+    /// If there exists a record that has a matching volume id and repair id,
+    /// return Ok(()).
+    ///
+    /// If there is no volume that matches the given volume id, return an error:
+    /// it should not be possible to lock a volume that does not exist! Note
+    /// that it is possible to lock a soft-deleted volume.
+    ///
+    /// If there is already an existing record that has a matching volume id but
+    /// a different repair id, then this function returns an Error::conflict.
     pub(super) async fn volume_repair_insert_in_txn(
         conn: &async_bb8_diesel::Connection<DbConnection>,
         err: OptionalError<Error>,

From 518a86c455af38d0ffc0fe47c09e8261f0627048 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Tue, 17 Dec 2024 20:39:56 +0000
Subject: [PATCH 25/29] cargo fmt missed this one!

---
 .../db-queries/src/db/datastore/region_snapshot_replacement.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs
index d4c36eca3f..90b014c582 100644
--- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs
+++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs
@@ -802,7 +802,7 @@ impl DataStore {
                         region_snapshot_replacement_id,
                     )
                     .execute_and_check(&conn)
-                .await?;
+                    .await?;
 
                 match result.status {
                     UpdateStatus::Updated => Ok(()),

From ec9bb9aa5cd6bd27eb37b4c24652c8e6c551b647 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Wed, 18 Dec 2024 17:37:59 +0000
Subject: [PATCH 26/29] when completing a region snapshot replacement from the
 start background task, insert into the `requests_completed_ok` vec!

also, add a test for this, which exposed another bug: the continue that
would have skipped starting the start saga was in the wrong place.
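
to make the misplaced-continue fix concrete, here is a minimal,
self-contained sketch (the function and names are illustrative stand-ins,
not the Nexus task itself): a fast-tracked request has to skip invoking the
start saga whether or not recording its completion succeeded, so the
`continue` belongs after the match on the recording result, not inside only
one of its arms.

    fn sagas_to_start(requests: &[(bool, Result<(), String>)]) -> usize {
        let mut count = 0;

        for (snapshot_gone, record_result) in requests {
            if *snapshot_gone {
                // Record the fast-tracked completion, success or failure.
                match record_result {
                    Ok(()) => println!("request completed ok"),
                    Err(e) => eprintln!("failed to record completion: {e}"),
                }

                // The skip applies to both arms above; placing the
                // `continue` inside only the Err arm is the bug being fixed.
                continue;
            }

            // Stands in for "invoke the replacement start saga".
            count += 1;
        }

        count
    }

    fn main() {
        let requests = vec![
            (true, Ok(())),                      // snapshot gone, recorded ok
            (true, Err("db error".to_string())), // snapshot gone, record failed
            (false, Ok(())),                     // snapshot still referenced
        ];

        // Only the last request should reach the start saga.
        assert_eq!(sagas_to_start(&requests), 1);
    }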
--- .../region_snapshot_replacement_start.rs | 197 +++++++++++++++++- 1 file changed, 195 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index fbf5488fe8..f2b82a3943 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -246,7 +246,7 @@ impl RegionSnapshotReplacementDetector { completed ok" ); info!(&log, "{s}"); - status.start_invoked_ok.push(s); + status.requests_completed_ok.push(s); } Err(e) => { @@ -257,9 +257,10 @@ impl RegionSnapshotReplacementDetector { error!(&log, "{s}"; "request.id" => %request_id); status.errors.push(s); - continue; } } + + continue; } let result = self @@ -346,6 +347,7 @@ mod test { use nexus_db_model::Snapshot; use nexus_db_model::SnapshotIdentity; use nexus_db_model::SnapshotState; + use nexus_db_model::VolumeResourceUsage; use nexus_db_queries::authz; use nexus_db_queries::db::lookup::LookupPath; use nexus_test_utils::resource_helpers::create_project; @@ -353,6 +355,7 @@ mod test { use omicron_common::api::external; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; + use sled_agent_client::types::CrucibleOpts; use sled_agent_client::types::VolumeConstructionRequest; use std::collections::BTreeMap; use uuid::Uuid; @@ -627,4 +630,194 @@ mod test { dataset_to_zpool.get(&first_zpool.id.to_string()).unwrap(); assert_eq!(&request.old_dataset_id.to_string(), dataset_id); } + + #[nexus_test(server = crate::Server)] + async fn test_delete_region_snapshot_replacement_volume_causes_complete( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementDetector::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStartStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // The volume reference counting machinery needs a fake dataset to exist + // (region snapshots are joined with the dataset table when creating the + // CrucibleResources object) + + let disk_test = DiskTest::new(cptestctx).await; + + let dataset_id = disk_test.zpools().next().unwrap().datasets[0].id; + + // Add a region snapshot replacement request for a fake region snapshot + + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::1]:12345".to_string(), + ); + + datastore + .region_snapshot_create(region_snapshot.clone()) + .await + .unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + + let request_id = request.id; + + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + // the region snapshot + String::from("[::1]:12345"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(!records.is_empty()); + assert_eq!(records[0].volume_id, volume_id); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request, volume_id, + ) + .await + .unwrap(); + + // Before the task starts, soft-delete the volume, and delete the + // region snapshot (like the volume delete saga would do). + + let crucible_resources = + datastore.soft_delete_volume(volume_id).await.unwrap(); + + // Assert no more usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(records.is_empty()); + + // The region snapshot should have been returned for deletion + + let datasets_and_snapshots = + datastore.snapshots_to_delete(&crucible_resources).await.unwrap(); + + assert!(!datasets_and_snapshots.is_empty()); + + let region_snapshot_to_delete = &datasets_and_snapshots[0].1; + + assert_eq!( + region_snapshot_to_delete.dataset_id, + region_snapshot.dataset_id, + ); + assert_eq!( + region_snapshot_to_delete.region_id, + region_snapshot.region_id, + ); + assert_eq!( + region_snapshot_to_delete.snapshot_id, + region_snapshot.snapshot_id, + ); + + // So delete it! 
+ + datastore + .region_snapshot_remove( + region_snapshot_to_delete.dataset_id.into(), + region_snapshot_to_delete.region_id, + region_snapshot_to_delete.snapshot_id, + ) + .await + .unwrap(); + + // Activate the task - it should pick the request up but not attempt to + // run the start saga + + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + assert_eq!( + result, + RegionSnapshotReplacementStartStatus { + requests_created_ok: vec![], + start_invoked_ok: vec![], + requests_completed_ok: vec![format!( + "region snapshot replacement {request_id} completed ok" + )], + errors: vec![], + }, + ); + + // Assert start saga not invoked + assert_eq!(starter.count_reset(), 0); + } } From 34db989954e99b1541ef5e1a5c18050fe68c0e8d Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 18 Dec 2024 18:58:30 +0000 Subject: [PATCH 27/29] address the unfinished comment --- nexus/tests/integration_tests/crucible_replacements.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index edf2062e2f..4b6aae195b 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -403,7 +403,7 @@ mod region_replacement { // the attachment request to the simulated pantry. // // If `wait_for_request_state` has the same expected start and end - // state (as it does above), it's possible to exit the function + // state (as it does above), it's possible to exit that function // having not yet started the saga yet, and this requires an // additional `wait_for_condition` to wait for the expected recorded // step. @@ -426,8 +426,9 @@ mod region_replacement { Some(step) => Ok(step), None => { - // The saga is still running - this can - // happen when + // The saga either has not started yet or is + // still running - see the comment before this + // check for mroe info. Err(CondCheckError::<()>::NotYet) } } From 4931f58d071f96361119e08a05591ad6304f28e2 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 18 Dec 2024 20:11:46 +0000 Subject: [PATCH 28/29] rework wait_for_request_state to be clearer --- .../crucible_replacements.rs | 48 ++++++++++++------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 4b6aae195b..067504d71a 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -320,23 +320,39 @@ mod region_replacement { let state = region_replacement.replacement_state; - if state == expected_end_state.0 { - // The saga transitioned the request ok - Ok(()) - } else if state == expected_intermediate_state.0 { - // The saga is still running - Err(CondCheckError::<()>::NotYet) - // If the expected start and end state are the same, - // then it's impossible to determine when the saga - // starts and stops based on the state. - } else if expected_end_state.0 != expected_start_state.0 - && state == expected_start_state.0 - { - // The saga hasn't started yet - Err(CondCheckError::<()>::NotYet) + // If the expected start and end state are the same + // (i.e. there's a back edge in the associated request's + // state machine), then it's impossible to determine + // when the saga starts and stops based on the state. 
+ if expected_end_state.0 == expected_start_state.0 { + if state == expected_end_state.0 { + // The saga transitioned the request ok, or + // hasn't started yet. Either way we have to + // return here, and the call site should perform + // an additional check for some associated + // expected result. + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } } else { - // Any other state is not expected - panic!("unexpected state {state:?}!"); + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 + || state == expected_start_state.0 + { + // The saga is still running, or hasn't started + // yet. + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } } } }, From 2a37e9cc4984ffa5f5494a806df8431bf4e8580e Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 19 Dec 2024 03:57:17 +0000 Subject: [PATCH 29/29] mroe -> more! --- nexus/tests/integration_tests/crucible_replacements.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 067504d71a..c301372d5f 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -444,7 +444,7 @@ mod region_replacement { None => { // The saga either has not started yet or is // still running - see the comment before this - // check for mroe info. + // check for more info. Err(CondCheckError::<()>::NotYet) } }
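
The reworked helper boils down to a small decision table. Below is a
standalone sketch of that logic (the enum variants and the helper function
are illustrative stand-ins, not the actual test harness types): when the
expected start and end states are the same -- the back edge case -- observing
that state only tells the poller the saga is not currently running, so the
call site has to perform an additional check of its own; when they differ,
all three expected states are distinguishable and the start state simply
means the saga has not begun yet.

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum State {
        Requested,
        Running,
        ReplacementDone,
    }

    #[derive(Debug, PartialEq, Eq)]
    enum Poll {
        Done,
        NotYet,
    }

    fn check(observed: State, start: State, intermediate: State, end: State) -> Poll {
        if start == end {
            // Back edge: seeing `end` only means "not currently mid-saga", so
            // the call site must follow up with an additional check.
            if observed == end {
                Poll::Done
            } else if observed == intermediate {
                Poll::NotYet
            } else {
                panic!("unexpected state {observed:?}!");
            }
        } else if observed == end {
            Poll::Done
        } else if observed == intermediate || observed == start {
            // Still running, or the saga has not started yet.
            Poll::NotYet
        } else {
            panic!("unexpected state {observed:?}!");
        }
    }

    fn main() {
        use State::*;

        // Distinct start and end states: the start state just means "not yet".
        assert_eq!(check(Requested, Requested, Running, ReplacementDone), Poll::NotYet);
        assert_eq!(check(ReplacementDone, Requested, Running, ReplacementDone), Poll::Done);

        // Back edge (start == end): the same observation now reads as Done,
        // which is why an extra check is needed afterwards.
        assert_eq!(check(Requested, Requested, Running, Requested), Poll::Done);
    }

Run as a plain binary, the three assertions mirror the two branches that the
rework separates.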