From 4a4d6ca07d70532e16d5898c8060c01c255f5661 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 17 Dec 2024 15:49:20 -0500 Subject: [PATCH 1/9] Delete failed regions when the saga unwinds (#7245) One of the common sharp edges of sagas is that the compensating action of a node does _not_ run if the forward action fails. Said another way, for this node: EXAMPLE -> "output" { + forward_action - forward_action_undo } If `forward_action` fails, `forward_action_undo` is never executed. Forward actions are therefore required to be atomic, in that they either fully apply or don't apply at all. Sagas with nodes that ensure multiple regions exist cannot be atomic because they can partially fail (for example: what if only 2 out of 3 ensures succeed?). In order for the compensating action to be run, it must exist as a separate node that has a no-op forward action: EXAMPLE_UNDO -> "not_used" { + noop - forward_action_undo } EXAMPLE -> "output" { + forward_action } The region snapshot replacement start saga will only ever ensure that a single region exists, so one might think they could get away with a single node that combines the forward and compensating action - you'd be mistaken! The Crucible agent's region ensure is not atomic in all cases: if the region fails to create, it enters the `failed` state, but is not deleted. Nexus must clean these up. Fixes an issue that Angela saw where failed regions were taking up disk space in rack2 (#7209). This commit also includes an omdb command for finding these orphaned regions and optionally cleaning them up. --- dev-tools/omdb/src/bin/omdb/db.rs | 349 +++++++++++++++++- .../region_snapshot_replacement_start.rs | 253 ++++++++++++- nexus/tests/integration_tests/disks.rs | 5 +- sled-agent/src/sim/storage.rs | 6 - 4 files changed, 583 insertions(+), 30 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 667a666375..5058739da0 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -797,12 +797,23 @@ enum ValidateCommands { /// Validate each `volume_references` column in the region snapshots table ValidateVolumeReferences, + /// Find either regions Nexus knows about that the corresponding Crucible + /// agent says were deleted, or regions that Nexus doesn't know about. + ValidateRegions(ValidateRegionsArgs), + /// Find either region snapshots Nexus knows about that the corresponding /// Crucible agent says were deleted, or region snapshots that Nexus doesn't /// know about. 
ValidateRegionSnapshots, } +#[derive(Debug, Args)] +struct ValidateRegionsArgs { + /// Delete Regions Nexus is unaware of + #[clap(long, default_value_t = false)] + clean_up_orphaned_regions: bool, +} + #[derive(Debug, Args)] struct VolumeArgs { #[command(subcommand)] @@ -1093,6 +1104,20 @@ impl DbArgs { DbCommands::Validate(ValidateArgs { command: ValidateCommands::ValidateVolumeReferences, }) => cmd_db_validate_volume_references(&datastore).await, + DbCommands::Validate(ValidateArgs { + command: ValidateCommands::ValidateRegions(args), + }) => { + let clean_up_orphaned_regions = + if args.clean_up_orphaned_regions { + let token = omdb.check_allow_destructive()?; + CleanUpOrphanedRegions::Yes { _token: token } + } else { + CleanUpOrphanedRegions::No + }; + + cmd_db_validate_regions(&datastore, clean_up_orphaned_regions) + .await + } DbCommands::Validate(ValidateArgs { command: ValidateCommands::ValidateRegionSnapshots, }) => cmd_db_validate_region_snapshots(&datastore).await, @@ -4526,6 +4551,283 @@ async fn cmd_db_validate_volume_references( Ok(()) } +enum CleanUpOrphanedRegions { + Yes { _token: DestructiveOperationToken }, + No, +} + +async fn cmd_db_validate_regions( + datastore: &DataStore, + clean_up_orphaned_regions: CleanUpOrphanedRegions, +) -> Result<(), anyhow::Error> { + // *Lifetime note*: + // + // The lifetime of the region record in cockroachdb is longer than the time + // the Crucible agent's region is in a non-destroyed state: Nexus will + // perform the query to allocate regions (inserting them into the database) + // before it ensures those regions are created (i.e. making the POST request + // to the appropriate Crucible agent to create them), and it will request + // that the regions be deleted (then wait for that region to transition to + // the destroyed state) before hard-deleting the records in the database. + + // First, get all region records (with their corresponding dataset) + let datasets_and_regions: Vec<(Dataset, Region)> = datastore + .pool_connection_for_tests() + .await? + .transaction_async(|conn| async move { + // Selecting all datasets and regions requires a full table scan + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl; + + dsl::region + .inner_join( + dataset_dsl::dataset + .on(dsl::dataset_id.eq(dataset_dsl::id)), + ) + .select((Dataset::as_select(), Region::as_select())) + .get_results_async(&conn) + .await + }) + .await?; + + #[derive(Tabled)] + struct Row { + dataset_id: DatasetUuid, + region_id: Uuid, + dataset_addr: std::net::SocketAddrV6, + error: String, + } + + let mut rows = Vec::new(); + + // Reconcile with the corresponding Crucible Agent: are they aware of each + // region in the database? + for (dataset, region) in &datasets_and_regions { + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! 
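+        // (An expunged disk's Crucible agent is likely unreachable, so a
+        // request here would only hang or return spurious errors.)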
+ let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + eprintln!( + "dataset {} {:?} is not in service, skipping", + dataset.id(), + dataset.address(), + ); + continue; + } + + use crucible_agent_client::types::RegionId; + use crucible_agent_client::types::State; + use crucible_agent_client::Client as CrucibleAgentClient; + + let Some(dataset_addr) = dataset.address() else { + eprintln!("Dataset {} missing an IP address", dataset.id()); + continue; + }; + + let url = format!("http://{}", dataset_addr); + let client = CrucibleAgentClient::new(&url); + + let actual_region = + match client.region_get(&RegionId(region.id().to_string())).await { + Ok(region) => region.into_inner(), + + Err(e) => { + // Either there was a communication error, or the agent is + // unaware of the Region (this would be a 404). + match e { + crucible_agent_client::Error::ErrorResponse(rv) + if rv.status() == http::StatusCode::NOT_FOUND => + { + rows.push(Row { + dataset_id: dataset.id(), + region_id: region.id(), + dataset_addr, + error: String::from( + "Agent does not know about this region!", + ), + }); + } + + _ => { + eprintln!( + "{} region_get {:?}: {e}", + dataset_addr, + region.id(), + ); + } + } + + continue; + } + }; + + // The Agent is aware of this region, but is it in the appropriate + // state? + + match actual_region.state { + State::Destroyed => { + // If it is destroyed, then this is invalid as the record should + // be hard-deleted as well (see the lifetime note above). Note + // that omdb could be racing a Nexus that is performing region + // deletion: if the region transitioned to Destroyed but Nexus + // is waiting to re-poll, it will not have hard-deleted the + // region record yet. + + rows.push(Row { + dataset_id: dataset.id(), + region_id: region.id(), + dataset_addr, + error: String::from( + "region may need to be manually hard-deleted", + ), + }); + } + + _ => { + // ok + } + } + } + + // Reconcile with the Crucible agents: are there regions that Nexus does not + // know about? Ask each Crucible agent for its list of regions, then check + // in the database: if that region is _not_ in the database, then either it + // was never created by Nexus, or it was hard-deleted by Nexus. Either way, + // omdb should (if the command line argument is supplied) request that the + // orphaned region be deleted. + // + // Note: This should not delete what is actually a valid region, see the + // lifetime note above. + + let mut orphaned_bytes: u64 = 0; + + let db_region_ids: BTreeSet = + datasets_and_regions.iter().map(|(_, r)| r.id()).collect(); + + // Find all the Crucible datasets + let datasets: Vec = datastore + .pool_connection_for_tests() + .await? + .transaction_async(|conn| async move { + // Selecting all datasets and regions requires a full table scan + conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await?; + + use db::schema::dataset::dsl; + + dsl::dataset + .filter(dsl::kind.eq(nexus_db_model::DatasetKind::Crucible)) + .select(Dataset::as_select()) + .get_results_async(&conn) + .await + }) + .await?; + + for dataset in &datasets { + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! 
+ let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + eprintln!( + "dataset {} {:?} is not in service, skipping", + dataset.id(), + dataset.address(), + ); + continue; + } + + use crucible_agent_client::types::State; + use crucible_agent_client::Client as CrucibleAgentClient; + + let Some(dataset_addr) = dataset.address() else { + eprintln!("Dataset {} missing an IP address", dataset.id()); + continue; + }; + + let url = format!("http://{}", dataset_addr); + let client = CrucibleAgentClient::new(&url); + + let actual_regions = match client.region_list().await { + Ok(v) => v.into_inner(), + Err(e) => { + eprintln!("{} region_list: {e}", dataset_addr); + continue; + } + }; + + for actual_region in actual_regions { + // Skip doing anything if the region is already tombstoned or + // destroyed + match actual_region.state { + State::Destroyed | State::Tombstoned => { + // the Crucible agent will eventually clean this up, or + // already has. + continue; + } + + State::Failed | State::Requested | State::Created => { + // this region needs cleaning up if there isn't an + // associated db record + } + } + + let actual_region_id: Uuid = actual_region.id.0.parse().unwrap(); + if !db_region_ids.contains(&actual_region_id) { + orphaned_bytes += actual_region.block_size + * actual_region.extent_size + * u64::from(actual_region.extent_count); + + match clean_up_orphaned_regions { + CleanUpOrphanedRegions::Yes { .. } => { + match client.region_delete(&actual_region.id).await { + Ok(_) => { + eprintln!( + "{} region {} deleted ok", + dataset_addr, actual_region.id, + ); + } + + Err(e) => { + eprintln!( + "{} region_delete {:?}: {e}", + dataset_addr, actual_region.id, + ); + } + } + } + + CleanUpOrphanedRegions::No => { + // Do not delete this region, just print a row + rows.push(Row { + dataset_id: dataset.id(), + region_id: actual_region_id, + dataset_addr, + error: String::from( + "Nexus does not know about this region!", + ), + }); + } + } + } + } + } + + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .to_string(); + + println!("{}", table); + + eprintln!("found {} orphaned bytes", orphaned_bytes); + + Ok(()) +} + async fn cmd_db_validate_region_snapshots( datastore: &DataStore, ) -> Result<(), anyhow::Error> { @@ -4581,6 +4883,15 @@ async fn cmd_db_validate_region_snapshots( .or_default() .insert(region_snapshot.snapshot_id); + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! + let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + continue; + } + use crucible_agent_client::types::RegionId; use crucible_agent_client::types::State; use crucible_agent_client::Client as CrucibleAgentClient; @@ -4593,11 +4904,21 @@ async fn cmd_db_validate_region_snapshots( let url = format!("http://{}", dataset_addr); let client = CrucibleAgentClient::new(&url); - let actual_region_snapshots = client + let actual_region_snapshots = match client .region_get_snapshots(&RegionId( region_snapshot.region_id.to_string(), )) - .await?; + .await + { + Ok(v) => v, + Err(e) => { + eprintln!( + "{} region_get_snapshots {:?}: {e}", + dataset_addr, region_snapshot.region_id, + ); + continue; + } + }; let snapshot_id = region_snapshot.snapshot_id.to_string(); @@ -4741,6 +5062,15 @@ async fn cmd_db_validate_region_snapshots( // Reconcile with the Crucible agents: are there snapshots that Nexus does // not know about? 
for (dataset, region) in datasets_and_regions { + // If the dataset was expunged, do not attempt to contact the Crucible + // agent! + let in_service = + datastore.dataset_physical_disk_in_service(dataset.id()).await?; + + if !in_service { + continue; + } + use crucible_agent_client::types::RegionId; use crucible_agent_client::types::State; use crucible_agent_client::Client as CrucibleAgentClient; @@ -4753,9 +5083,20 @@ async fn cmd_db_validate_region_snapshots( let url = format!("http://{}", dataset_addr); let client = CrucibleAgentClient::new(&url); - let actual_region_snapshots = client + let actual_region_snapshots = match client .region_get_snapshots(&RegionId(region.id().to_string())) - .await?; + .await + { + Ok(v) => v, + Err(e) => { + eprintln!( + "{} region_get_snapshots {:?}: {e}", + dataset_addr, + region.id(), + ); + continue; + } + }; let default = HashSet::default(); let nexus_region_snapshots: &HashSet = diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 4855f64ac2..2092c3065a 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -101,9 +101,43 @@ declare_saga_actions! { FIND_NEW_REGION -> "new_dataset_and_region" { + rsrss_find_new_region } + // One of the common sharp edges of sagas is that the compensating action of + // a node does _not_ run if the forward action fails. Said another way, for + // this node: + // + // EXAMPLE -> "output" { + // + forward_action + // - forward_action_undo + // } + // + // If `forward_action` fails, `forward_action_undo` is never executed. + // Forward actions are therefore required to be atomic, in that they either + // fully apply or don't apply at all. + // + // Sagas with nodes that ensure multiple regions exist cannot be atomic + // because they can partially fail (for example: what if only 2 out of 3 + // ensures succeed?). In order for the compensating action to be run, it + // must exist as a separate node that has a no-op forward action: + // + // EXAMPLE_UNDO -> "not_used" { + // + noop + // - forward_action_undo + // } + // EXAMPLE -> "output" { + // + forward_action + // } + // + // This saga will only ever ensure that a single region exists, so you might + // think you could get away with a single node that combines the forward and + // compensating action - you'd be mistaken! The Crucible agent's region + // ensure is not atomic in all cases: if the region fails to create, it + // enters the `failed` state, but is not deleted. Nexus must clean these up. 
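+    //
+    // This is why the undo action below lives on its own node with a no-op
+    // forward action: it must still run when the ensure action fails partway.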
+ NEW_REGION_ENSURE_UNDO -> "not_used" { + + rsrss_noop + - rsrss_new_region_ensure_undo + } NEW_REGION_ENSURE -> "ensured_dataset_and_region" { + rsrss_new_region_ensure - - rsrss_new_region_ensure_undo } GET_OLD_SNAPSHOT_VOLUME_ID -> "old_snapshot_volume_id" { + rsrss_get_old_snapshot_volume_id @@ -153,6 +187,7 @@ impl NexusSaga for SagaRegionSnapshotReplacementStart { builder.append(get_alloc_region_params_action()); builder.append(alloc_new_region_action()); builder.append(find_new_region_action()); + builder.append(new_region_ensure_undo_action()); builder.append(new_region_ensure_action()); builder.append(get_old_snapshot_volume_id_action()); builder.append(create_fake_volume_action()); @@ -380,6 +415,10 @@ async fn rsrss_find_new_region( Ok(dataset_and_region) } +async fn rsrss_noop(_sagactx: NexusActionContext) -> Result<(), ActionError> { + Ok(()) +} + async fn rsrss_new_region_ensure( sagactx: NexusActionContext, ) -> Result< @@ -390,10 +429,6 @@ async fn rsrss_new_region_ensure( let osagactx = sagactx.user_data(); let log = osagactx.log(); - // With a list of datasets and regions to ensure, other sagas need to have a - // separate no-op forward step for the undo action to ensure that the undo - // step occurs in the case that the ensure partially fails. Here this is not - // required, there's only one dataset and region. let new_dataset_and_region = sagactx .lookup::<(db::model::Dataset, db::model::Region)>( "new_dataset_and_region", @@ -830,7 +865,7 @@ pub(crate) mod test { /// Create four zpools, a disk, and a snapshot of that disk async fn prepare_for_test( cptestctx: &ControlPlaneTestContext, - ) -> PrepareResult { + ) -> PrepareResult<'_> { let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let datastore = nexus.datastore(); @@ -876,20 +911,21 @@ pub(crate) mod test { panic!("test snapshot {:?} should exist", snapshot_id) }); - PrepareResult { db_disk, snapshot, db_snapshot } + PrepareResult { db_disk, snapshot, db_snapshot, disk_test } } - struct PrepareResult { + struct PrepareResult<'a> { db_disk: nexus_db_model::Disk, snapshot: views::Snapshot, db_snapshot: nexus_db_model::Snapshot, + disk_test: DiskTest<'a, crate::Server>, } #[nexus_test(server = crate::Server)] async fn test_region_snapshot_replacement_start_saga( cptestctx: &ControlPlaneTestContext, ) { - let PrepareResult { db_disk, snapshot, db_snapshot } = + let PrepareResult { db_disk, snapshot, db_snapshot, .. 
} = prepare_for_test(cptestctx).await; let nexus = &cptestctx.server.server_context().nexus; @@ -1009,9 +1045,11 @@ pub(crate) mod test { pub(crate) async fn verify_clean_slate( cptestctx: &ControlPlaneTestContext, + test: &DiskTest<'_, crate::Server>, request: &RegionSnapshotReplacement, affected_volume_original: &Volume, ) { + let sled_agent = &cptestctx.sled_agent.sled_agent; let datastore = cptestctx.server.server_context().nexus.datastore(); crate::app::sagas::test_helpers::assert_no_failed_undo_steps( @@ -1024,6 +1062,10 @@ pub(crate) mod test { // original disk, and three for the (currently unused) snapshot // destination volume assert_eq!(region_allocations(&datastore).await, 6); + + // Assert that only those six provisioned regions are non-destroyed + assert_no_other_ensured_regions(sled_agent, test, &datastore).await; + assert_region_snapshot_replacement_request_untouched( cptestctx, &datastore, &request, ) @@ -1031,11 +1073,12 @@ pub(crate) mod test { assert_volume_untouched(&datastore, &affected_volume_original).await; } - async fn region_allocations(datastore: &DataStore) -> usize { + async fn regions(datastore: &DataStore) -> Vec { use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use async_bb8_diesel::AsyncSimpleConnection; use diesel::QueryDsl; + use diesel::SelectableHelper; use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_queries::db::schema::region::dsl; @@ -1047,15 +1090,56 @@ pub(crate) mod test { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL).await.unwrap(); dsl::region - .count() - .get_result_async(&conn) + .select(db::model::Region::as_select()) + .get_results_async(&conn) .await - .map(|x: i64| x as usize) }) .await .unwrap() } + async fn region_allocations(datastore: &DataStore) -> usize { + regions(datastore).await.len() + } + + async fn assert_no_other_ensured_regions( + sled_agent: &omicron_sled_agent::sim::SledAgent, + test: &DiskTest<'_, crate::Server>, + datastore: &DataStore, + ) { + let mut non_destroyed_regions_from_agent = vec![]; + + for zpool in test.zpools() { + for dataset in &zpool.datasets { + let crucible_dataset = + sled_agent.get_crucible_dataset(zpool.id, dataset.id).await; + for region in crucible_dataset.list().await { + match region.state { + crucible_agent_client::types::State::Tombstoned + | crucible_agent_client::types::State::Destroyed => { + // ok + } + + _ => { + non_destroyed_regions_from_agent + .push(region.clone()); + } + } + } + } + } + + let db_regions = regions(datastore).await; + let db_region_ids: Vec = + db_regions.iter().map(|x| x.id()).collect(); + + for region in non_destroyed_regions_from_agent { + let region_id = region.id.0.parse().unwrap(); + let contains = db_region_ids.contains(®ion_id); + assert!(contains, "db does not have {:?}", region_id); + } + } + async fn assert_region_snapshot_replacement_request_untouched( cptestctx: &ControlPlaneTestContext, datastore: &DataStore, @@ -1094,11 +1178,78 @@ pub(crate) mod test { assert_eq!(actual, expected); } + #[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let PrepareResult { db_disk, snapshot, db_snapshot, disk_test } = + prepare_for_test(cptestctx).await; + + let log = &cptestctx.logctx.log; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(cptestctx); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + 
assert_eq!(disk_allocated_regions.len(), 3); + + let region: &nexus_db_model::Region = &disk_allocated_regions[0].1; + let snapshot_id = snapshot.identity.id; + + let region_snapshot = datastore + .region_snapshot_get(region.dataset_id(), region.id(), snapshot_id) + .await + .unwrap() + .unwrap(); + + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request.clone()) + .await + .unwrap(); + + let affected_volume_original = + datastore.volume_get(db_snapshot.volume_id).await.unwrap().unwrap(); + + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; + + crate::app::sagas::test_helpers::action_failure_can_unwind::< + SagaRegionSnapshotReplacementStart, + _, + _, + >( + nexus, + || Box::pin(async { new_test_params(&opctx, &request) }), + || { + Box::pin(async { + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; + }) + }, + log, + ) + .await; + } + #[nexus_test(server = crate::Server)] async fn test_action_failure_can_unwind_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let PrepareResult { db_disk, snapshot, db_snapshot } = + let PrepareResult { db_disk, snapshot, db_snapshot, disk_test } = prepare_for_test(cptestctx).await; let log = &cptestctx.logctx.log; @@ -1130,8 +1281,13 @@ pub(crate) mod test { let affected_volume_original = datastore.volume_get(db_snapshot.volume_id).await.unwrap().unwrap(); - verify_clean_slate(&cptestctx, &request, &affected_volume_original) - .await; + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; crate::app::sagas::test_helpers::action_failure_can_unwind_idempotently::< SagaRegionSnapshotReplacementStart, @@ -1143,6 +1299,7 @@ pub(crate) mod test { || Box::pin(async { verify_clean_slate( &cptestctx, + &disk_test, &request, &affected_volume_original, ).await; @@ -1155,7 +1312,7 @@ pub(crate) mod test { async fn test_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let PrepareResult { db_disk, snapshot, db_snapshot: _ } = + let PrepareResult { db_disk, snapshot, .. } = prepare_for_test(cptestctx).await; let nexus = &cptestctx.server.server_context().nexus; @@ -1192,4 +1349,66 @@ pub(crate) mod test { ) .await; } + + /// Assert this saga does not leak regions if the replacement read-only + /// region cannot be created. 
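+    ///
+    /// The simulated Crucible agent is configured to fail region creation, so
+    /// the saga must unwind and clean up the resulting `failed` region.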
+ #[nexus_test(server = crate::Server)] + async fn test_no_leak_region(cptestctx: &ControlPlaneTestContext) { + let PrepareResult { db_disk, snapshot, db_snapshot, disk_test } = + prepare_for_test(cptestctx).await; + + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(cptestctx); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(disk_allocated_regions.len(), 3); + + let region: &nexus_db_model::Region = &disk_allocated_regions[0].1; + let snapshot_id = snapshot.identity.id; + + let region_snapshot = datastore + .region_snapshot_get(region.dataset_id(), region.id(), snapshot_id) + .await + .unwrap() + .unwrap(); + + let request = + RegionSnapshotReplacement::for_region_snapshot(®ion_snapshot); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request.clone()) + .await + .unwrap(); + + let affected_volume_original = + datastore.volume_get(db_snapshot.volume_id).await.unwrap().unwrap(); + + disk_test.set_always_fail_callback().await; + + // Run the region snapshot replacement start saga + let dag = + create_saga_dag::(Params { + serialized_authn: Serialized::for_opctx(&opctx), + request: request.clone(), + allocation_strategy: RegionAllocationStrategy::Random { + seed: None, + }, + }) + .unwrap(); + + let runnable_saga = nexus.sagas.saga_prepare(dag).await.unwrap(); + + // Actually run the saga + runnable_saga.run_to_completion().await.unwrap(); + + verify_clean_slate( + &cptestctx, + &disk_test, + &request, + &affected_volume_original, + ) + .await; + } } diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index d9888f9ccd..e381146fc0 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -769,10 +769,9 @@ async fn test_disk_region_creation_failure( .await .unwrap(); - // After the failed allocation, the disk should be Faulted + // After the failed allocation, the disk creation should have unwound let disks = disks_list(&client, &disks_url).await; - assert_eq!(disks.len(), 1); - assert_eq!(disks[0].state, DiskState::Faulted); + assert_eq!(disks.len(), 0); } // Tests that invalid block sizes are rejected diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index dc8cf63fe4..8fd648096a 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -168,12 +168,6 @@ impl CrucibleDataInner { let id = Uuid::from_str(&id.0).unwrap(); if let Some(region) = self.regions.get_mut(&id) { - if region.state == State::Failed { - // The real Crucible agent would not let a Failed region be - // deleted - bail!("cannot delete in state Failed"); - } - region.state = State::Destroyed; self.used_ports.remove(®ion.port_number); Ok(Some(region.clone())) From ad61b013dcea2f408660d15295b0f90a9be71edd Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Wed, 18 Dec 2024 09:09:23 -0800 Subject: [PATCH 2/9] live-tests: avoid relying on poorly-behaving DNS (#7267) 1. Live tests don't currently run because the check that you are on an underlay network asks internal DNS for A/AAAA records for internal DNS (via `hickory_resolver::Resolver::lookup_ip`). We don't have those, but prior to #6308 we responded with whatever records we had for a given name regardless of what type the query asked for. Live tests aren't run in CI so we missed it. 2. 
`internal_dns_resolver::Resolver::lookup_ip` is now only used here and in a unit test that is arguably doing something similar, so we remove the function altogether. --- internal-dns/resolver/src/resolver.rs | 18 ++---------------- live-tests/tests/common/mod.rs | 2 +- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/internal-dns/resolver/src/resolver.rs b/internal-dns/resolver/src/resolver.rs index af47bb23ad..8d0e071e19 100644 --- a/internal-dns/resolver/src/resolver.rs +++ b/internal-dns/resolver/src/resolver.rs @@ -12,7 +12,7 @@ use omicron_common::address::{ get_internal_dns_server_addresses, Ipv6Subnet, AZ_PREFIX, DNS_PORT, }; use slog::{debug, error, info, trace}; -use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; #[derive(Debug, Clone, thiserror::Error)] pub enum ResolveError { @@ -323,20 +323,6 @@ impl Resolver { } } - pub async fn lookup_ip( - &self, - srv: ServiceName, - ) -> Result { - let name = srv.srv_name(); - debug!(self.log, "lookup srv"; "dns_name" => &name); - let response = self.resolver.lookup_ip(&name).await?; - let address = response - .iter() - .next() - .ok_or_else(|| ResolveError::NotFound(srv))?; - Ok(address) - } - /// Returns an iterator of [`SocketAddrV6`]'s for the targets of the given /// SRV lookup response. // SRV records have a target, which is itself another DNS name that needs @@ -534,7 +520,7 @@ mod test { let resolver = dns_server.resolver().unwrap(); let err = resolver - .lookup_ip(ServiceName::Cockroach) + .lookup_srv(ServiceName::Cockroach) .await .expect_err("Looking up non-existent service should fail"); diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index 50f84d0b59..8a60b332b6 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -157,7 +157,7 @@ async fn check_execution_environment( // The only real requirement for these tests is that they're run from a // place with connectivity to the underlay network of a deployed control // plane. The easiest way to tell is to look up something in internal DNS. - resolver.lookup_ip(ServiceName::InternalDns).await.map_err(|e| { + resolver.lookup_srv(ServiceName::InternalDns).await.map_err(|e| { let text = format!( "check_execution_environment(): failed to look up internal DNS \ in the internal DNS servers.\n\n \ From 7297d76de8834c0f6dbe3007509adf0f789086a2 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 18 Dec 2024 13:01:50 -0500 Subject: [PATCH 3/9] Only perform region replacement for r/w regions (#7243) Region replacement does not work for read-only regions (there is a check in the region_replacement_start saga that bails out if the supplied region is read-only) - this is tracked by #6172. The current plan to make read-only region replacement work is to use the same machinery as region snapshot replacements (as they're both read-only), so this commit changes the current query that finds expunged regions to replace to only return read/write regions. This can be changed back in the future if the plan for #6172 changes. 
--- nexus/db-queries/src/db/datastore/region.rs | 6 ++++-- nexus/db-queries/src/db/datastore/region_replacement.rs | 7 +++++++ nexus/src/app/background/tasks/region_replacement.rs | 7 ++++--- nexus/tests/integration_tests/disks.rs | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 885cb622b8..8e59462aa3 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -422,8 +422,8 @@ impl DataStore { } } - /// Find regions on expunged disks - pub async fn find_regions_on_expunged_physical_disks( + /// Find read/write regions on expunged disks + pub async fn find_read_write_regions_on_expunged_physical_disks( &self, opctx: &OpContext, ) -> LookupResult> { @@ -450,6 +450,8 @@ impl DataStore { )) .select(dataset_dsl::id) )) + // only return read-write regions here + .filter(region_dsl::read_only.eq(false)) .select(Region::as_select()) .load_async(&*conn) .await diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index 0fda6b46ba..0d259ba47e 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -37,6 +37,13 @@ impl DataStore { opctx: &OpContext, region: &Region, ) -> Result { + if region.read_only() { + return Err(Error::invalid_request(format!( + "region {} is read-only", + region.id(), + ))); + } + let request = RegionReplacement::for_region(region); let request_id = request.id; diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index f86ba8eb8f..71b49c5a6f 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -68,17 +68,18 @@ impl BackgroundTask for RegionReplacementDetector { let mut status = RegionReplacementStatus::default(); - // Find regions on expunged physical disks + // Find read/write regions on expunged physical disks let regions_to_be_replaced = match self .datastore - .find_regions_on_expunged_physical_disks(opctx) + .find_read_write_regions_on_expunged_physical_disks(opctx) .await { Ok(regions) => regions, Err(e) => { let s = format!( - "find_regions_on_expunged_physical_disks failed: {e}" + "find_read_write_regions_on_expunged_physical_disks \ + failed: {e}" ); error!(&log, "{s}"); status.errors.push(s); diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index e381146fc0..db16113dd7 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -2577,7 +2577,7 @@ async fn test_disk_expunge(cptestctx: &ControlPlaneTestContext) { // All three regions should be returned let expunged_regions = datastore - .find_regions_on_expunged_physical_disks(&opctx) + .find_read_write_regions_on_expunged_physical_disks(&opctx) .await .unwrap(); From 510ef5a7849afa01ce28a706e109cbd1cf360370 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 18 Dec 2024 13:35:24 -0800 Subject: [PATCH 4/9] [sled-agent] Avoid causing UUID conflicts (#7266) - Avoids overwriting the value of "dataset UUID" when creating datasets from `omicron_zones_ensure`. Instead, don't set any dataset UUID, which lets subsequent calls to `datasets_ensure` set the right value here. 
Fixes https://github.com/oxidecomputer/omicron/issues/7265 --- sled-agent/src/sled_agent.rs | 21 +++++-- sled-storage/src/manager.rs | 108 ++++++++++++++++++++++++++--------- 2 files changed, 97 insertions(+), 32 deletions(-) diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index b9bf703933..37526690cb 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -962,12 +962,21 @@ impl SledAgent { continue; }; - // First, ensure the dataset exists - let dataset_id = zone.id.into_untyped_uuid(); - self.inner - .storage - .upsert_filesystem(dataset_id, dataset_name) - .await?; + // NOTE: This code will be deprecated by https://github.com/oxidecomputer/omicron/pull/7160 + // + // However, we need to ensure that all blueprints have datasets + // within them before we can remove this back-fill. + // + // Therefore, we do something hairy here: We ensure the filesystem + // exists, but don't specify any dataset UUID value. + // + // This means that: + // - If the dataset exists and has a UUID, this will be a no-op + // - If the dataset doesn't exist, it'll be created without its + // oxide:uuid zfs property set + // - If a subsequent call to "datasets_ensure" tries to set a UUID, + // it should be able to get set (once). + self.inner.storage.upsert_filesystem(None, dataset_name).await?; } self.inner diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index f0653c7905..d6ffd42d0a 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -35,7 +35,6 @@ use std::collections::HashSet; use std::future::Future; use tokio::sync::{mpsc, oneshot, watch}; use tokio::time::{interval, Duration, MissedTickBehavior}; -use uuid::Uuid; // The size of the mpsc bounded channel used to communicate // between the `StorageHandle` and `StorageManager`. @@ -100,7 +99,7 @@ enum StorageManagerState { #[derive(Debug)] pub(crate) struct NewFilesystemRequest { - dataset_id: Uuid, + dataset_id: Option, dataset_name: DatasetName, responder: DebugIgnore>>, } @@ -526,7 +525,7 @@ impl StorageHandle { // and ask for the set of all datasets from Nexus. pub async fn upsert_filesystem( &self, - dataset_id: Uuid, + dataset_id: Option, dataset_name: DatasetName, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); @@ -1499,27 +1498,9 @@ impl StorageManager { zoned, encryption_details, size_details, - id: Some(DatasetUuid::from_untyped_uuid(request.dataset_id)), + id: request.dataset_id, additional_options: None, })?; - // Ensure the dataset has a usable UUID. - if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { - if let Ok(id) = id_str.parse::() { - if id != request.dataset_id { - return Err(Error::UuidMismatch { - name: request.dataset_name.full_name(), - old: id, - new: request.dataset_id, - }); - } - return Ok(()); - } - } - Zfs::set_oxide_value( - &fs_name, - "uuid", - &request.dataset_id.to_string(), - )?; Ok(()) } @@ -1544,7 +1525,6 @@ mod tests { use std::collections::BTreeMap; use std::str::FromStr; use std::sync::atomic::Ordering; - use uuid::Uuid; // A helper struct to advance time. struct TimeTravel {} @@ -2005,16 +1985,92 @@ mod tests { .expect("Ensuring disks should work after key manager is ready"); assert!(!result.has_error(), "{:?}", result); - // Create a filesystem on the newly formatted U.2 - let dataset_id = Uuid::new_v4(); + // Create a filesystem on the newly formatted U.2. + // + // We can call "upsert_filesystem" both with and without a UUID. 
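+        // First with an explicit UUID (which should be recorded on the
+        // dataset), then without one (which should leave that UUID untouched).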
+ let dataset_id = DatasetUuid::new_v4(); + let zpool_name = ZpoolName::new_external(config.disks[0].pool_id); + let dataset_name = + DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); + harness + .handle() + .upsert_filesystem(Some(dataset_id), dataset_name.clone()) + .await + .unwrap(); + // Observe the dataset exists, and the UUID is set. + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, Some(dataset_id)); + + harness + .handle() + .upsert_filesystem(None, dataset_name.clone()) + .await + .unwrap(); + // Observe the dataset still exists, and the UUID is still set, + // even though we did not ask for a new value explicitly. + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, Some(dataset_id)); + + harness.cleanup().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn upsert_filesystem_no_uuid() { + illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); + let logctx = test_setup_log("upsert_filesystem"); + let mut harness = StorageManagerTestHarness::new(&logctx.log).await; + + // Test setup: Add a U.2 and M.2, adopt them into the "control plane" + // for usage. + harness.handle().key_manager_ready().await; + let raw_disks = + harness.add_vdevs(&["u2_under_test.vdev", "m2_helping.vdev"]).await; + let config = harness.make_config(1, &raw_disks); + let result = harness + .handle() + .omicron_physical_disks_ensure(config.clone()) + .await + .expect("Ensuring disks should work after key manager is ready"); + assert!(!result.has_error(), "{:?}", result); + + // Create a filesystem on the newly formatted U.2, without a UUID let zpool_name = ZpoolName::new_external(config.disks[0].pool_id); let dataset_name = DatasetName::new(zpool_name.clone(), DatasetKind::Crucible); harness .handle() - .upsert_filesystem(dataset_id, dataset_name) + .upsert_filesystem(None, dataset_name.clone()) .await .unwrap(); + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, None); + + // Later, we can set the UUID to a specific value + let dataset_id = DatasetUuid::new_v4(); + harness + .handle() + .upsert_filesystem(Some(dataset_id), dataset_name.clone()) + .await + .unwrap(); + let observed_dataset = &Zfs::get_dataset_properties( + &[dataset_name.full_name()], + WhichDatasets::SelfOnly, + ) + .unwrap()[0]; + assert_eq!(observed_dataset.id, Some(dataset_id)); harness.cleanup().await; logctx.cleanup_successful(); From 72ac078a8d751d85d784e0027c92098be660091b Mon Sep 17 00:00:00 2001 From: David Crespo Date: Wed, 18 Dec 2024 15:45:31 -0600 Subject: [PATCH 5/9] Bump web console (sled policy, lighter badge) (#7273) https://github.com/oxidecomputer/console/compare/c1ebd8d9...d583ae70 * [d583ae70](https://github.com/oxidecomputer/console/commit/d583ae70) oxidecomputer/console#2624 * [fe973ed5](https://github.com/oxidecomputer/console/commit/fe973ed5) oxidecomputer/console#2625 --- tools/console_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/console_version b/tools/console_version index 85ca41f755..ef52d38564 100644 --- a/tools/console_version +++ b/tools/console_version @@ -1,2 +1,2 @@ -COMMIT="c1ebd8d9acae4ff7a09b2517265fba52ebdfe82e" -SHA2="840dbfda1c0def66212e7602d7be6e8acf1b26ba218f10ce3e627df49f5ce9e2" 
+COMMIT="d583ae70786db46ace23f0e45bc9fdcffc21e6ae" +SHA2="590fda51f0599879effea4f4b2754ec572f6c1f4d5a586016ff6a2e2d8b6f5f9" From d337333577c582e44ea02bc711d86b4f11b122be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Thu, 19 Dec 2024 15:02:18 +1300 Subject: [PATCH 6/9] [clickhouse] Clickana monitoring dashboard tool (#7207) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Overview As part of Stage 1 of [RFD468](https://rfd.shared.oxide.computer/rfd/0468) we'll be observing how a ClickHouse cluster behaves in comparison with a single node server. This commit introduces a basic tool that lets us visualize internal ClickHouse metric information. As a starting point, Clickana only has 4 charts, and the user may not choose what these are. Additionally, it is only capable of rendering data by making API calls. I'd like to make the tool more flexible; other capabilities will be added in follow up PRs. ### Usage ```console clickana --help Usage: clickana [OPTIONS] --clickhouse-addr Options: -l, --log-path Path to the log file [env: CLICKANA_LOG_PATH=] [default: /tmp/clickana.log] -a, --clickhouse-addr Address where a clickhouse admin server is listening on -s, --sampling-interval The interval to collect monitoring data in seconds [default: 60] -t, --time-range Range of time to collect monitoring data in seconds [default: 3600] -r, --refresh-interval The interval at which the dashboards will refresh [default: 60] -h, --help Print help ``` ### Manual Testing ``` root@oxz_clickhouse_015f9c34:~# /opt/oxide/clickana/bin/clickana -a [fd00:1122:3344:101::e]:8888 ``` Screenshot 2024-12-12 at 4 11 15 PM ### Next Steps - Let the user set which metrics they would like to visualise in each chart. This may be nice to do through a TOML file or something. We could let them choose which unit to represent them in as well perhaps. - Have more metrics available. - It'd be nice to have the ability to take the timeseries as JSON instead of calling the API as well. This could be useful in the future to have some insight into our customer's racks for debugging purposes. We could include ClickHouse internal metric timeseries as part of the support bundles and they could be visualised via Clickana. WDYT @smklein ? 
Related: https://github.com/oxidecomputer/omicron/issues/6953 --- Cargo.lock | 26 + Cargo.toml | 3 + clickhouse-admin/Cargo.toml | 2 + clickhouse-admin/types/src/lib.rs | 22 +- dev-tools/clickana/Cargo.toml | 31 + dev-tools/clickana/src/bin/clickana.rs | 57 ++ dev-tools/clickana/src/chart.rs | 765 +++++++++++++++++++++++++ dev-tools/clickana/src/lib.rs | 274 +++++++++ openapi/clickhouse-admin-server.json | 13 +- openapi/clickhouse-admin-single.json | 13 +- package-manifest.toml | 15 +- sled-agent/src/services.rs | 19 +- 12 files changed, 1198 insertions(+), 42 deletions(-) create mode 100644 dev-tools/clickana/Cargo.toml create mode 100644 dev-tools/clickana/src/bin/clickana.rs create mode 100644 dev-tools/clickana/src/chart.rs create mode 100644 dev-tools/clickana/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 44ef4eccc5..4b1809d3b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1332,6 +1332,32 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +[[package]] +name = "clickana" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "chrono", + "clap", + "clickhouse-admin-server-client", + "clickhouse-admin-types", + "dropshot 0.13.0", + "futures", + "omicron-common", + "omicron-workspace-hack", + "ratatui", + "schemars", + "serde_json", + "slog", + "slog-async", + "slog-dtrace", + "slog-error-chain", + "slog-term", + "tokio", + "tokio-postgres", +] + [[package]] name = "clickhouse-admin-api" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index bb9ff0e80f..3d8efe3acb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ members = [ "cockroach-admin/types", "common", "dev-tools/cert-dev", + "dev-tools/clickana", "dev-tools/clickhouse-cluster-dev", "dev-tools/ch-dev", "dev-tools/crdb-seed", @@ -158,6 +159,7 @@ default-members = [ "cockroach-admin/types", "common", "dev-tools/cert-dev", + "dev-tools/clickana", "dev-tools/clickhouse-cluster-dev", "dev-tools/ch-dev", "dev-tools/crdb-seed", @@ -332,6 +334,7 @@ chrono = { version = "0.4", features = [ "serde" ] } chrono-tz = "0.10.0" ciborium = "0.2.2" clap = { version = "4.5", features = ["cargo", "derive", "env", "wrap_help"] } +clickana = { path = "dev-tools/clickana" } clickhouse-admin-api = { path = "clickhouse-admin/api" } clickhouse-admin-keeper-client = { path = "clients/clickhouse-admin-keeper-client" } clickhouse-admin-server-client = { path = "clients/clickhouse-admin-server-client" } diff --git a/clickhouse-admin/Cargo.toml b/clickhouse-admin/Cargo.toml index 80b080b2ff..65ceb3fdbf 100644 --- a/clickhouse-admin/Cargo.toml +++ b/clickhouse-admin/Cargo.toml @@ -22,7 +22,9 @@ slog.workspace = true slog-async.workspace = true slog-dtrace.workspace = true slog-error-chain.workspace = true +slog-term.workspace = true serde.workspace = true +serde_json.workspace = true thiserror.workspace = true tokio.workspace = true tokio-postgres.workspace = true diff --git a/clickhouse-admin/types/src/lib.rs b/clickhouse-admin/types/src/lib.rs index 3b8696438c..aa1437bedc 100644 --- a/clickhouse-admin/types/src/lib.rs +++ b/clickhouse-admin/types/src/lib.rs @@ -1200,7 +1200,7 @@ pub enum Timestamp { #[derive(Debug, Serialize, Deserialize, JsonSchema, PartialEq)] #[serde(rename_all = "snake_case")] pub struct SystemTimeSeries { - pub time: Timestamp, + pub time: String, pub value: f64, // TODO: Would be really nice to have an enum with possible units (s, ms, bytes) // Not sure if I can even add this, the system tables 
don't mention units at all. @@ -2099,15 +2099,15 @@ snapshot_storage_disk=LocalSnapshotDisk let expected = vec![ SystemTimeSeries { - time: crate::Timestamp::Unix("1732494720".to_string()), + time: "1732494720".to_string(), value: 110220450825.75238, }, SystemTimeSeries { - time: crate::Timestamp::Unix("1732494840".to_string()), + time: "1732494840".to_string(), value: 110339992917.33331, }, SystemTimeSeries { - time: crate::Timestamp::Unix("1732494960".to_string()), + time: "1732494960".to_string(), value: 110421854037.33331, }, ]; @@ -2127,21 +2127,15 @@ snapshot_storage_disk=LocalSnapshotDisk let expected = vec![ SystemTimeSeries { - time: crate::Timestamp::Utc( - "2024-11-25T00:34:00Z".parse::>().unwrap(), - ), + time: "2024-11-25T00:34:00Z".to_string(), value: 110220450825.75238, }, SystemTimeSeries { - time: crate::Timestamp::Utc( - "2024-11-25T00:35:00Z".parse::>().unwrap(), - ), + time: "2024-11-25T00:35:00Z".to_string(), value: 110339992917.33331, }, SystemTimeSeries { - time: crate::Timestamp::Utc( - "2024-11-25T00:36:00Z".parse::>().unwrap(), - ), + time: "2024-11-25T00:36:00Z".to_string(), value: 110421854037.33331, }, ]; @@ -2176,7 +2170,7 @@ snapshot_storage_disk=LocalSnapshotDisk assert_eq!( format!("{}", root_cause), - "data did not match any variant of untagged enum Timestamp at line 1 column 12", + "invalid type: integer `2024`, expected a string at line 1 column 12", ); } } diff --git a/dev-tools/clickana/Cargo.toml b/dev-tools/clickana/Cargo.toml new file mode 100644 index 0000000000..a9f91b890b --- /dev/null +++ b/dev-tools/clickana/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "clickana" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true +camino.workspace = true +chrono.workspace = true +clap.workspace = true +clickhouse-admin-types.workspace = true +clickhouse-admin-server-client.workspace = true +dropshot.workspace = true +futures.workspace = true +omicron-common.workspace = true +ratatui.workspace = true +schemars.workspace = true +slog.workspace = true +slog-async.workspace = true +slog-dtrace.workspace = true +slog-error-chain.workspace = true +slog-term.workspace = true +serde_json.workspace = true +tokio.workspace = true +tokio-postgres.workspace = true + +omicron-workspace-hack.workspace = true + +[lints] +workspace = true diff --git a/dev-tools/clickana/src/bin/clickana.rs b/dev-tools/clickana/src/bin/clickana.rs new file mode 100644 index 0000000000..0c8d06156e --- /dev/null +++ b/dev-tools/clickana/src/bin/clickana.rs @@ -0,0 +1,57 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
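+
+//! Command-line entry point for Clickana, which renders terminal dashboards
+//! of internal ClickHouse metrics fetched from a clickhouse-admin server.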
+ +use anyhow::Result; +use camino::Utf8PathBuf; +use clap::Parser; +use clickana::Clickana; +use std::net::SocketAddr; + +const CLICKANA_LOG_FILE: &str = "/tmp/clickana.log"; + +#[tokio::main] +async fn main() -> Result<()> { + let args = Cli::parse(); + + let terminal = ratatui::init(); + let result = Clickana::new( + args.clickhouse_addr, + args.log_path, + args.sampling_interval, + args.time_range, + args.refresh_interval, + ) + .run(terminal) + .await; + ratatui::restore(); + result +} + +#[derive(Debug, Parser)] +struct Cli { + /// Path to the log file + #[arg( + long, + short, + env = "CLICKANA_LOG_PATH", + default_value = CLICKANA_LOG_FILE, + )] + log_path: Utf8PathBuf, + + /// Address where a clickhouse admin server is listening on + #[arg(long, short = 'a')] + clickhouse_addr: SocketAddr, + + /// The interval to collect monitoring data in seconds + #[arg(long, short, default_value_t = 60)] + sampling_interval: u64, + + /// Range of time to collect monitoring data in seconds + #[arg(long, short, default_value_t = 3600)] + time_range: u64, + + /// The interval at which the dashboards will refresh + #[arg(long, short, default_value_t = 60)] + refresh_interval: u64, +} diff --git a/dev-tools/clickana/src/chart.rs b/dev-tools/clickana/src/chart.rs new file mode 100644 index 0000000000..f8a78fb63d --- /dev/null +++ b/dev-tools/clickana/src/chart.rs @@ -0,0 +1,765 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::{bail, Result}; +use chrono::{DateTime, Utc}; +use clickhouse_admin_server_client::types::{SystemTable, SystemTimeSeries}; +use ratatui::{ + layout::{Constraint, Rect}, + style::{Color, Style, Stylize}, + symbols::Marker, + text::Line, + widgets::{Axis, Block, Chart, Dataset, GraphType, LegendPosition}, + Frame, +}; +use std::fmt::Display; + +// Ratatui requires data points in a Dataset to be f64 +const GIBIBYTE_F64: f64 = 1073741824.0; +const MEBIBYTE_F64: f64 = 1048576.0; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Unit { + Count, + Gibibyte, + Mebibyte, +} + +impl Display for Unit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Unit::Count => "", + Unit::Gibibyte => "GiB", + Unit::Mebibyte => "MiB", + }; + write!(f, "{s}") + } +} + +impl Unit { + /// Returns the value of the unit represented in bytes. + fn as_bytes_f64(&self) -> Result { + let bytes = match self { + Unit::Gibibyte => GIBIBYTE_F64, + Unit::Mebibyte => MEBIBYTE_F64, + Unit::Count => bail!("Count cannot be converted into bytes"), + }; + Ok(bytes) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum MetricName { + DiskUsage, + MemoryTracking, + QueryCount, + RunningQueries, +} + +impl Display for MetricName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + MetricName::DiskUsage => "DiskUsed_default", + MetricName::MemoryTracking => "CurrentMetric_MemoryTracking", + MetricName::QueryCount => "ProfileEvent_Query", + MetricName::RunningQueries => "CurrentMetric_Query", + }; + write!(f, "{s}") + } +} + +impl MetricName { + /// Returns the associated table to query for each metric. 
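+    /// Disk usage comes from the asynchronous metric log; the remaining
+    /// metrics are read from the regular metric log.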
+ pub fn table(&self) -> SystemTable { + match self { + MetricName::DiskUsage => SystemTable::AsynchronousMetricLog, + MetricName::MemoryTracking + | MetricName::QueryCount + | MetricName::RunningQueries => SystemTable::MetricLog, + } + } + + /// Returns the unit the data values will be represented as. + fn unit(&self) -> Unit { + match self { + MetricName::DiskUsage => Unit::Gibibyte, + MetricName::MemoryTracking => Unit::Mebibyte, + MetricName::QueryCount | MetricName::RunningQueries => Unit::Count, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ChartMetadata { + pub title: String, + pub unit: Unit, +} + +impl ChartMetadata { + pub fn new(metric: MetricName, title: String) -> Self { + let unit = metric.unit(); + Self { title, unit } + } +} + +#[derive(Debug)] +struct TimeSeriesValues { + /// A collection of all the values from the timeseries + values: Vec, +} + +impl TimeSeriesValues { + fn new(raw_data: &Vec) -> Self { + let values: Vec = raw_data.iter().map(|ts| ts.value).collect(); + Self { values } + } + + fn min(&self) -> Result<&f64> { + let Some(min_value) = + self.values.iter().min_by(|a, b| a.partial_cmp(b).unwrap()) + else { + bail!("no values have been retrieved") + }; + + Ok(min_value) + } + + fn max(&self) -> Result<&f64> { + let Some(max_value) = + self.values.iter().max_by(|a, b| a.partial_cmp(b).unwrap()) + else { + bail!("no values have been retrieved") + }; + + Ok(max_value) + } + + /// Returns the average value of the max and min values. + fn avg(&self, min_value_label: f64, max_value_label: f64) -> f64 { + (min_value_label + max_value_label) / 2.0 + } +} + +// The result of the following functions will not be precise, but it doesn't +// matter since we just want an estimate for the chart's labels and bounds. +// all we need are values that are larger than the maximum value in the +// timeseries or smaller than the minimum value in the timeseries. + +/// Returns the sum of the maximum raw value and 1 or the equivalent of 1 +/// MiB or GiB in bytes. +fn padded_max_value_raw(unit: Unit, max_value_raw: &f64) -> Result { + let ceil_value = max_value_raw.ceil(); + let padded_value = match unit { + Unit::Count => ceil_value + 1.0, + Unit::Gibibyte | Unit::Mebibyte => ceil_value + unit.as_bytes_f64()?, + }; + Ok(padded_value) +} + +/// Returns the sum of the max raw value and 1 or the equivalent of 1 +/// Mib or Gib. +fn padded_max_value_as_unit(unit: Unit, max_value_raw: &f64) -> Result { + let label_value = match unit { + Unit::Count => max_value_raw + 1.0, + Unit::Gibibyte | Unit::Mebibyte => { + (max_value_raw / unit.as_bytes_f64()?) + 1.0 + } + }; + Ok(label_value.ceil()) +} + +/// Returns the difference of the minimum raw value and 1 or the equivalent +/// of 1 in MiB or GiB in bytes. If the minimum is equal to or less than 1.0, +/// or the equivalent of 1 once converted from bytes to the expected unit +/// (e.g. less than or equal to 1048576 if we're using MiB) we'll use 0.0 as +/// the minimum value as we don't expect any of our charts +/// to require negative numbers for now. 
+fn padded_min_value_raw(unit: Unit, min_value_raw: &f64) -> Result { + let padded_value = match unit { + Unit::Count => { + if *min_value_raw <= 1.0 { + 0.0 + } else { + min_value_raw - 1.0 + } + } + Unit::Gibibyte | Unit::Mebibyte => { + let bytes = unit.as_bytes_f64()?; + if *min_value_raw <= bytes { + 0.0 + } else { + min_value_raw - bytes + } + } + }; + Ok(padded_value.floor()) +} + +/// Returns the difference of the minimum raw value and 1 or the equivalent +/// of 1 in MiB or GiB in bytes. If the minimum is less than 1, we'll use +/// 0 as the minimum value as we don't expect any of our charts to require +/// negative numbers for now. +fn padded_min_value_as_unit(unit: Unit, min_value_raw: &f64) -> Result { + let padded_value = match unit { + Unit::Count => { + if *min_value_raw < 1.0 { + 0.0 + } else { + min_value_raw - 1.0 + } + } + Unit::Gibibyte | Unit::Mebibyte => { + let value_as_unit = min_value_raw / unit.as_bytes_f64()?; + if value_as_unit < 1.0 { + 0.0 + } else { + value_as_unit - 1.0 + } + } + }; + Ok(padded_value.floor()) +} + +#[derive(Debug, PartialEq)] +struct YAxisValues { + lower_label: String, + mid_label: String, + upper_label: String, + lower_bound: f64, + upper_bound: f64, +} + +impl YAxisValues { + fn new(unit: Unit, raw_data: &Vec) -> Result { + // Retrieve values only to create Y axis bounds and labels + let values = TimeSeriesValues::new(&raw_data); + let max_value = values.max()?; + let min_value = values.min()?; + + // In case there is very little variance in the y axis, we will be adding some + // padding to the bounds and labels so we don't end up with repeated labels or + // straight lines too close to the upper bounds. + let upper_bound = padded_max_value_raw(unit, max_value)?; + let upper_label_as_unit = padded_max_value_as_unit(unit, max_value)?; + let lower_bound = padded_min_value_raw(unit, min_value)?; + let lower_label_as_unit = padded_min_value_as_unit(unit, min_value)?; + let mid_label_as_unit = + values.avg(lower_label_as_unit, upper_label_as_unit); + + // To nicely display the mid value label for the Y axis, we do the following: + // - It is not displayed it if it is a 0.0. + // - If it does not have a fractional number we display it as an integer. + // - Else, we display the number as is up to the first fractional number. 
+ //let mid_value = mid_label; + let fractional_of_mid_value = + mid_label_as_unit - mid_label_as_unit.floor(); + let mid_value_formatted = format!("{:.1}", mid_label_as_unit); + let mid_label = if mid_value_formatted == *"0.0" { + "".to_string() + } else if fractional_of_mid_value == 0.0 { + format!( + "{} {}", + mid_value_formatted.split('.').next().unwrap(), + unit + ) + } else { + format!("{} {}", mid_value_formatted, unit) + }; + + let upper_label = format!("{} {}", upper_label_as_unit, unit); + let lower_label = format!("{} {}", lower_label_as_unit, unit); + + Ok(Self { + lower_label, + mid_label, + upper_label, + lower_bound, + upper_bound, + }) + } +} + +#[derive(Debug)] +struct TimeSeriesTimestamps { + /// A collection of all the timestamps from the timeseries + timestamps: Vec, +} + +impl TimeSeriesTimestamps { + fn new(raw_data: &Vec) -> Self { + let timestamps: Vec = raw_data + .iter() + .map(|ts| { + ts.time.trim_matches('"').parse::().unwrap_or_else(|_| { + panic!("could not parse timestamp {} into i64", ts.time) + }) + }) + .collect(); + Self { timestamps } + } + + fn min(&self) -> Result<&i64> { + let Some(start_time) = self.timestamps.iter().min() else { + bail!("failed to retrieve start time, timestamp list is empty") + }; + Ok(start_time) + } + + fn max(&self) -> Result<&i64> { + let Some(end_time) = self.timestamps.iter().max() else { + bail!("failed to retrieve end time, timestamp list is empty") + }; + Ok(end_time) + } + + fn avg(&self, start_time: &i64, end_time: &i64) -> i64 { + (start_time + end_time) / 2 + } +} + +#[derive(Debug, PartialEq)] +pub struct XAxisTimestamps { + mid_time_label: DateTime, + pub start_time_label: DateTime, + pub end_time_label: DateTime, + start_time_bound: f64, + end_time_bound: f64, +} + +impl XAxisTimestamps { + fn new(raw_data: &Vec) -> Result { + // Retrieve timestamps only to create chart bounds and labels + let timestamps = TimeSeriesTimestamps::new(&raw_data); + // These timestamps will be used to calculate start and end timestamps in order + // to create labels and set bounds for the X axis. As above, some of these conversions + // may lose precision, but it's OK as these values are only used to make sure the + // datapoints fit within the graph nicely. 
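        // (Worked example, added for illustration: for the test data at the
        // bottom of this file, min = 1732223400 and max = 1732223640, so the
        // mid label comes from (1732223400 + 1732223640) / 2 = 1732223520 and
        // the X bounds are those same min/max values cast to f64.)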
+ let start_time = timestamps.min()?; + let end_time = timestamps.max()?; + let avg_time = timestamps.avg(start_time, end_time); + + let Some(start_time_label) = DateTime::from_timestamp(*start_time, 0) + else { + bail!( + "failed to convert timestamp to UTC date and time; + timestamp = {}", + start_time + ) + }; + let Some(end_time_label) = DateTime::from_timestamp(*end_time, 0) + else { + bail!( + "failed to convert timestamp to UTC date and time; + timestamp = {}", + end_time + ) + }; + let Some(mid_time_label) = DateTime::from_timestamp(avg_time, 0) else { + bail!( + "failed to convert timestamp to UTC date and time; + timestamp = {}", + avg_time + ) + }; + + let start_time_bound = *start_time as f64; + let end_time_bound = *end_time as f64; + + Ok(Self { + mid_time_label, + start_time_label, + end_time_label, + start_time_bound, + end_time_bound, + }) + } +} + +#[derive(Debug, PartialEq)] +struct DataPoints { + data: Vec<(f64, f64)>, +} + +impl DataPoints { + fn new(timeseries: &Vec) -> Self { + // These values will be used to render the graph and ratatui + // requires them to be f64 + let data: Vec<(f64, f64)> = timeseries + .iter() + .map(|ts| { + ( + ts.time.trim_matches('"').parse::().unwrap_or_else( + |_| { + panic!( + "could not parse timestamp {} into f64", + ts.time + ) + }, + ), + ts.value, + ) + }) + .collect(); + Self { data } + } +} + +#[derive(Debug, PartialEq)] +pub struct ChartData { + metadata: ChartMetadata, + data_points: DataPoints, + pub x_axis_timestamps: XAxisTimestamps, + y_axis_values: YAxisValues, +} + +impl ChartData { + pub fn new( + raw_data: Vec, + metadata: ChartMetadata, + ) -> Result { + // Retrieve datapoints that will be charted + let data_points = DataPoints::new(&raw_data); + + // Retrieve X axis bounds and labels + let x_axis_timestamps = XAxisTimestamps::new(&raw_data)?; + + // Retrieve X axis bounds and labels + let y_axis_values = YAxisValues::new(metadata.unit, &raw_data)?; + + Ok(Self { metadata, data_points, x_axis_timestamps, y_axis_values }) + } + + pub fn render_line_chart(&self, frame: &mut Frame, area: Rect) { + let datasets = vec![Dataset::default() + .marker(Marker::Braille) + .style(Style::default().fg(Color::LightGreen)) + .graph_type(GraphType::Line) + .data(&self.data_points.data)]; + + let chart = Chart::new(datasets) + .block( + Block::bordered() + .title(Line::from(self.title()).cyan().bold().centered()), + ) + .x_axis( + Axis::default() + .style(Style::default().gray()) + .bounds([self.start_time_bound(), self.end_time_bound()]) + .labels([ + self.start_time_label().bold(), + self.mid_time_label().bold(), + self.end_time_label().bold(), + ]), + ) + .y_axis( + Axis::default() + .style(Style::default().gray()) + .bounds([ + self.lower_value_bound(), + self.upper_value_bound(), + ]) + .labels([ + self.lower_value_label().bold(), + self.mid_value_label().bold(), + self.upper_value_label().bold(), + ]), + ) + .legend_position(Some(LegendPosition::TopLeft)) + .hidden_legend_constraints(( + Constraint::Ratio(1, 2), + Constraint::Ratio(1, 2), + )); + + frame.render_widget(chart, area); + } + + fn title(&self) -> String { + self.metadata.title.clone() + } + + pub fn start_date_time(&self) -> DateTime { + self.x_axis_timestamps.start_time_label + } + + pub fn end_date_time(&self) -> DateTime { + self.x_axis_timestamps.end_time_label + } + + fn start_time_label(&self) -> String { + self.x_axis_timestamps.start_time_label.time().to_string() + } + + fn mid_time_label(&self) -> String { + 
self.x_axis_timestamps.mid_time_label.time().to_string() + } + + fn end_time_label(&self) -> String { + self.x_axis_timestamps.end_time_label.time().to_string() + } + + fn start_time_bound(&self) -> f64 { + self.x_axis_timestamps.start_time_bound + } + + fn end_time_bound(&self) -> f64 { + self.x_axis_timestamps.end_time_bound + } + + fn lower_value_label(&self) -> String { + self.y_axis_values.lower_label.clone() + } + + fn mid_value_label(&self) -> String { + self.y_axis_values.mid_label.clone() + } + + fn upper_value_label(&self) -> String { + self.y_axis_values.upper_label.clone() + } + + fn lower_value_bound(&self) -> f64 { + self.y_axis_values.lower_bound + } + + fn upper_value_bound(&self) -> f64 { + self.y_axis_values.upper_bound + } +} + +#[cfg(test)] +mod tests { + use crate::{ + chart::{Unit, YAxisValues}, + ChartData, ChartMetadata, MetricName, + }; + use chrono::DateTime; + use clickhouse_admin_server_client::types::SystemTimeSeries; + + use super::{DataPoints, XAxisTimestamps}; + + #[test] + fn gather_chart_data_for_disk_usage_success() { + let metadata = + ChartMetadata::new(MetricName::DiskUsage, "Test Chart".to_string()); + let raw_data = vec![ + SystemTimeSeries { + time: "1732223400".to_string(), + value: 479551511587.3104, + }, + SystemTimeSeries { + time: "1732223520".to_string(), + value: 479555459822.93335, + }, + SystemTimeSeries { + time: "1732223640".to_string(), + value: 479560290201.6, + }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Gibibyte, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 479551511587.3104), + (1732223520.0, 479555459822.93335), + (1732223640.0, 479560290201.6), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "445 GiB".to_string(), + mid_label: "446.5 GiB".to_string(), + upper_label: "448 GiB".to_string(), + lower_bound: 478477769763.0, + upper_bound: 480634032026.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + fn gather_chart_data_for_memory_tracking_success() { + let metadata = ChartMetadata::new( + MetricName::MemoryTracking, + "Test Chart".to_string(), + ); + let raw_data = vec![ + SystemTimeSeries { + time: "1732223400".to_string(), + value: 479551511587.3104, + }, + SystemTimeSeries { + time: "1732223520".to_string(), + value: 479555459822.93335, + }, + SystemTimeSeries { + time: "1732223640".to_string(), + value: 479560290201.6, + }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Mebibyte, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 479551511587.3104), + (1732223520.0, 479555459822.93335), + (1732223640.0, 479560290201.6), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "457334 MiB".to_string(), + mid_label: "457340 
MiB".to_string(), + upper_label: "457346 MiB".to_string(), + lower_bound: 479550463011.0, + upper_bound: 479561338778.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + fn gather_chart_data_for_query_count_success() { + let metadata = ChartMetadata::new( + MetricName::QueryCount, + "Test Chart".to_string(), + ); + let raw_data = vec![ + SystemTimeSeries { time: "1732223400".to_string(), value: 0.0 }, + SystemTimeSeries { time: "1732223520".to_string(), value: 0.004 }, + SystemTimeSeries { time: "1732223640".to_string(), value: 0.0 }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Count, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 0.0), + (1732223520.0, 0.004), + (1732223640.0, 0.0), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "0 ".to_string(), + mid_label: "1 ".to_string(), + upper_label: "2 ".to_string(), + lower_bound: 0.0, + upper_bound: 2.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + fn gather_chart_data_for_running_queries_success() { + let metadata = ChartMetadata::new( + MetricName::RunningQueries, + "Test Chart".to_string(), + ); + let raw_data = vec![ + SystemTimeSeries { time: "1732223400".to_string(), value: 1.554 }, + SystemTimeSeries { time: "1732223520".to_string(), value: 1.877 }, + SystemTimeSeries { time: "1732223640".to_string(), value: 1.3456 }, + ]; + + let expected_result = ChartData { + metadata: ChartMetadata { + title: "Test Chart".to_string(), + unit: Unit::Count, + }, + data_points: DataPoints { + data: vec![ + (1732223400.0, 1.554), + (1732223520.0, 1.877), + (1732223640.0, 1.3456), + ], + }, + x_axis_timestamps: XAxisTimestamps { + start_time_label: DateTime::from_timestamp(1732223400, 0) + .unwrap(), + mid_time_label: DateTime::from_timestamp(1732223520, 0) + .unwrap(), + end_time_label: DateTime::from_timestamp(1732223640, 0) + .unwrap(), + start_time_bound: 1732223400.0, + end_time_bound: 1732223640.0, + }, + y_axis_values: YAxisValues { + lower_label: "0 ".to_string(), + mid_label: "1.5 ".to_string(), + upper_label: "3 ".to_string(), + lower_bound: 0.0, + upper_bound: 3.0, + }, + }; + let result = ChartData::new(raw_data, metadata).unwrap(); + assert_eq!(result, expected_result); + } + + #[test] + #[should_panic( + expected = "could not parse timestamp Some nonsense string into f64" + )] + fn gather_chart_data_failure() { + let metadata = + ChartMetadata::new(MetricName::DiskUsage, "Test Chart".to_string()); + let raw_data = vec![ + SystemTimeSeries { + time: "Some nonsense string".to_string(), + value: 479551511587.3104, + }, + SystemTimeSeries { + time: "1732223520".to_string(), + value: 479555459822.93335, + }, + SystemTimeSeries { + time: "1732223640".to_string(), + value: 479560290201.6, + }, + ]; + + let _ = ChartData::new(raw_data, metadata); + } +} diff --git a/dev-tools/clickana/src/lib.rs b/dev-tools/clickana/src/lib.rs new file mode 100644 index 0000000000..76af35ba8d --- /dev/null +++ b/dev-tools/clickana/src/lib.rs @@ -0,0 +1,274 @@ +// This Source Code Form is 
subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::{anyhow, bail, Context, Result}; +use camino::Utf8PathBuf; +use chrono::{DateTime, Utc}; +use clickhouse_admin_server_client::types::{ + SystemTimeSeries, TimestampFormat, +}; +use clickhouse_admin_server_client::Client as ClickhouseServerClient; +use futures::stream::FuturesOrdered; +use futures::StreamExt; +use omicron_common::FileKv; +use ratatui::crossterm::event::{self, Event, KeyCode}; +use ratatui::layout::{Constraint, Layout, Rect}; +use ratatui::style::{Color, Style, Stylize}; +use ratatui::text::Span; +use ratatui::widgets::Paragraph; +use ratatui::{DefaultTerminal, Frame}; +use slog::{o, Drain, Logger}; +use slog_async::Async; +use slog_term::{FullFormat, PlainDecorator}; +use std::collections::BTreeMap; +use std::future::Future; +use std::net::SocketAddr; +use std::pin::Pin; +use std::time::{Duration, Instant}; + +use crate::chart::{ChartData, ChartMetadata, MetricName}; + +mod chart; + +#[derive(Debug)] +struct Dashboard { + start_time: DateTime, + end_time: DateTime, + top_left_frame: ChartData, + top_right_frame: ChartData, + bottom_left_frame: ChartData, + bottom_right_frame: ChartData, +} + +#[derive(Clone, Debug)] +pub struct Clickana { + clickhouse_addr: SocketAddr, + log_path: Utf8PathBuf, + sampling_interval: u64, + time_range: u64, + refresh_interval: u64, +} + +impl Clickana { + pub fn new( + clickhouse_addr: SocketAddr, + log_path: Utf8PathBuf, + sampling_interval: u64, + time_range: u64, + refresh_interval: u64, + ) -> Self { + Self { + clickhouse_addr, + log_path, + sampling_interval, + time_range, + refresh_interval, + } + } + + pub async fn run(self, mut terminal: DefaultTerminal) -> Result<()> { + let admin_url = format!("http://{}", self.clickhouse_addr); + let log = self.new_logger()?; + let client = ClickhouseServerClient::new(&admin_url, log.clone()); + + let tick_interval = Duration::from_secs(self.refresh_interval); + let mut last_tick = Instant::now(); + loop { + // Charts we will be showing in the dashboard + // + // We are hardcoding these for now. In the future these will likely be taken + // from a TOML config file. + let charts = BTreeMap::from([ + (MetricName::DiskUsage, "Disk Usage".to_string()), + ( + MetricName::MemoryTracking, + "Memory Allocated by the Server".to_string(), + ), + ( + MetricName::QueryCount, + "Queries Started per Second".to_string(), + ), + (MetricName::RunningQueries, "Queries Running".to_string()), + ]); + + let mut tasks = FuturesOrdered::< + Pin>>>, + >::new(); + + for (metric_name, title) in charts { + let s = self.clone(); + let c = client.clone(); + + let task = Box::pin(async move { + let metadata = ChartMetadata::new(metric_name, title); + let data = s.get_api_data(&c, metric_name).await?; + ChartData::new(data, metadata) + }); + tasks.push_back(task); + } + + if tasks.len() != 4 { + bail!( + "expected information for 4 charts, received {} instead", + tasks.len() + ); + } + + // TODO: Eventually we may want to not have a set amount of charts and make the + // dashboard a bit more dynamic. Perhaps taking a toml configuration file or + // something like that. We can then create a vector of "ChartData"s for Dashboard + // to take and create the layout dynamically. 
+ // + // IDEA (ajs): I think it would be useful to be able to have a little menu of charts + // on the side of the pane, and then you can scroll and select which ones to show + // without having to restart the app, or mess with a toml file. + // You could also allow toggling between a set of predefined layouts to make it always + // look nice. So you could show, 1, 2, 4, 6, 8 charts or something and allow selecting + // which to show in each view. You could even remember which charts to show in each layout, + // so you could toggle back and forth between different layouts and see all the charts, + // some with more detail. + // + // We have already checked that the length of tasks is 4, so it's safe to unwrap + let top_left_frame: ChartData = tasks.next().await.unwrap()?; + let top_right_frame: ChartData = tasks.next().await.unwrap()?; + let bottom_left_frame: ChartData = tasks.next().await.unwrap()?; + let bottom_right_frame: ChartData = tasks.next().await.unwrap()?; + + // We only need to retrieve from one chart as they will all be relatively the same. + // Rarely, the charts may have a variance of a second or so depending on when + // the API calls were made, but for the header block we don't need exact precision. + let start_time = top_left_frame.start_date_time(); + let end_time = top_left_frame.end_date_time(); + + let dashboard = Dashboard { + start_time, + end_time, + top_left_frame, + top_right_frame, + bottom_left_frame, + bottom_right_frame, + }; + terminal.draw(|frame| self.draw(frame, dashboard))?; + + let timeout = tick_interval.saturating_sub(last_tick.elapsed()); + if event::poll(timeout)? { + if let Event::Key(key) = event::read()? { + // To exit the dashboard press the "q" key + if key.code == KeyCode::Char('q') { + return Ok(()); + } + } + } + + if last_tick.elapsed() >= tick_interval { + last_tick = Instant::now(); + } + } + } + + fn draw(&self, frame: &mut Frame, dashboard: Dashboard) { + let [heading, top, bottom] = Layout::vertical([ + Constraint::Length(4), + // TODO: If we make the dashboard with too many charts + // we may want to reconsider setting sizes instead of filling + // the space + Constraint::Fill(1), + Constraint::Fill(1), + ]) + .areas(frame.area()); + let [title] = + Layout::horizontal([Constraint::Fill(1); 1]).areas(heading); + let [top_left_frame, top_right_frame] = + Layout::horizontal([Constraint::Fill(1); 2]).areas(top); + let [bottom_left_frame, bottom_right_frame] = + Layout::horizontal([Constraint::Fill(1); 2]).areas(bottom); + + self.render_title_bar(frame, title, &dashboard); + + dashboard.top_left_frame.render_line_chart(frame, top_left_frame); + dashboard.top_right_frame.render_line_chart(frame, top_right_frame); + dashboard.bottom_left_frame.render_line_chart(frame, bottom_left_frame); + dashboard + .bottom_right_frame + .render_line_chart(frame, bottom_right_frame); + } + + fn render_title_bar( + &self, + frame: &mut Frame, + area: Rect, + dashboard: &Dashboard, + ) { + let style = Style::new().fg(Color::Green).bold(); + let title = vec![ + Span::styled("CLICKANA", style).into_centered_line(), + Span::styled( + format!("Sampling Interval: {}s", self.sampling_interval), + style, + ) + .into_left_aligned_line(), + Span::styled( + format!( + "Time Range: {} - {} ({}s)", + dashboard.start_time, dashboard.end_time, self.time_range + ), + style, + ) + .into_left_aligned_line(), + Span::styled( + format!("Refresh Interval {}s", self.refresh_interval), + style, + ) + .into_left_aligned_line(), + ]; + let p = Paragraph::new(title); + + 
frame.render_widget(p, area); + } + + async fn get_api_data( + &self, + client: &ClickhouseServerClient, + metric: MetricName, + ) -> Result> { + let timeseries = client + .system_timeseries_avg( + metric.table(), + &format!("{metric}"), + Some(self.sampling_interval), + Some(self.time_range), + Some(TimestampFormat::UnixEpoch), + ) + .await + .map(|t| t.into_inner()) + .map_err(|e| { + anyhow!( + concat!( + "failed to retrieve timeseries from clickhouse server; ", + "error = {}", + ), + e + ) + }); + + timeseries + } + + fn new_logger(&self) -> Result { + let file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(self.log_path.clone()) + .with_context(|| { + format!("error opening log file {}", self.log_path) + })?; + + let decorator = PlainDecorator::new(file); + let drain = FullFormat::new(decorator).build().fuse(); + let drain = Async::new(drain).build().fuse(); + + Ok(slog::Logger::root(drain, o!(FileKv))) + } +} diff --git a/openapi/clickhouse-admin-server.json b/openapi/clickhouse-admin-server.json index c82c7c0d8e..50c526569e 100644 --- a/openapi/clickhouse-admin-server.json +++ b/openapi/clickhouse-admin-server.json @@ -616,7 +616,7 @@ "type": "object", "properties": { "time": { - "$ref": "#/components/schemas/Timestamp" + "type": "string" }, "value": { "type": "number", @@ -628,17 +628,6 @@ "value" ] }, - "Timestamp": { - "anyOf": [ - { - "type": "string", - "format": "date-time" - }, - { - "type": "string" - } - ] - }, "SystemTable": { "description": "Available metrics tables in the `system` database", "type": "string", diff --git a/openapi/clickhouse-admin-single.json b/openapi/clickhouse-admin-single.json index b00bf56314..c6b99da245 100644 --- a/openapi/clickhouse-admin-single.json +++ b/openapi/clickhouse-admin-single.json @@ -131,7 +131,7 @@ "type": "object", "properties": { "time": { - "$ref": "#/components/schemas/Timestamp" + "type": "string" }, "value": { "type": "number", @@ -143,17 +143,6 @@ "value" ] }, - "Timestamp": { - "anyOf": [ - { - "type": "string", - "format": "date-time" - }, - { - "type": "string" - } - ] - }, "SystemTable": { "description": "Available metrics tables in the `system` database", "type": "string", diff --git a/package-manifest.toml b/package-manifest.toml index 83b1ba8168..b28ac7d59f 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -166,7 +166,8 @@ source.packages = [ "internal-dns-cli.tar.gz", "omicron-clickhouse-admin.tar.gz", "zone-setup.tar.gz", - "zone-network-install.tar.gz" + "zone-network-install.tar.gz", + "clickana.tar.gz" ] output.type = "zone" @@ -197,7 +198,8 @@ source.packages = [ "internal-dns-cli.tar.gz", "omicron-clickhouse-admin.tar.gz", "zone-setup.tar.gz", - "zone-network-install.tar.gz" + "zone-network-install.tar.gz", + "clickana.tar.gz" ] output.type = "zone" @@ -924,3 +926,12 @@ service_name = "probe" source.type = "composite" source.packages = ["thundermuffin.tar.gz"] output.type = "zone" + +[package.clickana] +service_name = "clickana" +only_for_targets.image = "standard" +source.type = "local" +source.rust.binary_names = ["clickana"] +source.rust.release = true +output.type = "zone" +output.intermediate_only = true \ No newline at end of file diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index db250eb914..7c5fb44d13 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -76,6 +76,7 @@ use omicron_common::address::WICKETD_NEXUS_PROXY_PORT; use omicron_common::address::WICKETD_PORT; use omicron_common::address::{ 
get_internal_dns_server_addresses, CLICKHOUSE_ADMIN_PORT, + CLICKHOUSE_TCP_PORT, }; use omicron_common::address::{Ipv6Subnet, NEXUS_TECHPORT_EXTERNAL_PORT}; use omicron_common::address::{BOOTSTRAP_ARTIFACT_PORT, COCKROACH_ADMIN_PORT}; @@ -1597,13 +1598,20 @@ impl ServiceManager { addr.to_string() }; + // The ClickHouse client connects via the TCP port + let ch_address = { + let mut addr = *address; + addr.set_port(CLICKHOUSE_TCP_PORT); + addr.to_string() + }; + let clickhouse_admin_config = PropertyGroupBuilder::new("config") .add_property("http_address", "astring", admin_address) .add_property( "ch_address", "astring", - address.to_string(), + ch_address.to_string(), ) .add_property( "ch_binary", @@ -1668,13 +1676,20 @@ impl ServiceManager { addr.to_string() }; + // The ClickHouse client connects via the TCP port + let ch_address = { + let mut addr = *address; + addr.set_port(CLICKHOUSE_TCP_PORT); + addr.to_string() + }; + let clickhouse_admin_config = PropertyGroupBuilder::new("config") .add_property("http_address", "astring", admin_address) .add_property( "ch_address", "astring", - address.to_string(), + ch_address.to_string(), ) .add_property( "ch_binary", From 09b150f17780ccd8e35051f4611ff9883ced71ff Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 19 Dec 2024 09:22:54 -0500 Subject: [PATCH 7/9] Handle region snapshot replacement volume deletes (#7046) Volumes can be deleted at any time, but the tasks and sagas that perform region snapshot replacement did not account for this. This commit adds checks in a few places for if a volume is soft-deleted or hard-deleted, and bails out of any affected region snapshot replacement accordingly: - if a volume that has the region snapshot being replaced is soft-deleted, then skip making a region snapshot replacement step for it - if a region snapshot replacement step has the volume deleted after the step was created, transition it directly to the VolumeDeleted state - if a region snapshot replacement step has the volume deleted during the saga invocation, then skip notifying any Upstairs and allow the saga to transition the request to Complete, where then associated clean up can proceed An interesting race condition emerged during unit testing: the read-only region allocated to replace a region snapshot would be swapped into the snapshot volume, but would be susceptible to being deleted by the user, and therefore unable to be swapped into other volumes that have that snapshot volume as a read-only parent. This requires an additional volume that used that read-only region in order to bump the reference count associated with that region, so that the user cannot delete it before it was used to replace all other uses of the region snapshot it was meant to replace. This additional volume's lifetime lives as long as the region snapshot replacement, and therefore needs to be deleted when the region snapshot replacement is finished. This required a new region snapshot replacement finish saga, which required a new "Completing" state to perform the same type of state based lock on the replacement request done for all the other sagas. Testing also revealed that there were scenarios where `find_deleted_volume_regions` would return volumes for hard-deletion prematurely. The function now returns a struct instead of a list of tuples, and in that struct, regions freed for deletion are now distinct from volumes freed for deletion. Volumes are now only returned for hard-deletion when all associated read/write regions have been (or are going to be) deleted. 
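As a rough sketch of how a caller consumes the new return type (not part of the patch; it assumes the same datastore imports the omdb command below uses), the split between regions and volumes looks like this:

async fn report_freed(datastore: &DataStore) -> Result<(), anyhow::Error> {
    let freed = datastore.find_deleted_volume_regions().await?;

    // Read/write regions that are now safe to delete on the Crucible agents
    for (dataset, region) in &freed.datasets_and_regions {
        println!("delete region {} on dataset {}", region.id(), dataset.id());
    }

    // Soft-deleted volumes whose read/write regions are all gone (or about
    // to be), so the volume records themselves can now be hard-deleted
    for volume_id in &freed.volumes {
        println!("hard-delete volume {volume_id}");
    }

    Ok(())
}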
Fixes #6353 --- dev-tools/omdb/src/bin/omdb/db.rs | 41 +- dev-tools/omdb/src/bin/omdb/nexus.rs | 23 +- dev-tools/omdb/tests/successes.out | 8 +- .../src/region_snapshot_replacement.rs | 20 +- nexus/db-model/src/schema.rs | 1 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/mod.rs | 11 +- .../src/db/datastore/region_replacement.rs | 58 +- .../datastore/region_snapshot_replacement.rs | 575 +++++- nexus/db-queries/src/db/datastore/volume.rs | 130 +- .../src/db/datastore/volume_repair.rs | 165 +- nexus/src/app/background/init.rs | 2 +- .../background/tasks/region_replacement.rs | 54 +- .../tasks/region_replacement_driver.rs | 57 + .../region_snapshot_replacement_finish.rs | 118 +- ...on_snapshot_replacement_garbage_collect.rs | 41 +- .../region_snapshot_replacement_start.rs | 352 +++- .../tasks/region_snapshot_replacement_step.rs | 193 +- nexus/src/app/sagas/mod.rs | 4 +- .../app/sagas/region_replacement_finish.rs | 14 + .../src/app/sagas/region_replacement_start.rs | 9 +- .../region_snapshot_replacement_finish.rs | 211 ++ ...on_snapshot_replacement_garbage_collect.rs | 18 +- .../region_snapshot_replacement_start.rs | 118 +- .../sagas/region_snapshot_replacement_step.rs | 17 +- ...apshot_replacement_step_garbage_collect.rs | 18 +- nexus/src/app/sagas/volume_delete.rs | 42 +- nexus/test-utils/src/background.rs | 2 +- .../crucible_replacements.rs | 1706 ++++++++++++++--- .../integration_tests/volume_management.rs | 7 +- nexus/types/src/internal_api/background.rs | 4 +- .../up01.sql | 1 + .../up02.sql | 1 + schema/crdb/dbinit.sql | 9 +- 34 files changed, 3540 insertions(+), 493 deletions(-) create mode 100644 nexus/src/app/sagas/region_snapshot_replacement_finish.rs create mode 100644 schema/crdb/add-completing-and-new-region-volume/up01.sql create mode 100644 schema/crdb/add-completing-and-new-region-volume/up02.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 5058739da0..e501e650b1 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2591,39 +2591,48 @@ async fn cmd_db_region_used_by( async fn cmd_db_region_find_deleted( datastore: &DataStore, ) -> Result<(), anyhow::Error> { - let datasets_regions_volumes = + let freed_crucible_resources = datastore.find_deleted_volume_regions().await?; #[derive(Tabled)] - struct Row { + struct RegionRow { dataset_id: DatasetUuid, region_id: Uuid, - volume_id: String, } - let rows: Vec = datasets_regions_volumes - .into_iter() + #[derive(Tabled)] + struct VolumeRow { + volume_id: Uuid, + } + + let region_rows: Vec = freed_crucible_resources + .datasets_and_regions + .iter() .map(|row| { - let (dataset, region, volume) = row; + let (dataset, region) = row; - Row { - dataset_id: dataset.id(), - region_id: region.id(), - volume_id: if let Some(volume) = volume { - volume.id().to_string() - } else { - String::from("") - }, - } + RegionRow { dataset_id: dataset.id(), region_id: region.id() } }) .collect(); - let table = tabled::Table::new(rows) + let table = tabled::Table::new(region_rows) .with(tabled::settings::Style::psql()) .to_string(); println!("{}", table); + let volume_rows: Vec = freed_crucible_resources + .volumes + .iter() + .map(|volume_id| VolumeRow { volume_id: *volume_id }) + .collect(); + + let volume_table = tabled::Table::new(volume_rows) + .with(tabled::settings::Style::psql()) + .to_string(); + + println!("{}", volume_table); + Ok(()) } diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 
fe16b4076e..eb7d74b340 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1655,6 +1655,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total requests completed ok: {}", + status.requests_completed_ok.len(), + ); + for line in &status.requests_completed_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); @@ -1720,6 +1728,14 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } + println!( + " total steps set to volume_deleted ok: {}", + status.step_set_volume_deleted_ok.len(), + ); + for line in &status.step_set_volume_deleted_ok { + println!(" > {line}"); + } + println!(" errors: {}", status.errors.len()); for line in &status.errors { println!(" > {line}"); @@ -1831,10 +1847,11 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { Ok(status) => { println!( - " total records transitioned to done: {}", - status.records_set_to_done.len(), + " region snapshot replacement finish sagas started \ + ok: {}", + status.finish_invoked_ok.len() ); - for line in &status.records_set_to_done { + for line in &status.finish_invoked_ok { println!(" > {line}"); } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index dcf1671d8f..75f6ce0d0e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -627,7 +627,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -645,6 +645,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -655,6 +656,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" @@ -1070,7 +1072,7 @@ task: "region_snapshot_replacement_finish" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total records transitioned to done: 0 + region snapshot replacement finish sagas started ok: 0 errors: 0 task: "region_snapshot_replacement_garbage_collection" @@ -1088,6 +1090,7 @@ task: "region_snapshot_replacement_start" started at (s ago) and ran for ms total requests created ok: 0 total start saga invoked ok: 0 + total requests completed ok: 0 errors: 0 task: "region_snapshot_replacement_step" @@ -1098,6 +1101,7 @@ task: "region_snapshot_replacement_step" total step records created ok: 0 total step garbage collect saga invoked ok: 0 total step saga invoked ok: 0 + total steps set to volume_deleted ok: 0 errors: 0 task: "saga_recovery" diff --git a/nexus/db-model/src/region_snapshot_replacement.rs b/nexus/db-model/src/region_snapshot_replacement.rs index bcbd55028d..28627d8379 100644 --- a/nexus/db-model/src/region_snapshot_replacement.rs +++ b/nexus/db-model/src/region_snapshot_replacement.rs @@ -28,6 +28,7 @@ impl_enum_type!( ReplacementDone => 
b"replacement_done" DeletingOldVolume => b"deleting_old_volume" Running => b"running" + Completing => b"completing" Complete => b"complete" ); @@ -46,6 +47,7 @@ impl std::str::FromStr for RegionSnapshotReplacementState { Ok(RegionSnapshotReplacementState::DeletingOldVolume) } "running" => Ok(RegionSnapshotReplacementState::Running), + "completing" => Ok(RegionSnapshotReplacementState::Completing), "complete" => Ok(RegionSnapshotReplacementState::Complete), _ => Err(format!("unrecognized value {} for enum", s)), } @@ -79,9 +81,14 @@ impl std::str::FromStr for RegionSnapshotReplacementState { /// | | /// v --- /// --- -/// Running | -/// | set in region snapshot replacement -/// | | finish background task +/// Running <-- | +/// | | +/// | | | +/// v | | +/// | | responsibility of region snapshot +/// Completing -- | replacement finish saga +/// | +/// | | /// v | /// | /// Complete --- @@ -133,6 +140,12 @@ pub struct RegionSnapshotReplacement { pub replacement_state: RegionSnapshotReplacementState, pub operating_saga_id: Option, + + /// In order for the newly created region not to be deleted inadvertently, + /// an additional reference count bump is required. This volume should live + /// as long as this request so that all necessary replacements can be + /// completed. + pub new_region_volume_id: Option, } impl RegionSnapshotReplacement { @@ -157,6 +170,7 @@ impl RegionSnapshotReplacement { old_snapshot_id, old_snapshot_volume_id: None, new_region_id: None, + new_region_volume_id: None, replacement_state: RegionSnapshotReplacementState::Requested, operating_saga_id: None, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 399da81ea4..a54c2e3029 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1932,6 +1932,7 @@ table! { new_region_id -> Nullable, replacement_state -> crate::RegionSnapshotReplacementStateEnum, operating_saga_id -> Nullable, + new_region_volume_id -> Nullable, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 02646bc6dd..8b663d2549 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(116, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(117, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(117, "add-completing-and-new-region-volume"), KnownVersion::new(116, "bp-physical-disk-disposition"), KnownVersion::new(115, "inv-omicron-physical-disks-generation"), KnownVersion::new(114, "crucible-ref-count-records"), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index c90f1cc92e..d20f24e773 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -125,16 +125,7 @@ pub use sled::TransitionError; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; pub use vmm::VmmStateUpdateResult; -pub use volume::read_only_resources_associated_with_volume; -pub use volume::CrucibleResources; -pub use volume::CrucibleTargets; -pub use volume::ExistingTarget; -pub use volume::ReplacementTarget; -pub use volume::VolumeCheckoutReason; -pub use volume::VolumeReplaceResult; -pub use volume::VolumeReplacementParams; -pub use volume::VolumeToDelete; -pub use volume::VolumeWithTarget; +pub use volume::*; // Number of unique datasets required to back a region. // TODO: This should likely turn into a configuration option. diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs index 0d259ba47e..8aad7f2cfd 100644 --- a/nexus/db-queries/src/db/datastore/region_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_replacement.rs @@ -59,19 +59,22 @@ impl DataStore { opctx: &OpContext, request: RegionReplacement, ) -> Result<(), Error> { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper("insert_region_replacement_request") .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_replacement::dsl; - Self::volume_repair_insert_query( + Self::volume_repair_insert_in_txn( + &conn, + err, request.volume_id, request.id, ) - .execute_async(&conn) .await?; diesel::insert_into(dsl::region_replacement) @@ -83,7 +86,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_replacement_request_by_id( @@ -915,6 +924,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -926,6 +936,20 @@ mod test { let region_2_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionReplacement::new(region_1_id, volume_id); let request_2 = RegionReplacement::new(region_2_id, volume_id); @@ -957,6 +981,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut 
request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -1049,6 +1087,20 @@ mod test { let region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::ReplacementDone; diff --git a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs index 76a83cca2a..90b014c582 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot_replacement.rs @@ -16,7 +16,6 @@ use crate::db::model::RegionSnapshotReplacement; use crate::db::model::RegionSnapshotReplacementState; use crate::db::model::RegionSnapshotReplacementStep; use crate::db::model::RegionSnapshotReplacementStepState; -use crate::db::model::VolumeRepair; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; @@ -93,6 +92,7 @@ impl DataStore { request: RegionSnapshotReplacement, volume_id: Uuid, ) -> Result<(), Error> { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper( @@ -100,20 +100,24 @@ impl DataStore { ) .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_snapshot_replacement::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; - // An associated volume repair record isn't _strictly_ needed: - // snapshot volumes should never be directly constructed, and - // therefore won't ever have an associated Upstairs that - // receives a volume replacement request. However it's being - // done in an attempt to be overly cautious. - - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { volume_id, repair_id: request.id }) - .execute_async(&conn) - .await?; + // An associated volume repair record isn't _strictly_ + // needed: snapshot volumes should never be directly + // constructed, and therefore won't ever have an associated + // Upstairs that receives a volume replacement request. + // However it's being done in an attempt to be overly + // cautious, and it validates that the volume exist: + // otherwise it would be possible to create a region + // snapshot replacement request for a volume that didn't + // exist! 
+ + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, request.id, + ) + .await?; diesel::insert_into(dsl::region_snapshot_replacement) .values(request) @@ -124,7 +128,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_request_by_id( @@ -342,6 +352,7 @@ impl DataStore { region_snapshot_replacement_id: Uuid, operating_saga_id: Uuid, new_region_id: Uuid, + new_region_volume_id: Uuid, old_snapshot_volume_id: Uuid, ) -> Result<(), Error> { use db::schema::region_snapshot_replacement::dsl; @@ -357,6 +368,7 @@ impl DataStore { .eq(RegionSnapshotReplacementState::ReplacementDone), dsl::old_snapshot_volume_id.eq(Some(old_snapshot_volume_id)), dsl::new_region_id.eq(Some(new_region_id)), + dsl::new_region_volume_id.eq(Some(new_region_volume_id)), dsl::operating_saga_id.eq(Option::::None), )) .check_if_exists::( @@ -375,6 +387,8 @@ impl DataStore { && record.replacement_state == RegionSnapshotReplacementState::ReplacementDone && record.new_region_id == Some(new_region_id) + && record.new_region_volume_id + == Some(new_region_volume_id) && record.old_snapshot_volume_id == Some(old_snapshot_volume_id) { @@ -557,15 +571,201 @@ impl DataStore { } } - /// Transition a RegionSnapshotReplacement record from Running to Complete. - /// Also removes the `volume_repair` record that is taking a "lock" on the - /// Volume. Note this doesn't occur from a saga context, and therefore 1) - /// doesn't accept an operating saga id parameter, and 2) checks that - /// operating_saga_id is null for the corresponding record. + /// Transition a RegionSnapshotReplacement record from Running to + /// Completing, setting a unique id at the same time. + pub async fn set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + ) + .filter(dsl::operating_saga_id.is_null()) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + dsl::operating_saga_id.eq(operating_saga_id), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == Some(operating_saga_id) + && record.replacement_state + == RegionSnapshotReplacementState::Completing + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionReplacement record from Completing to Running, + /// clearing the operating saga id. 
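The Completing transition and its undo follow the same forward/undo "lock" pattern as the other replacement transitions: the forward action compare-and-swaps the state while recording the saga id, and the undo reverses it, with both sides idempotent so a replayed saga node is harmless. A minimal, database-free sketch of that pattern (hypothetical types, not the actual Diesel code):

use uuid::Uuid;

#[derive(Clone, Copy, Debug, PartialEq)]
enum State { Running, Completing }

struct Request { state: State, operating_saga_id: Option<Uuid> }

// Forward action: take the lock by moving Running -> Completing and
// recording the saga id. Re-running with the same saga id is a no-op Ok.
fn set_completing(r: &mut Request, saga_id: Uuid) -> Result<(), String> {
    match (r.state, r.operating_saga_id) {
        (State::Running, None) => {
            r.state = State::Completing;
            r.operating_saga_id = Some(saga_id);
            Ok(())
        }
        (State::Completing, Some(id)) if id == saga_id => Ok(()),
        other => Err(format!("conflict: {other:?}")),
    }
}

// Undo action: release the lock by moving Completing -> Running and
// clearing the saga id. Also idempotent if the undo itself is replayed.
fn undo_set_completing(r: &mut Request, saga_id: Uuid) -> Result<(), String> {
    match (r.state, r.operating_saga_id) {
        (State::Completing, Some(id)) if id == saga_id => {
            r.state = State::Running;
            r.operating_saga_id = None;
            Ok(())
        }
        (State::Running, None) => Ok(()),
        other => Err(format!("conflict: {other:?}")),
    }
}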
+ pub async fn undo_set_region_snapshot_replacement_completing( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + use db::schema::region_snapshot_replacement::dsl; + let updated = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Running), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await; + + match updated { + Ok(result) => match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.operating_saga_id == None + && record.replacement_state + == RegionSnapshotReplacementState::Running + { + Ok(()) + } else { + Err(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + ))) + } + } + }, + + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), + } + } + + /// Transition a RegionSnapshotReplacement record from Completing to + /// Complete. Also removes the `volume_repair` record that is taking a + /// "lock" on the Volume. pub async fn set_region_snapshot_replacement_complete( &self, opctx: &OpContext, region_snapshot_replacement_id: Uuid, + operating_saga_id: Uuid, + ) -> Result<(), Error> { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement::dsl; + + let result = diesel::update(dsl::region_snapshot_replacement) + .filter(dsl::id.eq(region_snapshot_replacement_id)) + .filter( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Completing), + ) + .filter(dsl::operating_saga_id.eq(operating_saga_id)) + .set(( + dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete), + dsl::operating_saga_id.eq(Option::::None), + )) + .check_if_exists::( + region_snapshot_replacement_id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementState::Complete + { + Ok(()) + } else { + Err(err.bail(Error::conflict(format!( + "region snapshot replacement {} set to {:?} \ + (operating saga id {:?})", + region_snapshot_replacement_id, + record.replacement_state, + record.operating_saga_id, + )))) + } + } + } + } + }) + .await + .map_err(|e| match err.take() { + Some(error) => error, + None => public_error_from_diesel(e, ErrorHandler::Server), + }) + } + + /// Transition a RegionSnapshotReplacement record from Requested to Complete + /// - this is required when the region snapshot is hard-deleted, which means + /// that all volume 
references are gone and no replacement is required. Also + /// removes the `volume_repair` record that is taking a "lock" on the + /// Volume. + pub async fn set_region_snapshot_replacement_complete_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_id: Uuid, ) -> Result<(), Error> { type TxnError = TransactionError; @@ -577,6 +777,7 @@ impl DataStore { let err = err.clone(); async move { use db::schema::volume_repair::dsl as volume_repair_dsl; + use db::schema::region_snapshot_replacement::dsl; diesel::delete( volume_repair_dsl::volume_repair.filter( @@ -587,17 +788,16 @@ impl DataStore { .execute_async(&conn) .await?; - use db::schema::region_snapshot_replacement::dsl; - let result = diesel::update(dsl::region_snapshot_replacement) .filter(dsl::id.eq(region_snapshot_replacement_id)) .filter( dsl::replacement_state - .eq(RegionSnapshotReplacementState::Running), + .eq(RegionSnapshotReplacementState::Requested), ) .filter(dsl::operating_saga_id.is_null()) - .set((dsl::replacement_state - .eq(RegionSnapshotReplacementState::Complete),)) + .filter(dsl::new_region_volume_id.is_null()) + .set(dsl::replacement_state + .eq(RegionSnapshotReplacementState::Complete)) .check_if_exists::( region_snapshot_replacement_id, ) @@ -621,7 +821,7 @@ impl DataStore { region_snapshot_replacement_id, record.replacement_state, record.operating_saga_id, - ), + ) )))) } } @@ -651,6 +851,7 @@ impl DataStore { opctx: &OpContext, request: RegionSnapshotReplacementStep, ) -> Result { + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; self.transaction_retry_wrapper( @@ -658,10 +859,10 @@ impl DataStore { ) .transaction(&conn, |conn| { let request = request.clone(); + let err = err.clone(); async move { use db::schema::region_snapshot_replacement_step::dsl; - use db::schema::volume_repair::dsl as volume_repair_dsl; // Skip inserting this new record if we found another region // snapshot replacement step with this volume in the step's @@ -714,13 +915,13 @@ impl DataStore { // volume replacement: create an associated volume repair // record. - diesel::insert_into(volume_repair_dsl::volume_repair) - .values(VolumeRepair { - volume_id: request.volume_id, - repair_id: request.id, - }) - .execute_async(&conn) - .await?; + Self::volume_repair_insert_in_txn( + &conn, + err, + request.volume_id, + request.id, + ) + .await?; let request_id = request.id; @@ -733,7 +934,13 @@ impl DataStore { } }) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn get_region_snapshot_replacement_step_by_id( @@ -1073,6 +1280,87 @@ impl DataStore { Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), } } + + /// Transition from Requested to VolumeDeleted, and remove the associated + /// `volume_repair` record. This occurs when the associated snapshot's + /// volume is deleted. Note this doesn't occur from a saga context, and + /// therefore 1) doesn't accept an operating saga id parameter, and 2) + /// checks that operating_saga_id is null for the corresponding record. 
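Throughout this module the `volume_repair` row doubles as a per-volume lock (at most one replacement request or step may hold it at a time) and as a check that the target volume still exists; terminal transitions such as the one below release it. A toy, in-memory model of those two checks (illustrative only, hypothetical types, not the Diesel code):

use std::collections::{HashMap, HashSet};
use uuid::Uuid;

struct RepairLocks {
    existing_volumes: HashSet<Uuid>,
    // volume id -> the request or step currently "repairing" it
    held: HashMap<Uuid, Uuid>,
}

impl RepairLocks {
    // Mirrors inserting a volume_repair row: fail if the volume is gone or
    // if another repair already holds the lock for it.
    fn lock(&mut self, volume_id: Uuid, repair_id: Uuid) -> Result<(), String> {
        if !self.existing_volumes.contains(&volume_id) {
            return Err(format!("volume {volume_id} does not exist"));
        }
        if self.held.contains_key(&volume_id) {
            return Err(format!("volume {volume_id} already has a repair"));
        }
        self.held.insert(volume_id, repair_id);
        Ok(())
    }

    // Mirrors deleting the volume_repair row once the request or step
    // reaches a terminal state (Complete or VolumeDeleted).
    fn unlock(&mut self, volume_id: Uuid) {
        self.held.remove(&volume_id);
    }
}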
+ pub async fn set_region_snapshot_replacement_step_volume_deleted_from_requested( + &self, + opctx: &OpContext, + region_snapshot_replacement_step: RegionSnapshotReplacementStep, + ) -> Result<(), Error> { + let conn = self.pool_connection_authorized(opctx).await?; + let err = OptionalError::new(); + + self.transaction_retry_wrapper( + "set_region_snapshot_replacement_complete", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + + async move { + use db::schema::volume_repair::dsl as volume_repair_dsl; + + diesel::delete( + volume_repair_dsl::volume_repair.filter( + volume_repair_dsl::repair_id + .eq(region_snapshot_replacement_step.id), + ), + ) + .execute_async(&conn) + .await?; + + use db::schema::region_snapshot_replacement_step::dsl; + let result = + diesel::update(dsl::region_snapshot_replacement_step) + .filter(dsl::id.eq(region_snapshot_replacement_step.id)) + .filter(dsl::operating_saga_id.is_null()) + .filter(dsl::old_snapshot_volume_id.is_null()) + .filter( + dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::Requested, + ), + ) + .set(dsl::replacement_state.eq( + RegionSnapshotReplacementStepState::VolumeDeleted, + )) + .check_if_exists::( + region_snapshot_replacement_step.id, + ) + .execute_and_check(&conn) + .await?; + + match result.status { + UpdateStatus::Updated => Ok(()), + + UpdateStatus::NotUpdatedButExists => { + let record = result.found; + + if record.replacement_state + == RegionSnapshotReplacementStepState::VolumeDeleted + { + Ok(()) + } else { + Err(err.bail(Error::conflict(format!( + "region snapshot replacement step {} set \ + to {:?} (operating saga id {:?})", + region_snapshot_replacement_step.id, + record.replacement_state, + record.operating_saga_id, + )))) + } + } + } + } + }) + .await + .map_err(|e| match err.take() { + Some(error) => error, + None => public_error_from_diesel(e, ErrorHandler::Server), + }) + } } #[cfg(test)] @@ -1083,6 +1371,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn test_one_replacement_per_volume() { @@ -1100,6 +1389,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1146,6 +1449,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request_1 = RegionSnapshotReplacement::new( dataset_1_id, region_1_id, @@ -1182,6 +1499,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); @@ -1214,11 +1545,25 @@ mod test { // Insert some replacement steps, and make sure counting works + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); let result = datastore .insert_region_snapshot_replacement_step(&opctx, step) @@ -1247,11 +1592,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Running; @@ -1283,11 +1642,25 @@ mod test { 1, ); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + { - let mut step = RegionSnapshotReplacementStep::new( - request_id, - Uuid::new_v4(), // volume id - ); + let mut step = + RegionSnapshotReplacementStep::new(request_id, step_volume_id); // VolumeDeleted does not count as "in-progress" step.replacement_state = @@ -1337,6 +1710,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let step = RegionSnapshotReplacementStep::new(Uuid::new_v4(), volume_id); let first_request_id = step.id; @@ -1431,6 +1818,22 @@ mod test { let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = RegionSnapshotReplacement::new( DatasetUuid::new_v4(), Uuid::new_v4(), @@ -1442,9 +1845,7 @@ mod test { datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -1457,8 +1858,24 @@ mod test { .unwrap() .is_empty()); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1468,8 +1885,24 @@ mod test { assert!(matches!(result, InsertStepResult::Inserted { .. })); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; let result = datastore @@ -1509,6 +1942,34 @@ mod test { let volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore + .volume_create(nexus_db_model::Volume::new( + old_snapshot_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step = RegionSnapshotReplacementStep::new(request_id, volume_id); step.replacement_state = RegionSnapshotReplacementStepState::Complete; @@ -1558,6 +2019,20 @@ mod test { let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); datastore diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 4e0d1ccac1..505533da50 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -59,6 +59,7 @@ use serde::Deserialize; use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; +use std::collections::HashSet; use std::collections::VecDeque; use std::net::AddrParseError; use std::net::SocketAddr; @@ -179,6 +180,23 @@ enum ReplaceSnapshotError { MultipleResourceUsageRecords(String), } +/// Crucible resources freed by previous volume deletes +#[derive(Debug, Serialize, Deserialize)] +pub struct FreedCrucibleResources { + /// Regions that previously could not be deleted (often due to region + /// snaphots) that were freed by a volume delete + pub datasets_and_regions: Vec<(Dataset, Region)>, + + /// Previously soft-deleted volumes that can now be hard-deleted + pub volumes: Vec, +} + +impl FreedCrucibleResources { + pub fn is_empty(&self) -> bool { + self.datasets_and_regions.is_empty() && self.volumes.is_empty() + } +} + impl DataStore { async fn volume_create_in_txn( conn: &async_bb8_diesel::Connection, @@ -412,7 +430,7 @@ impl DataStore { }) } - async fn volume_get_impl( + pub(super) async fn volume_get_impl( conn: &async_bb8_diesel::Connection, volume_id: Uuid, ) -> Result, diesel::result::Error> { @@ -1115,12 +1133,12 @@ impl DataStore { .await } - /// Find regions for deleted volumes that do not have associated region - /// snapshots and are not being used by any other non-deleted volumes, and - /// return them for garbage collection + /// Find read/write regions for deleted volumes that do not have associated + /// region snapshots and are not being used by any other non-deleted + /// volumes, and return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option)> { + ) -> LookupResult { let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("find_deleted_volume_regions") .transaction(&conn, |conn| async move { @@ -1132,8 +1150,7 @@ impl DataStore { async fn find_deleted_volume_regions_in_txn( conn: &async_bb8_diesel::Connection, - ) -> Result)>, diesel::result::Error> - { + ) -> Result { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; @@ -1179,6 +1196,9 @@ impl DataStore { let mut deleted_regions = Vec::with_capacity(unfiltered_deleted_regions.len()); + let mut volume_set: HashSet = + HashSet::with_capacity(unfiltered_deleted_regions.len()); + for (dataset, region, region_snapshot, volume) in unfiltered_deleted_regions { @@ -1212,10 +1232,61 @@ impl DataStore { continue; } + if let Some(volume) = &volume { + 
volume_set.insert(volume.id()); + } + deleted_regions.push((dataset, region, volume)); } - Ok(deleted_regions) + let regions_for_deletion: HashSet = + deleted_regions.iter().map(|(_, region, _)| region.id()).collect(); + + let mut volumes = Vec::with_capacity(deleted_regions.len()); + + for volume_id in volume_set { + // Do not return a volume hard-deletion if there are still lingering + // read/write regions, unless all those lingering read/write regions + // will be deleted from the result of returning from this function. + let allocated_rw_regions: HashSet = + Self::get_allocated_regions_query(volume_id) + .get_results_async::<(Dataset, Region)>(conn) + .await? + .into_iter() + .filter_map(|(_, region)| { + if !region.read_only() { + Some(region.id()) + } else { + None + } + }) + .collect(); + + if allocated_rw_regions.is_subset(®ions_for_deletion) { + // If all the allocated rw regions for this volume are in the + // set of regions being returned for deletion, then we can + // hard-delete this volume. Read-only region accounting should + // have already been updated by soft-deleting this volume. + // + // Note: we'll be in this branch if allocated_rw_regions is + // empty. I believe the only time we'll hit this empty case is + // when the volume is fully populated with read-only resources + // (read-only regions and region snapshots). + volumes.push(volume_id); + } else { + // Not all r/w regions allocated to this volume are being + // deleted here, so we can't hard-delete the volume yet. + } + } + + Ok(FreedCrucibleResources { + datasets_and_regions: deleted_regions + .into_iter() + .map(|(d, r, _)| (d, r)) + .collect(), + + volumes, + }) } pub async fn read_only_resources_associated_with_volume( @@ -2621,8 +2692,11 @@ pub enum VolumeReplaceResult { // this call performed the replacement Done, - // the "existing" volume was deleted - ExistingVolumeDeleted, + // the "existing" volume was soft deleted + ExistingVolumeSoftDeleted, + + // the "existing" volume was hard deleted + ExistingVolumeHardDeleted, } impl DataStore { @@ -2747,14 +2821,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = @@ -3001,14 +3075,14 @@ impl DataStore { // perform the region replacement now, and this will short-circuit // the rest of the process. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeHardDeleted); }; if old_volume.time_deleted.is_some() { // Existing volume was soft-deleted, so return here for the same // reason: the region replacement process should be short-circuited // now. 
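+        // (A soft-deleted volume still has a row with time_deleted set,
+        // whereas a hard-deleted volume has no row at all, hence the two
+        // distinct result variants.)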
- return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + return Ok(VolumeReplaceResult::ExistingVolumeSoftDeleted); } let old_vcr: VolumeConstructionRequest = @@ -3682,7 +3756,15 @@ impl DataStore { let mut paginator = Paginator::new(SQL_BATCH_SIZE); let conn = self.pool_connection_authorized(opctx).await?; - let needle = address.to_string(); + let needle = match address { + SocketAddr::V4(_) => { + return Err(Error::internal_error(&format!( + "find_volumes_referencing_socket_addr not ipv6: {address}" + ))); + } + + SocketAddr::V6(addr) => addr, + }; while let Some(p) = paginator.next() { use db::schema::volume::dsl; @@ -3699,7 +3781,23 @@ impl DataStore { paginator = p.found_batch(&haystack, &|r| r.id()); for volume in haystack { - if volume.data().contains(&needle) { + let vcr: VolumeConstructionRequest = + match serde_json::from_str(&volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(Error::internal_error(&format!( + "cannot deserialize volume data for {}: {e}", + volume.id(), + ))); + } + }; + + let rw_reference = region_in_vcr(&vcr, &needle) + .map_err(|e| Error::internal_error(&e.to_string()))?; + let ro_reference = read_only_target_in_vcr(&vcr, &needle) + .map_err(|e| Error::internal_error(&e.to_string()))?; + + if rw_reference || ro_reference { volumes.push(volume); } } diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs index 7ea88c8542..598d9d77a2 100644 --- a/nexus/db-queries/src/db/datastore/volume_repair.rs +++ b/nexus/db-queries/src/db/datastore/volume_repair.rs @@ -11,6 +11,8 @@ use crate::db::datastore::RunnableQuery; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::VolumeRepair; +use crate::db::DbConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::result::DatabaseErrorKind; @@ -19,14 +21,75 @@ use omicron_common::api::external::Error; use uuid::Uuid; impl DataStore { - pub(super) fn volume_repair_insert_query( + /// Insert a volume repair record, taking a "lock" on the volume pointed to + /// by volume id with some repair id. + /// + /// If there exists a record that has a matching volume id and repair id, + /// return Ok(()). + /// + /// If there is no volume that matches the given volume id, return an error: + /// it should not be possible to lock a volume that does not exist! Note + /// that it is possible to lock a soft-deleted volume. + /// + /// If there is already an existing record that has a matching volume id but + /// a different repair id, then this function returns an Error::conflict. + pub(super) async fn volume_repair_insert_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, volume_id: Uuid, repair_id: Uuid, - ) -> impl RunnableQuery { + ) -> Result<(), diesel::result::Error> { use db::schema::volume_repair::dsl; - diesel::insert_into(dsl::volume_repair) + // If a lock that matches the arguments exists already, return Ok + // + // Note: if rerunning this function (for example if a saga node was + // rerun), the volume could have existed when this lock was inserted the + // first time, but have been deleted now. 
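+        // Because of that, the existing-lock check below deliberately runs
+        // before the volume-existence check: a rerun that finds its own lock
+        // returns Ok even if the volume has since been hard-deleted.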
+ let maybe_lock = dsl::volume_repair + .filter(dsl::repair_id.eq(repair_id)) + .filter(dsl::volume_id.eq(volume_id)) + .first_async::(conn) + .await + .optional()?; + + if maybe_lock.is_some() { + return Ok(()); + } + + // Do not allow a volume repair record to be created if the volume does + // not exist, or was hard-deleted! + let maybe_volume = Self::volume_get_impl(conn, volume_id).await?; + + if maybe_volume.is_none() { + return Err(err.bail(Error::invalid_request(format!( + "cannot create record: volume {volume_id} does not exist" + )))); + } + + // Do not check for soft-deletion here: We may want to request locks for + // soft-deleted volumes. + + match diesel::insert_into(dsl::volume_repair) .values(VolumeRepair { volume_id, repair_id }) + .execute_async(conn) + .await + { + Ok(_) => Ok(()), + + Err(e) => match e { + DieselError::DatabaseError( + DatabaseErrorKind::UniqueViolation, + ref error_information, + ) if error_information.constraint_name() + == Some("volume_repair_pkey") => + { + Err(err.bail(Error::conflict("volume repair lock"))) + } + + _ => Err(e), + }, + } } pub async fn volume_repair_lock( @@ -36,21 +99,25 @@ impl DataStore { repair_id: Uuid, ) -> Result<(), Error> { let conn = self.pool_connection_authorized(opctx).await?; - Self::volume_repair_insert_query(volume_id, repair_id) - .execute_async(&*conn) + let err = OptionalError::new(); + + self.transaction_retry_wrapper("volume_repair_lock") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::volume_repair_insert_in_txn( + &conn, err, volume_id, repair_id, + ) + .await + } + }) .await - .map(|_| ()) - .map_err(|e| match e { - DieselError::DatabaseError( - DatabaseErrorKind::UniqueViolation, - ref error_information, - ) if error_information.constraint_name() - == Some("volume_repair_pkey") => - { - Error::conflict("volume repair lock") + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - - _ => public_error_from_diesel(e, ErrorHandler::Server), }) } @@ -102,6 +169,7 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use omicron_test_utils::dev; + use sled_agent_client::types::VolumeConstructionRequest; #[tokio::test] async fn volume_lock_conflict_error_returned() { @@ -113,6 +181,20 @@ mod test { let lock_2 = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore.volume_repair_lock(&opctx, volume_id, lock_1).await.unwrap(); let err = datastore @@ -125,4 +207,55 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + /// Assert that you can't take a volume repair lock if the volume does not + /// exist yet! 
+ #[tokio::test] + async fn volume_lock_should_fail_without_volume() { + let logctx = + dev::test_setup_log("volume_lock_should_fail_without_volume"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_1 = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_repair_lock(&opctx, volume_id, lock_1) + .await + .unwrap_err(); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn volume_lock_relock_allowed() { + let logctx = dev::test_setup_log("volume_lock_relock_allowed"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let lock_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + datastore.volume_repair_lock(&opctx, volume_id, lock_id).await.unwrap(); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index ad39777054..f107f2af71 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -825,7 +825,7 @@ impl BackgroundTasksInitializer { done", period: config.region_snapshot_replacement_finish.period_secs, task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( - datastore, + datastore, sagas, )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], diff --git a/nexus/src/app/background/tasks/region_replacement.rs b/nexus/src/app/background/tasks/region_replacement.rs index 71b49c5a6f..caa262bc8c 100644 --- a/nexus/src/app/background/tasks/region_replacement.rs +++ b/nexus/src/app/background/tasks/region_replacement.rs @@ -23,6 +23,7 @@ use nexus_db_model::RegionReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionReplacementStatus; +use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use serde_json::json; @@ -42,7 +43,7 @@ impl RegionReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_replacement_start::Params { serialized_authn, request, @@ -135,15 +136,31 @@ impl BackgroundTask for RegionReplacementDetector { } Err(e) => { - let s = format!( - "error adding region replacement request for \ - region {} volume id {}: {e}", - region.id(), - region.volume_id(), - ); - error!(&log, "{s}"); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. 
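+                                // Skip this region for now: a later
+                                // activation of this task will try to create
+                                // the replacement request again once the
+                                // competing lock has been released.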
+ } + + _ => { + let s = format!( + "error adding region replacement \ + request for region {} volume id {}: \ + {e}", + region.id(), + region.volume_id(), + ); + error!(&log, "{s}"); + + status.errors.push(s); + } + } - status.errors.push(s); continue; } } @@ -174,7 +191,9 @@ impl BackgroundTask for RegionReplacementDetector { // If the replacement request is in the `requested` state and // the request's volume was soft-deleted or hard-deleted, avoid // sending the start request and instead transition the request - // to completed + // to completed. Note the saga will do the right thing if the + // volume is deleted, but this avoids the overhead of starting + // it. let volume_deleted = match self .datastore @@ -315,6 +334,21 @@ mod test { // Add a region replacement request for a fake region let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = RegionReplacement::new(Uuid::new_v4(), volume_id); let request_id = request.id; diff --git a/nexus/src/app/background/tasks/region_replacement_driver.rs b/nexus/src/app/background/tasks/region_replacement_driver.rs index e7fe0d6338..6cc28f9dfd 100644 --- a/nexus/src/app/background/tasks/region_replacement_driver.rs +++ b/nexus/src/app/background/tasks/region_replacement_driver.rs @@ -258,6 +258,7 @@ mod test { use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; use omicron_uuid_kinds::UpstairsSessionKind; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -288,6 +289,20 @@ mod test { let new_region_id = Uuid::new_v4(); let volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let request = { let mut request = RegionReplacement::new(region_id, volume_id); request.replacement_state = RegionReplacementState::Running; @@ -382,6 +397,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state ReplacementDone. Set the new_region_id to the region created // above. @@ -481,6 +510,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. 
let request = { @@ -630,6 +673,20 @@ mod test { .unwrap(); } + datastore + .volume_create(nexus_db_model::Volume::new( + old_region.volume_id(), + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: old_region.volume_id(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + // Add a region replacement request for that region, and change it to // state Running. Set the new_region_id to the region created above. let request = { diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 0eebd37fb7..61a84c579d 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -8,9 +8,15 @@ //! Once all related region snapshot replacement steps are done, the region //! snapshot replacement can be completed. +use crate::app::authn; use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas; +use crate::app::sagas::region_snapshot_replacement_finish::*; +use crate::app::sagas::NexusSaga; use futures::future::BoxFuture; use futures::FutureExt; +use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus; @@ -19,11 +25,31 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, + sagas: Arc, } impl RegionSnapshotReplacementFinishDetector { - pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore } + pub fn new(datastore: Arc, sagas: Arc) -> Self { + RegionSnapshotReplacementFinishDetector { datastore, sagas } + } + + async fn send_finish_request( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacement, + ) -> Result<(), omicron_common::api::external::Error> { + let params = sagas::region_snapshot_replacement_finish::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + request, + }; + + let saga_dag = SagaRegionSnapshotReplacementFinish::prepare(¶ms)?; + + // We only care that the saga was started, and don't wish to wait for it + // to complete, so use `StartSaga::saga_start`, rather than `saga_run`. 
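+        // The finish saga transitions the request record to Complete itself,
+        // so this task only needs to kick it off.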
+ self.sagas.saga_start(saga_dag).await?; + + Ok(()) } async fn transition_requests_to_done( @@ -120,21 +146,23 @@ impl RegionSnapshotReplacementFinishDetector { } }; - // Transition region snapshot replacement to Complete - match self - .datastore - .set_region_snapshot_replacement_complete(opctx, request.id) - .await - { + let request_id = request.id; + + match self.send_finish_request(opctx, request).await { Ok(()) => { - let s = format!("set request {} to done", request.id); + let s = format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" + ); + info!(&log, "{s}"); - status.records_set_to_done.push(s); + status.finish_invoked_ok.push(s); } Err(e) => { let s = format!( - "marking snapshot replacement as done failed: {e}" + "invoking region snapshot replacement finish for \ + {request_id} failed: {e}", ); error!(&log, "{s}"); status.errors.push(s); @@ -170,6 +198,7 @@ mod test { use nexus_db_queries::db::datastore::region_snapshot_replacement; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -186,8 +215,10 @@ mod test { datastore.clone(), ); - let mut task = - RegionSnapshotReplacementFinishDetector::new(datastore.clone()); + let mut task = RegionSnapshotReplacementFinishDetector::new( + datastore.clone(), + nexus.sagas.clone(), + ); // Noop test let result: RegionSnapshotReplacementFinishStatus = @@ -208,11 +239,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -232,6 +277,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -240,6 +286,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -267,14 +314,44 @@ mod test { let operating_saga_id = Uuid::new_v4(); + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_1 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_1.replacement_state = RegionSnapshotReplacementStepState::Complete; step_1.operating_saga_id = Some(operating_saga_id); let step_1_id = step_1.id; + let step_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut step_2 = - RegionSnapshotReplacementStep::new(request_id, Uuid::new_v4()); + RegionSnapshotReplacementStep::new(request_id, step_volume_id); step_2.replacement_state = RegionSnapshotReplacementStepState::Complete; step_2.operating_saga_id = Some(operating_saga_id); let step_2_id = step_2.id; @@ -335,8 +412,9 @@ mod test { assert_eq!( result, RegionSnapshotReplacementFinishStatus { - records_set_to_done: vec![format!( - "set request {request_id} to done" + finish_invoked_ok: vec![format!( + "region snapshot replacement finish invoked ok for \ + {request_id}" )], errors: vec![], }, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index db30e1d074..57bbf3741c 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -155,6 +155,7 @@ mod test { use nexus_db_model::RegionSnapshotReplacementState; use nexus_test_utils_macros::nexus_test; use omicron_uuid_kinds::DatasetUuid; + use sled_agent_client::types::VolumeConstructionRequest; use uuid::Uuid; type ControlPlaneTestContext = @@ -199,11 +200,25 @@ mod test { let request_1_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -219,11 +234,25 @@ mod test { let request_2_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 04d86a1268..f2b82a3943 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -23,6 +23,7 @@ use nexus_db_model::RegionSnapshotReplacement; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -40,7 +41,7 @@ impl RegionSnapshotReplacementDetector { &self, serialized_authn: authn::saga::Serialized, request: RegionSnapshotReplacement, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_start::Params { serialized_authn, request, @@ -138,17 +139,33 @@ impl RegionSnapshotReplacementDetector { } Err(e) => { - let s = - format!("error creating replacement request: {e}"); - - error!( - &log, - "{s}"; - "snapshot_id" => %region_snapshot.snapshot_id, - "region_id" => %region_snapshot.region_id, - "dataset_id" => %region_snapshot.dataset_id, - ); - status.errors.push(s); + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + let s = format!( + "error creating replacement request: {e}" + ); + + error!( + &log, + "{s}"; + "snapshot_id" => %region_snapshot.snapshot_id, + "region_id" => %region_snapshot.region_id, + "dataset_id" => %region_snapshot.dataset_id, + ); + + status.errors.push(s); + } + } } } } @@ -185,6 +202,67 @@ impl RegionSnapshotReplacementDetector { for request in requests { let request_id = request.id; + // If the region snapshot is gone, then there are no more references + // in any volume, and the whole region snapshot replacement can be + // fast-tracked to Complete. 
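+            // (The region snapshot record is removed during volume deletion
+            // once the last reference to it is dropped, so a missing record
+            // here means no volume references this snapshot any more.)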
+ + let maybe_region_snapshot = match self + .datastore + .region_snapshot_get( + request.old_dataset_id.into(), + request.old_region_id, + request.old_snapshot_id, + ) + .await + { + Ok(maybe_region_snapshot) => maybe_region_snapshot, + + Err(e) => { + let s = format!("query for region snapshot failed: {e}"); + + error!( + &log, + "{s}"; + "request.snapshot_id" => %request.old_snapshot_id, + "request.region_id" => %request.old_region_id, + "request.dataset_id" => %request.old_dataset_id, + ); + status.errors.push(s); + return; + } + }; + + if maybe_region_snapshot.is_none() { + match self + .datastore + .set_region_snapshot_replacement_complete_from_requested( + &opctx, request.id, + ) + .await + { + Ok(()) => { + let s = format!( + "region snapshot replacement {request_id} \ + completed ok" + ); + info!(&log, "{s}"); + status.requests_completed_ok.push(s); + } + + Err(e) => { + let s = format!( + "query to set region snapshot request state \ + to complete failed: {e}" + ); + + error!(&log, "{s}"; "request.id" => %request_id); + status.errors.push(s); + } + } + + continue; + } + let result = self .send_start_request( authn::saga::Serialized::for_opctx(opctx), @@ -269,6 +347,7 @@ mod test { use nexus_db_model::Snapshot; use nexus_db_model::SnapshotIdentity; use nexus_db_model::SnapshotState; + use nexus_db_model::VolumeResourceUsage; use nexus_db_queries::authz; use nexus_db_queries::db::lookup::LookupPath; use nexus_test_utils::resource_helpers::create_project; @@ -276,6 +355,8 @@ mod test { use omicron_common::api::external; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::GenericUuid; + use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::types::VolumeConstructionRequest; use std::collections::BTreeMap; use uuid::Uuid; @@ -309,19 +390,43 @@ mod test { // Add a region snapshot replacement request for a fake region snapshot - let request = RegionSnapshotReplacement::new( - DatasetUuid::new_v4(), // dataset id - Uuid::new_v4(), // region id - Uuid::new_v4(), // snapshot id + let dataset_id = DatasetUuid::new_v4(); + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::]:12345".to_string(), ); + datastore.region_snapshot_create(region_snapshot).await.unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -339,6 +444,7 @@ mod test { "region snapshot replacement start invoked ok for \ {request_id}" )], + requests_completed_ok: vec![], errors: vec![], }, ); @@ -407,6 +513,22 @@ mod test { .await .unwrap(); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .project_ensure_snapshot( &opctx, @@ -426,7 +548,7 @@ mod test { project_id, disk_id: Uuid::new_v4(), - volume_id: Uuid::new_v4(), + volume_id, destination_volume_id: Uuid::new_v4(), gen: Generation::new(), @@ -508,4 +630,194 @@ mod test { dataset_to_zpool.get(&first_zpool.id.to_string()).unwrap(); assert_eq!(&request.old_dataset_id.to_string(), dataset_id); } + + #[nexus_test(server = crate::Server)] + async fn test_delete_region_snapshot_replacement_volume_causes_complete( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementDetector::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStartStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // The volume reference counting machinery needs a fake dataset to exist + // (region snapshots are joined with the dataset table when creating the + // CrucibleResources object) + + let disk_test = DiskTest::new(cptestctx).await; + + let dataset_id = disk_test.zpools().next().unwrap().datasets[0].id; + + // Add a region snapshot replacement request for a fake region snapshot + + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + "[::1]:12345".to_string(), + ); + + datastore + .region_snapshot_create(region_snapshot.clone()) + .await + .unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + + let request_id = request.id; + + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + // the region snapshot + String::from("[::1]:12345"), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(!records.is_empty()); + assert_eq!(records[0].volume_id, volume_id); + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, request, volume_id, + ) + .await + .unwrap(); + + // Before the task starts, soft-delete the volume, and delete the + // region snapshot (like the volume delete saga would do). + + let crucible_resources = + datastore.soft_delete_volume(volume_id).await.unwrap(); + + // Assert no more usage + + let records = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id.into(), + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert!(records.is_empty()); + + // The region snapshot should have been returned for deletion + + let datasets_and_snapshots = + datastore.snapshots_to_delete(&crucible_resources).await.unwrap(); + + assert!(!datasets_and_snapshots.is_empty()); + + let region_snapshot_to_delete = &datasets_and_snapshots[0].1; + + assert_eq!( + region_snapshot_to_delete.dataset_id, + region_snapshot.dataset_id, + ); + assert_eq!( + region_snapshot_to_delete.region_id, + region_snapshot.region_id, + ); + assert_eq!( + region_snapshot_to_delete.snapshot_id, + region_snapshot.snapshot_id, + ); + + // So delete it! 
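+        // (This mirrors the region snapshot removal that the volume delete
+        // saga performs for the resources returned in CrucibleResources.)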
+ + datastore + .region_snapshot_remove( + region_snapshot_to_delete.dataset_id.into(), + region_snapshot_to_delete.region_id, + region_snapshot_to_delete.snapshot_id, + ) + .await + .unwrap(); + + // Activate the task - it should pick the request up but not attempt to + // run the start saga + + let result: RegionSnapshotReplacementStartStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + assert_eq!( + result, + RegionSnapshotReplacementStartStatus { + requests_created_ok: vec![], + start_invoked_ok: vec![], + requests_completed_ok: vec![format!( + "region snapshot replacement {request_id} completed ok" + )], + errors: vec![], + }, + ); + + // Assert start saga not invoked + assert_eq!(starter.count_reset(), 0); + } } diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index ac259ecba8..f481126312 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -32,10 +32,11 @@ use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::RegionSnapshotReplacementStep; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::datastore::region_snapshot_replacement; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use omicron_common::api::external::Error; use serde_json::json; use std::sync::Arc; @@ -53,7 +54,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let params = sagas::region_snapshot_replacement_step::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), request, @@ -70,7 +71,7 @@ impl RegionSnapshotReplacementFindAffected { &self, opctx: &OpContext, request: RegionSnapshotReplacementStep, - ) -> Result<(), omicron_common::api::external::Error> { + ) -> Result<(), Error> { let Some(old_snapshot_volume_id) = request.old_snapshot_volume_id else { // This state is illegal! @@ -79,9 +80,7 @@ impl RegionSnapshotReplacementFindAffected { request.id, ); - return Err(omicron_common::api::external::Error::internal_error( - &s, - )); + return Err(Error::internal_error(&s)); }; let params = @@ -315,6 +314,21 @@ impl RegionSnapshotReplacementFindAffected { // functions execute), an indefinite amount of work would be // created, continually "moving" the snapshot_addr from // temporary volume to temporary volume. + // + // If the volume was soft deleted, then skip making a step for + // it. + + if volume.time_deleted.is_some() { + info!( + log, + "volume was soft-deleted, skipping creating a step for \ + it"; + "request id" => ?request.id, + "volume id" => ?volume.id(), + ); + + continue; + } match self .datastore @@ -326,7 +340,7 @@ impl RegionSnapshotReplacementFindAffected { .await { Ok(insertion_result) => match insertion_result { - region_snapshot_replacement::InsertStepResult::Inserted { step_id } => { + InsertStepResult::Inserted { step_id } => { let s = format!("created {step_id}"); info!( log, @@ -337,7 +351,7 @@ impl RegionSnapshotReplacementFindAffected { status.step_records_created_ok.push(s); } - region_snapshot_replacement::InsertStepResult::AlreadyHandled { .. } => { + InsertStepResult::AlreadyHandled { .. 
} => { info!( log, "step already exists for volume id"; @@ -345,17 +359,32 @@ impl RegionSnapshotReplacementFindAffected { "volume id" => ?volume.id(), ); } - } + }, Err(e) => { let s = format!("error creating step request: {e}"); - error!( + warn!( log, "{s}"; "request id" => ?request.id, "volume id" => ?volume.id(), ); - status.errors.push(s); + + match e { + Error::Conflict { message } + if message.external_message() + == "volume repair lock" => + { + // This is not a fatal error! If there are + // competing region replacement and region + // snapshot replacements, then they are both + // attempting to lock volumes. + } + + _ => { + status.errors.push(s); + } + } } } } @@ -392,13 +421,81 @@ impl RegionSnapshotReplacementFindAffected { }; for request in step_requests { - let request_id = request.id; + let request_step_id = request.id; + + // Check if the volume was deleted _after_ the replacement step was + // created. Avoid launching the region snapshot replacement step + // saga if it was deleted: the saga will do the right thing if it is + // deleted, but this avoids the overhead of starting it. + + let volume_deleted = + match self.datastore.volume_deleted(request.volume_id).await { + Ok(volume_deleted) => volume_deleted, + + Err(e) => { + let s = format!( + "error checking if volume id {} was \ + deleted: {e}", + request.volume_id, + ); + error!(&log, "{s}"); + + status.errors.push(s); + continue; + } + }; + + if volume_deleted { + // Volume was soft or hard deleted, so proceed with clean up, + // which if this is in state Requested there won't be any + // additional associated state, so transition the record to + // Completed. + + info!( + &log, + "request {} step {} volume {} was soft or hard deleted!", + request.request_id, + request_step_id, + request.volume_id, + ); + + let result = self + .datastore + .set_region_snapshot_replacement_step_volume_deleted_from_requested( + opctx, request, + ) + .await; + + match result { + Ok(()) => { + let s = format!( + "request step {request_step_id} transitioned from \ + requested to volume_deleted" + ); + + info!(&log, "{s}"); + status.step_set_volume_deleted_ok.push(s); + } + + Err(e) => { + let s = format!( + "error transitioning {request_step_id} from \ + requested to complete: {e}" + ); + + error!(&log, "{s}"); + status.errors.push(s); + } + } + + continue; + } match self.send_start_request(opctx, request.clone()).await { Ok(()) => { let s = format!( "region snapshot replacement step saga invoked ok for \ - {request_id}" + {request_step_id}" ); info!( @@ -413,7 +510,7 @@ impl RegionSnapshotReplacementFindAffected { Err(e) => { let s = format!( "invoking region snapshot replacement step saga for \ - {request_id} failed: {e}" + {request_step_id} failed: {e}" ); error!( @@ -575,11 +672,25 @@ mod test { let request_id = request.id; + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), // not required to match! 
+ block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( - &opctx, - request, - Uuid::new_v4(), + &opctx, request, volume_id, ) .await .unwrap(); @@ -599,6 +710,7 @@ mod test { .unwrap(); let new_region_id = Uuid::new_v4(); + let new_region_volume_id = Uuid::new_v4(); let old_snapshot_volume_id = Uuid::new_v4(); datastore @@ -607,6 +719,7 @@ mod test { request_id, operating_saga_id, new_region_id, + new_region_volume_id, old_snapshot_volume_id, ) .await @@ -731,11 +844,27 @@ mod test { // Now, add some Complete records and make sure the garbage collection // saga is invoked. + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = @@ -747,16 +876,29 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. })); + + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); let result = datastore .insert_region_snapshot_replacement_step(&opctx, { let mut record = RegionSnapshotReplacementStep::new( Uuid::new_v4(), - Uuid::new_v4(), + volume_id, ); record.replacement_state = @@ -768,10 +910,7 @@ mod test { .await .unwrap(); - assert!(matches!( - result, - region_snapshot_replacement::InsertStepResult::Inserted { .. } - )); + assert!(matches!(result, InsertStepResult::Inserted { .. 
})); // Activate the task - it should pick the complete steps up and try to // run the region snapshot replacement step garbage collect saga diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 405a972976..d8ba6abbdd 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -39,6 +39,7 @@ pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; pub mod region_replacement_start; +pub mod region_snapshot_replacement_finish; pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; pub mod region_snapshot_replacement_step; @@ -173,7 +174,8 @@ fn make_action_registry() -> ActionRegistry { region_snapshot_replacement_start::SagaRegionSnapshotReplacementStart, region_snapshot_replacement_garbage_collect::SagaRegionSnapshotReplacementGarbageCollect, region_snapshot_replacement_step::SagaRegionSnapshotReplacementStep, - region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect + region_snapshot_replacement_step_garbage_collect::SagaRegionSnapshotReplacementStepGarbageCollect, + region_snapshot_replacement_finish::SagaRegionSnapshotReplacementFinish ]; #[cfg(test)] diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs index c7efa2f03f..2212e6fdf3 100644 --- a/nexus/src/app/sagas/region_replacement_finish.rs +++ b/nexus/src/app/sagas/region_replacement_finish.rs @@ -311,6 +311,20 @@ pub(crate) mod test { operating_saga_id: None, }; + datastore + .volume_create(nexus_db_model::Volume::new( + new_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: new_volume_id, + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_replacement_request(&opctx, request.clone()) .await diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index ce063dc5be..aa9e83c037 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -491,7 +491,7 @@ async fn srrs_get_old_region_address( async fn srrs_replace_region_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; @@ -555,8 +555,6 @@ async fn srrs_replace_region_in_volume( .await .map_err(ActionError::action_failed)?; - debug!(log, "replacement returned {:?}", volume_replace_region_result); - match volume_replace_region_result { VolumeReplaceResult::AlreadyHappened | VolumeReplaceResult::Done => { // The replacement was done either by this run of this saga node, or @@ -565,10 +563,11 @@ async fn srrs_replace_region_in_volume( // with the rest of the saga (to properly clean up allocated // resources). - Ok(()) + Ok(volume_replace_region_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Unwind the saga here to clean up the resources allocated during // this saga. The associated background task will transition this // request's state to Completed. 
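As context for the saga changes above and below: `srrs_replace_region_in_volume` now returns the `VolumeReplaceResult` it observed rather than discarding it, and the region replacement start saga unwinds only when the existing volume turns out to have been soft- or hard-deleted. A minimal, self-contained sketch of that decision follows (the enum mirrors `VolumeReplaceResult` from this patch; `should_unwind` and `main` are illustrative only and are not part of the patch):

    #[derive(Debug)]
    enum VolumeReplaceResult {
        AlreadyHappened,
        Done,
        ExistingVolumeSoftDeleted,
        ExistingVolumeHardDeleted,
    }

    // The start saga proceeds when the replacement happened (either on this
    // run of the node or a previous one), and unwinds when the volume it was
    // operating on has been soft- or hard-deleted, so that its compensating
    // actions can clean up the resources allocated earlier in the saga.
    fn should_unwind(result: &VolumeReplaceResult) -> bool {
        match result {
            VolumeReplaceResult::AlreadyHappened
            | VolumeReplaceResult::Done => false,

            VolumeReplaceResult::ExistingVolumeSoftDeleted
            | VolumeReplaceResult::ExistingVolumeHardDeleted => true,
        }
    }

    fn main() {
        assert!(!should_unwind(&VolumeReplaceResult::Done));
        assert!(should_unwind(&VolumeReplaceResult::ExistingVolumeHardDeleted));
    }
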
diff --git a/nexus/src/app/sagas/region_snapshot_replacement_finish.rs b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs new file mode 100644 index 0000000000..d992f753d6 --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_finish.rs @@ -0,0 +1,211 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! After the change to store a "new region volume" in the region snapshot +//! replacement request, that volume requires garbage collection before the +//! region snapshot replacement transitions to Complete. It's this saga's +//! responsibility to ensure that cleanup. This saga handles the following +//! region snapshot replacement request state transitions: +//! +//! ```text +//! Running <-- +//! | +//! | | +//! v | +//! | +//! Completing -- +//! +//! | +//! v +//! +//! Complete +//! ``` +//! +//! The first thing this saga does is set itself as the "operating saga" for the +//! request, and change the state to "Completing". Then, it performs the volume +//! delete sub-saga for the new region volume. Finally, it updates the region +//! snapshot replacement request by clearing the operating saga id and changing +//! the state to "Complete". +//! +//! Any unwind will place the state back into Running. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::sagas::declare_saga_actions; +use crate::app::sagas::volume_delete; +use crate::app::{authn, db}; +use serde::Deserialize; +use serde::Serialize; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement finish saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + pub request: db::model::RegionSnapshotReplacement, +} + +// region snapshot replacement finish saga: actions + +declare_saga_actions! { + region_snapshot_replacement_finish; + SET_SAGA_ID -> "unused_1" { + + rsrfs_set_saga_id + - rsrfs_set_saga_id_undo + } + UPDATE_REQUEST_RECORD -> "unused_4" { + + rsrfs_update_request_record + } +} + +// region snapshot replacement finish saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementFinish; +impl NexusSaga for SagaRegionSnapshotReplacementFinish { + const NAME: &'static str = "region-snapshot-replacement-finish"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_finish_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(Node::action( + "saga_id", + "GenerateSagaId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(set_saga_id_action()); + + if let Some(new_region_volume_id) = params.request.new_region_volume_id + { + let subsaga_params = volume_delete::Params { + serialized_authn: params.serialized_authn.clone(), + volume_id: new_region_volume_id, + }; + + let subsaga_dag = { + let subsaga_builder = steno::DagBuilder::new( + steno::SagaName::new(volume_delete::SagaVolumeDelete::NAME), + ); + volume_delete::SagaVolumeDelete::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? 
+ }; + + builder.append(Node::constant( + "params_for_volume_delete_subsaga", + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + "params_for_volume_delete_subsaga".to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "volume_delete_subsaga_no_result", + subsaga_dag, + "params_for_volume_delete_subsaga", + )); + } + + builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement finish saga: action implementations + +async fn rsrfs_set_saga_id( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Change the request record here to an intermediate "completing" state to + // block out other sagas that will be triggered for the same request. + osagactx + .datastore() + .set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrfs_set_saga_id_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + osagactx + .datastore() + .undo_set_region_snapshot_replacement_completing( + &opctx, + params.request.id, + saga_id, + ) + .await?; + + Ok(()) +} + +async fn rsrfs_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Update the replacement request record to 'Complete' and clear the + // operating saga id. There is no undo step for this, it should succeed + // idempotently. 
+ datastore + .set_region_snapshot_replacement_complete( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs index 762182724b..675b2b0cb3 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_garbage_collect.rs @@ -284,11 +284,27 @@ pub(crate) mod test { RegionSnapshotReplacementState::ReplacementDone; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); + let volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + datastore .insert_region_snapshot_replacement_request_with_volume_id( &opctx, request.clone(), - Uuid::new_v4(), + volume_id, ) .await .unwrap(); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 2092c3065a..bb5fd60209 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -139,6 +139,10 @@ declare_saga_actions! { NEW_REGION_ENSURE -> "ensured_dataset_and_region" { + rsrss_new_region_ensure } + NEW_REGION_VOLUME_CREATE -> "new_region_volume" { + + rsrss_new_region_volume_create + - rsrss_new_region_volume_create_undo + } GET_OLD_SNAPSHOT_VOLUME_ID -> "old_snapshot_volume_id" { + rsrss_get_old_snapshot_volume_id } @@ -183,12 +187,19 @@ impl NexusSaga for SagaRegionSnapshotReplacementStart { ACTION_GENERATE_ID.as_ref(), )); + builder.append(Node::action( + "new_region_volume_id", + "GenerateNewRegionVolumeId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(set_saga_id_action()); builder.append(get_alloc_region_params_action()); builder.append(alloc_new_region_action()); builder.append(find_new_region_action()); builder.append(new_region_ensure_undo_action()); builder.append(new_region_ensure_action()); + builder.append(new_region_volume_create_action()); builder.append(get_old_snapshot_volume_id_action()); builder.append(create_fake_volume_action()); builder.append(replace_snapshot_in_volume_action()); @@ -509,6 +520,94 @@ async fn rsrss_new_region_ensure_undo( Ok(()) } +async fn rsrss_new_region_volume_create( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + + let (new_dataset, ensured_region) = sagactx.lookup::<( + db::model::Dataset, + crucible_agent_client::types::Region, + )>( + "ensured_dataset_and_region", + )?; + + let Some(new_dataset_address) = new_dataset.address() else { + return Err(ActionError::action_failed(format!( + "dataset {} does not have an address!", + new_dataset.id(), + ))); + }; + + let new_region_address = SocketAddrV6::new( + *new_dataset_address.ip(), + ensured_region.port_number, + 0, + 0, + ); + + // Create a volume to inflate the reference count of the newly created + // read-only region. If this is not done it's possible that a user could + // delete the snapshot volume _after_ the new read-only region was swapped + // in, removing the last reference to it and causing garbage collection. 
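The comment above is the crux of why this node exists: Crucible cleanup is driven by reference counting over volume targets, so wrapping the freshly ensured read-only region in a volume of its own keeps the region alive even if the snapshot volume it gets swapped into is deleted first. The sketch below illustrates that accounting idea only; the types and counter are stand-ins, not the real volume reference accounting in the datastore.

```rust
// Toy model of target reference counting: a "volume" is just the set of
// region targets it names, and a region may only be cleaned up when no
// live volume references its target. Holding the new region in its own
// volume therefore pins it until that volume is deliberately deleted.
use std::collections::HashMap;

type Target = String; // e.g. "[fd00:1122:3344::1]:19001" (illustrative)

#[derive(Default)]
struct ReferenceCounts {
    by_target: HashMap<Target, usize>,
}

impl ReferenceCounts {
    fn volume_created(&mut self, targets: &[Target]) {
        for t in targets {
            *self.by_target.entry(t.clone()).or_insert(0) += 1;
        }
    }

    fn volume_deleted(&mut self, targets: &[Target]) {
        for t in targets {
            if let Some(count) = self.by_target.get_mut(t) {
                *count = count.saturating_sub(1);
            }
        }
    }

    fn region_can_be_cleaned_up(&self, target: &Target) -> bool {
        self.by_target.get(target).copied().unwrap_or(0) == 0
    }
}

fn main() {
    let new_region: Target = "[fd00:1122:3344::1]:19001".to_string();
    let mut refs = ReferenceCounts::default();

    // The snapshot volume has the new read-only region swapped in, and the
    // start saga also creates a dedicated volume naming the same target.
    refs.volume_created(&[new_region.clone()]); // snapshot volume
    refs.volume_created(&[new_region.clone()]); // new region volume

    // A user deletes the snapshot: the region must not be cleaned up yet.
    refs.volume_deleted(&[new_region.clone()]);
    assert!(!refs.region_can_be_cleaned_up(&new_region));

    // Only once the dedicated volume is deleted does the count reach zero.
    refs.volume_deleted(&[new_region.clone()]);
    assert!(refs.region_can_be_cleaned_up(&new_region));
}
```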
+ + let volume_construction_request = VolumeConstructionRequest::Volume { + id: new_region_volume_id, + block_size: 0, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: new_region_volume_id, + target: vec![new_region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&volume_construction_request) + .map_err(|e| { + ActionError::action_failed(Error::internal_error(&e.to_string())) + })?; + + let volume = db::model::Volume::new(new_region_volume_id, volume_data); + + osagactx + .datastore() + .volume_create(volume) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrss_new_region_volume_create_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + + // Delete the volume. + + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + osagactx.datastore().volume_hard_delete(new_region_volume_id).await?; + + Ok(()) +} + async fn rsrss_get_old_snapshot_volume_id( sagactx: NexusActionContext, ) -> Result { @@ -695,7 +794,7 @@ async fn get_replace_params( async fn rsrss_replace_snapshot_in_volume( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let log = sagactx.user_data().log(); let osagactx = sagactx.user_data(); @@ -729,10 +828,11 @@ async fn rsrss_replace_snapshot_in_volume( // if the transaction occurred on the non-deleted volume so proceed // with the rest of the saga. - Ok(()) + Ok(volume_replace_snapshot_result) } - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // If the snapshot volume was deleted, we still want to proceed with // replacing the rest of the uses of the region snapshot. Note this // also covers the case where this saga node runs (performing the @@ -741,7 +841,7 @@ async fn rsrss_replace_snapshot_in_volume( // deleted. If this saga unwound here, that would violate the // property of idempotency. - Ok(()) + Ok(volume_replace_snapshot_result) } } } @@ -815,6 +915,9 @@ async fn rsrss_update_request_record( let old_region_volume_id = sagactx.lookup::("new_volume_id")?; + let new_region_volume_id = + sagactx.lookup::("new_region_volume_id")?; + // Now that the region has been ensured and the construction request has // been updated, update the replacement request record to 'ReplacementDone' // and clear the operating saga id. There is no undo step for this, it @@ -825,6 +928,7 @@ async fn rsrss_update_request_record( params.request.id, saga_id, new_region_id, + new_region_volume_id, old_region_volume_id, ) .await @@ -1026,8 +1130,10 @@ pub(crate) mod test { .await .unwrap(); - assert_eq!(volumes.len(), 1); - assert_eq!(volumes[0].id(), db_snapshot.volume_id); + assert!(volumes + .iter() + .map(|v| v.id()) + .any(|vid| vid == db_snapshot.volume_id)); } fn new_test_params( diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index 7b1d598861..a236fcf62c 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -375,7 +375,8 @@ async fn rsrss_replace_snapshot_in_volume( // with the saga. 
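A note on the return-type change above: `rsrss_replace_snapshot_in_volume` now records the `VolumeReplaceResult` as the node's output instead of discarding it, which is what lets a downstream node (the step saga's notification step further below) decide whether there is still a live volume worth talking to. This is a minimal sketch of that shape with simplified stand-in names, not the real saga context or enum.

```rust
// Simplified stand-ins: the real saga stores the replacement result as the
// output of the "replace snapshot in volume" node and a later node reads it
// back with sagactx.lookup::<VolumeReplaceResult>(...).
#[derive(Debug, Clone, PartialEq)]
enum ReplaceResult {
    Replaced,
    ExistingVolumeSoftDeleted,
    ExistingVolumeHardDeleted,
}

fn replace_snapshot_in_volume() -> Result<ReplaceResult, String> {
    // ... perform the volume replace transaction ...
    Ok(ReplaceResult::Replaced)
}

fn notify_upstairs(result: &ReplaceResult) -> Result<(), String> {
    // If the associated volume was deleted there is no Upstairs to talk to:
    // skip the notification but keep going so the request still completes.
    if matches!(
        result,
        ReplaceResult::ExistingVolumeSoftDeleted
            | ReplaceResult::ExistingVolumeHardDeleted
    ) {
        return Ok(());
    }

    // ... best-effort notification of any Propolis using the volume ...
    Ok(())
}

fn main() -> Result<(), String> {
    let result = replace_snapshot_in_volume()?;
    notify_upstairs(&result)
}
```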
} - VolumeReplaceResult::ExistingVolumeDeleted => { + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted => { // Proceed with the saga but skip the notification step. } } @@ -423,6 +424,20 @@ async fn rsrss_notify_upstairs( let params = sagactx.saga_params::()?; let log = sagactx.user_data().log(); + // If the associated volume was deleted, then skip this notification step as + // there is no Upstairs to talk to. Continue with the saga to transition the + // step request to Complete, and then perform the associated clean up. + + let volume_replace_snapshot_result = sagactx + .lookup::("volume_replace_snapshot_result")?; + if matches!( + volume_replace_snapshot_result, + VolumeReplaceResult::ExistingVolumeSoftDeleted + | VolumeReplaceResult::ExistingVolumeHardDeleted + ) { + return Ok(()); + } + // Make an effort to notify a Propolis if one was booted for this volume. // This is best effort: if there is a failure, this saga will unwind and be // triggered again for the same request. If there is no Propolis booted for diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs index b83f917a70..15c6a39651 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs @@ -187,8 +187,24 @@ pub(crate) mod test { .await .unwrap(); + let step_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], // nothing needed here + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + let mut request = - RegionSnapshotReplacementStep::new(Uuid::new_v4(), Uuid::new_v4()); + RegionSnapshotReplacementStep::new(Uuid::new_v4(), step_volume_id); request.replacement_state = RegionSnapshotReplacementStepState::Complete; request.old_snapshot_volume_id = Some(old_snapshot_volume_id); diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index a9be1f34ee..a8ded4e33c 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -27,12 +27,9 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; use crate::app::sagas::declare_saga_actions; -use nexus_db_model::Dataset; -use nexus_db_model::Region; -use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; -use nexus_types::identity::Asset; +use nexus_db_queries::db::datastore::FreedCrucibleResources; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -330,8 +327,6 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; - /// Deleting region snapshots in a previous saga node may have freed up regions /// that were deleted in the DB but couldn't be deleted by the Crucible Agent /// because a snapshot existed. Look for those here. These will be a different @@ -417,7 +412,7 @@ type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// another snapshot delete. 
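The helper type that used to be defined here (`FreedCrucibleRegions`) is replaced by the datastore's `FreedCrucibleResources`, and the deletion node below now works in two passes: first the freed regions are deleted at their Crucible agents, then the volume records whose only purpose was to hold references are hard-deleted. The sketch below condenses that two-phase shape using hypothetical stand-in types rather than the real datastore models.

```rust
// Stand-in types: FreedResources mirrors the shape used below - a batch of
// (dataset, region) pairs to delete at the Crucible agents, plus volume
// record ids to hard-delete once the regions are gone.
use uuid::Uuid;

struct DatasetInfo {
    id: Uuid,
}

struct RegionInfo {
    id: Uuid,
}

struct FreedResources {
    datasets_and_regions: Vec<(DatasetInfo, RegionInfo)>,
    volumes: Vec<Uuid>,
}

fn delete_freed_resources(freed: &FreedResources) {
    // Pass 1: ask each freed region's Crucible agent to delete it.
    for (dataset, region) in &freed.datasets_and_regions {
        println!("delete region {} on dataset {}", region.id, dataset.id);
    }

    // Pass 2: hard-delete the volume records that only existed to hold
    // references to those regions.
    for volume_id in &freed.volumes {
        println!("volume_hard_delete {volume_id}");
    }
}

fn main() {
    delete_freed_resources(&FreedResources {
        datasets_and_regions: vec![(
            DatasetInfo { id: Uuid::new_v4() },
            RegionInfo { id: Uuid::new_v4() },
        )],
        volumes: vec![Uuid::new_v4()],
    });
}
```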
async fn svd_find_freed_crucible_regions( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); // Find regions freed up for deletion by a previous saga node deleting the @@ -432,11 +427,7 @@ async fn svd_find_freed_crucible_regions( }, )?; - // Don't serialize the whole Volume, as the data field contains key material! - Ok(freed_datasets_regions_and_volumes - .into_iter() - .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) - .collect()) + Ok(freed_datasets_regions_and_volumes) } async fn svd_delete_freed_crucible_regions( @@ -448,9 +439,11 @@ async fn svd_delete_freed_crucible_regions( // Find regions freed up for deletion by a previous saga node deleting the // region snapshots. let freed_datasets_regions_and_volumes = - sagactx.lookup::("freed_crucible_regions")?; + sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { + for (dataset, region) in + &freed_datasets_regions_and_volumes.datasets_and_regions + { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -477,18 +470,17 @@ async fn svd_delete_freed_crucible_regions( e, )) })?; + } - // Remove volume DB record - if let Some(volume_id) = volume_id { - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; - } + for volume_id in &freed_datasets_regions_and_volumes.volumes { + osagactx.datastore().volume_hard_delete(*volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; } Ok(()) diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 32a2f24d9d..7c8857123c 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -305,7 +305,7 @@ pub async fn run_region_snapshot_replacement_finish( assert!(status.errors.is_empty()); - status.records_set_to_done.len() + status.finish_invoked_ok.len() } /// Run all replacement related background tasks until they aren't doing diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 5f7e92c393..c301372d5f 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -5,9 +5,12 @@ //! 
Tests related to region and region snapshot replacement use dropshot::test_util::ClientTestContext; +use nexus_client::types::LastResult; use nexus_db_model::PhysicalDiskPolicy; use nexus_db_model::RegionReplacementState; +use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::region_snapshot_replacement::*; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; use nexus_test_utils::background::*; @@ -15,11 +18,22 @@ use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::resource_helpers::create_default_ip_pool; use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; +use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; +use nexus_types::external_api::params; +use nexus_types::external_api::views; +use nexus_types::identity::Asset; +use nexus_types::internal_api::background::*; +use omicron_common::api::external; +use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::GenericUuid; use slog::Logger; +use std::collections::HashSet; +use std::net::SocketAddr; use std::sync::Arc; use uuid::Uuid; @@ -40,12 +54,37 @@ fn get_disk_url(disk_name: &str) -> String { format!("/v1/disks/{disk_name}?project={}", PROJECT_NAME) } +fn get_disks_url() -> String { + format!("/v1/disks?project={}", PROJECT_NAME) +} + +fn get_snapshot_url(snapshot_name: &str) -> String { + format!("/v1/snapshots/{snapshot_name}?project={}", PROJECT_NAME) +} + +fn get_snapshots_url() -> String { + format!("/v1/snapshots?project={}", PROJECT_NAME) +} + async fn create_project_and_pool(client: &ClientTestContext) -> Uuid { create_default_ip_pool(client).await; let project = create_project(client, PROJECT_NAME).await; project.identity.id } +async fn collection_list( + client: &ClientTestContext, + list_url: &str, +) -> Vec +where + T: Clone + serde::de::DeserializeOwned, +{ + NexusRequest::iter_collection_authn(client, list_url, "", None) + .await + .expect("failed to list") + .all_items +} + /// Assert that the first part of region replacement does not create a freed /// crucible region (that would be picked up by a volume delete saga) #[nexus_test] @@ -118,280 +157,355 @@ async fn test_region_replacement_does_not_create_freed_region( assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); } -struct RegionReplacementDeletedVolumeTest<'a> { - log: Logger, - datastore: Arc, - disk_test: DiskTest<'a>, - client: ClientTestContext, - internal_client: ClientTestContext, - replacement_request_id: Uuid, -} - -#[derive(Debug)] -struct ExpectedEndState(pub RegionReplacementState); - -#[derive(Debug)] -struct ExpectedIntermediateState(pub RegionReplacementState); - -impl<'a> RegionReplacementDeletedVolumeTest<'a> { - pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { - let nexus = &cptestctx.server.server_context().nexus; - - // Create four zpools, each with one dataset. This is required for - // region and region snapshot replacement to have somewhere to move the - // data. 
- let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(cptestctx.first_sled()) - .with_zpool_count(4) - .build() - .await; - - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - let datastore = nexus.datastore().clone(); - - let opctx = OpContext::for_tests( - cptestctx.logctx.log.new(o!()), - datastore.clone(), - ); - - // Create a disk - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, "disk").await; - - // Manually create the region replacement request for the first - // allocated region of that disk - - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() - .await - .unwrap(); +mod region_replacement { + use super::*; - assert_eq!(db_disk.id(), disk.identity.id); + #[derive(Debug)] + struct ExpectedEndState(pub RegionReplacementState); - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionReplacementState); - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); + #[derive(Debug)] + struct ExpectedStartState(pub RegionReplacementState); - // Assert the request is in state Requested + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + } - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move + // the data. 
+ let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + // Manually create the region replacement request for the first + // allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let replacement_request_id = datastore + .create_region_replacement_request_for_region(&opctx, ®ion) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Requested, + ); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), replacement_request_id, - ) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); - - RegionReplacementDeletedVolumeTest { - log: cptestctx.logctx.log.new(o!()), - datastore, - disk_test, - client: client.clone(), - internal_client: internal_client.clone(), - replacement_request_id, + } } - } - pub fn opctx(&self) -> OpContext { - OpContext::for_tests(self.log.clone(), self.datastore.clone()) - } + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } - pub async fn delete_the_disk(&self) { - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(&self.client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); - } + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } - /// Make sure: - /// - /// - all region replacement related background tasks run to completion - /// - this harness' region replacement request has transitioned to Complete - /// - no Crucible resources are leaked - pub async fn finish_test(&self) { - // Make sure that all the background tasks can run to completion. + /// Make sure: + /// + /// - all region replacement related background tasks run to completion + /// - this harness' region replacement request has transitioned to + /// Complete + /// - no Crucible resources are leaked + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. 
- run_replacement_tasks_to_completion(&self.internal_client).await; + run_replacement_tasks_to_completion(&self.internal_client).await; - // Assert the request is in state Complete + // Assert the request is in state Complete - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, - ) - .await - .unwrap(); + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Complete, + ); - // Assert there are no more Crucible resources + // Assert there are no more Crucible resources - assert!(self.disk_test.crucible_resources_deleted().await); - } + assert!(self.disk_test.crucible_resources_deleted().await); + } - async fn wait_for_request_state( - &self, - expected_end_state: ExpectedEndState, - expected_intermediate_state: ExpectedIntermediateState, - ) { - wait_for_condition( - || { - let datastore = self.datastore.clone(); - let opctx = self.opctx(); - let replacement_request_id = self.replacement_request_id; - - async move { - let region_replacement = datastore - .get_region_replacement_request_by_id( - &opctx, - replacement_request_id, - ) - .await - .unwrap(); - - let state = region_replacement.replacement_state; - - if state == expected_end_state.0 { - // The saga transitioned the request ok - Ok(()) - } else if state == expected_intermediate_state.0 { - // The saga is still running - Err(CondCheckError::<()>::NotYet) - } else { - // Any other state is not expected - panic!("unexpected state {state:?}!"); + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + // If the expected start and end state are the same + // (i.e. there's a back edge in the associated request's + // state machine), then it's impossible to determine + // when the saga starts and stops based on the state. + if expected_end_state.0 == expected_start_state.0 { + if state == expected_end_state.0 { + // The saga transitioned the request ok, or + // hasn't started yet. Either way we have to + // return here, and the call site should perform + // an additional check for some associated + // expected result. + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } else { + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 + || state == expected_start_state.0 + { + // The saga is still running, or hasn't started + // yet. 
+ Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } } - } - }, - &std::time::Duration::from_millis(500), - &std::time::Duration::from_secs(60), - ) - .await - .expect("request transitioned to expected state"); - - // Assert the request state - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), ) .await - .unwrap(); - - assert_eq!(region_replacement.replacement_state, expected_end_state.0); - } - - /// Run the "region replacement" task to transition the request to Running. - pub async fn transition_request_to_running(&self) { - // Activate the "region replacement" background task - - run_region_replacement(&self.internal_client).await; + .expect("request transitioned to expected state"); + + // Assert the request state + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + expected_end_state.0 + ); + } - // The activation above could only have started the associated saga, so - // wait until the request is in state Running. + /// Run the "region replacement" task to transition the request to + /// Running. + pub async fn transition_request_to_running(&self) { + // Activate the "region replacement" background task - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Allocating), - ) - .await; - } + run_region_replacement(&self.internal_client).await; - /// Call the region replacement drive task to attach the associated volume - /// to the simulated pantry, ostensibly for reconciliation - pub async fn attach_request_volume_to_pantry(&self) { - // Run the "region replacement driver" task to attach the associated - // volume to the simulated pantry. + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. - run_region_replacement_driver(&self.internal_client).await; + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Allocating), + ExpectedStartState(RegionReplacementState::Requested), + ) + .await; + } - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. + /// Call the region replacement drive task to attach the associated volume + /// to the simulated pantry, ostensibly for reconciliation + pub async fn attach_request_volume_to_pantry(&self) { + // Run the "region replacement driver" task to attach the associated + // volume to the simulated pantry. - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::Running), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + run_region_replacement_driver(&self.internal_client).await; - // Additionally, assert that the drive saga recorded that it sent the - // attachment request to the simulated pantry + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. 
- let most_recent_step = self - .datastore - .current_region_replacement_request_step( - &self.opctx(), - self.replacement_request_id, + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), ) - .await - .unwrap() - .unwrap(); - - assert!(most_recent_step.pantry_address().is_some()); - } + .await; - /// Manually activate the background attachment for the request volume - pub async fn manually_activate_attached_volume( - &self, - cptestctx: &'a ControlPlaneTestContext, - ) { - let pantry = - cptestctx.sled_agent.pantry_server.as_ref().unwrap().pantry.clone(); - - let region_replacement = self - .datastore - .get_region_replacement_request_by_id( - &self.opctx(), - self.replacement_request_id, + // Additionally, assert that the drive saga recorded that it sent + // the attachment request to the simulated pantry. + // + // If `wait_for_request_state` has the same expected start and end + // state (as it does above), it's possible to exit that function + // having not yet started the saga yet, and this requires an + // additional `wait_for_condition` to wait for the expected recorded + // step. + + let most_recent_step = wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + match datastore + .current_region_replacement_request_step( + &opctx, + replacement_request_id, + ) + .await + .unwrap() + { + Some(step) => Ok(step), + + None => { + // The saga either has not started yet or is + // still running - see the comment before this + // check for more info. + Err(CondCheckError::<()>::NotYet) + } + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), ) .await - .unwrap(); + .expect("most recent step"); - pantry - .activate_background_attachment( - region_replacement.volume_id.to_string(), - ) - .await - .unwrap(); - } + assert!(most_recent_step.pantry_address().is_some()); + } - /// Transition request to ReplacementDone via the region replacement drive - /// saga - pub async fn transition_request_to_replacement_done(&self) { - // Run the "region replacement driver" task + /// Manually activate the background attachment for the request volume + pub async fn manually_activate_attached_volume( + &self, + cptestctx: &'a ControlPlaneTestContext, + ) { + let pantry = cptestctx + .sled_agent + .pantry_server + .as_ref() + .unwrap() + .pantry + .clone(); + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + pantry + .activate_background_attachment( + region_replacement.volume_id.to_string(), + ) + .await + .unwrap(); + } - run_region_replacement_driver(&self.internal_client).await; + /// Transition request to ReplacementDone via the region replacement + /// drive saga + pub async fn transition_request_to_replacement_done(&self) { + // Run the "region replacement driver" task - // The activation above could only have started the associated saga, so - // wait until the request is in the expected end state. 
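The comment and the extra `wait_for_condition` above deal with a genuine ambiguity: because the drive saga moves the request from Running through Driving and back to Running (a back edge in its state machine), observing `Running` alone cannot distinguish "the saga finished" from "the saga has not started yet", so the harness also polls for a durable side effect of the saga: the recorded step holding a pantry address. The toy illustration below captures that idea with made-up types, not the real request model.

```rust
// Toy illustration: with a back edge Running -> Driving -> Running, the
// state alone is ambiguous, so progress is judged by a recorded artifact.
#[derive(Debug, Clone, Copy, PartialEq)]
enum DriveState {
    Running,
    Driving,
}

struct DriveRequest {
    state: DriveState,
    // Durable evidence that a drive saga actually ran (the recorded step
    // with a pantry address plays this role in the real test).
    recorded_step: Option<String>,
}

enum Poll<T> {
    Ready(T),
    NotYet,
}

fn check_drive_progress(req: &DriveRequest) -> Poll<String> {
    match (&req.state, &req.recorded_step) {
        // Running plus a recorded step: the saga ran and came back around.
        (DriveState::Running, Some(step)) => Poll::Ready(step.clone()),
        // Running with no step yet: could mean "not started" - keep waiting.
        (DriveState::Running, None) => Poll::NotYet,
        // Mid-flight.
        (DriveState::Driving, _) => Poll::NotYet,
    }
}
```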
+ run_region_replacement_driver(&self.internal_client).await; - self.wait_for_request_state( - ExpectedEndState(RegionReplacementState::ReplacementDone), - ExpectedIntermediateState(RegionReplacementState::Driving), - ) - .await; + // The activation above could only have started the associated saga, + // so wait until the request is in the expected end state. + + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::ReplacementDone), + ExpectedIntermediateState(RegionReplacementState::Driving), + ExpectedStartState(RegionReplacementState::Running), + ) + .await; + } } } @@ -401,7 +515,8 @@ impl<'a> RegionReplacementDeletedVolumeTest<'a> { async fn test_delete_volume_region_replacement_state_requested( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: delete the // disk, then finish the test. @@ -417,7 +532,8 @@ async fn test_delete_volume_region_replacement_state_requested( async fn test_delete_volume_region_replacement_state_running( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -437,7 +553,8 @@ async fn test_delete_volume_region_replacement_state_running( async fn test_delete_volume_region_replacement_state_running_on_pantry( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -459,7 +576,8 @@ async fn test_delete_volume_region_replacement_state_running_on_pantry( async fn test_delete_volume_region_replacement_state_replacement_done( cptestctx: &ControlPlaneTestContext, ) { - let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + let test_harness = + region_replacement::DeletedVolumeTest::new(cptestctx).await; // The request leaves the `new` function in state Requested: // - transition the request to "Running" @@ -481,3 +599,1109 @@ async fn test_delete_volume_region_replacement_state_replacement_done( test_harness.finish_test().await; } + +/// Assert that the problem experienced in issue 6353 is fixed +#[nexus_test] +async fn test_racing_replacements_for_soft_deleted_disk_volume( + cptestctx: &ControlPlaneTestContext, +) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create four zpools, each with one dataset. This is required for region + // and region snapshot replacement to have somewhere to move the data. 
+ let sled_id = cptestctx.first_sled(); + let mut disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(sled_id) + .with_zpool_count(4) + .build() + .await; + + // Create a disk, then a snapshot of that disk + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); + + let snapshot: views::Snapshot = object_create( + client, + &snapshots_url, + ¶ms::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: "snapshot".parse().unwrap(), + description: String::from("a snapshot"), + }, + disk: disk.identity.name.into(), + }, + ) + .await; + + // Before deleting the disk, save the DB model + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + // Next, expunge a physical disk that contains a region snapshot (which + // means it'll have the region too) + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let (dataset, region) = &disk_allocated_regions[0]; + let zpool = disk_test + .zpools() + .find(|x| *x.id.as_untyped_uuid() == dataset.pool_id) + .expect("Expected at least one zpool"); + + let (_, db_zpool) = LookupPath::new(&opctx, datastore) + .zpool_id(zpool.id.into_untyped_uuid()) + .fetch() + .await + .unwrap(); + + datastore + .physical_disk_update_policy( + &opctx, + db_zpool.physical_disk_id.into(), + PhysicalDiskPolicy::Expunged, + ) + .await + .unwrap(); + + // Only one region snapshot should be been returned by the following call + // due to the allocation policy. + + let expunged_region_snapshots = datastore + .find_region_snapshots_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + assert_eq!(expunged_region_snapshots.len(), 1); + + for expunged_region_snapshot in expunged_region_snapshots { + assert_eq!(expunged_region_snapshot.snapshot_id, snapshot.identity.id); + } + + // Either one or two regions can be returned, depending on if the snapshot + // destination volume was allocated on to the physical disk that was + // expunged. + + let expunged_regions = datastore + .find_regions_on_expunged_physical_disks(&opctx) + .await + .unwrap(); + + match expunged_regions.len() { + 1 => { + assert_eq!(expunged_regions[0].id(), region.id()); + } + + 2 => { + assert!(expunged_regions.iter().any(|r| r.id() == region.id())); + + let (.., db_snapshot) = LookupPath::new(&opctx, datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let snapshot_allocated_datasets_and_regions = datastore + .get_allocated_regions(db_snapshot.destination_volume_id) + .await + .unwrap(); + + let snapshot_allocated_regions: Vec = + snapshot_allocated_datasets_and_regions + .into_iter() + .map(|(_, r)| r.id()) + .collect(); + + assert!(expunged_regions.iter().any(|region| { + snapshot_allocated_regions.contains(®ion.id()) + })); + } + + _ => { + panic!("unexpected number of expunged regions!"); + } + } + + // Now, race the region replacement with the region snapshot replacement: + // + // 1) region replacement will allocate a new region and swap it into the + // disk volume. 
+ + let internal_client = &cptestctx.internal_client; + + let _ = + activate_background_task(&internal_client, "region_replacement").await; + + // After that task invocation, there should be one running region + // replacement for the disk's region. Filter out the replacement request for + // the snapshot destination volume if it's there. The above background task + // only starts the associated saga, so wait for it to complete. + + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + if region_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_replacements: Vec<_> = datastore + .get_running_region_replacements(&opctx) + .await + .unwrap() + .into_iter() + .filter(|x| x.old_region_id == region.id()) + .collect(); + + assert_eq!(region_replacements.len(), 1); + + // 2) region snapshot replacement start will replace the region snapshot in + // the snapshot volume + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_start", + ) + .await; + + // After that, there should be one "replacement done" region snapshot + // replacement for the associated region snapshot. The above background task + // only starts the associated saga, so wait for it to complete. + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + if region_snapshot_replacements.len() == 1 { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(20), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacements = datastore + .get_replacement_done_region_snapshot_replacements(&opctx) + .await + .unwrap(); + + assert_eq!(region_snapshot_replacements.len(), 1); + assert_eq!( + region_snapshot_replacements[0].old_dataset_id, + dataset.id().into() + ); + assert_eq!(region_snapshot_replacements[0].old_region_id, region.id()); + assert_eq!( + region_snapshot_replacements[0].old_snapshot_id, + snapshot.identity.id + ); + assert_eq!( + region_snapshot_replacements[0].replacement_state, + RegionSnapshotReplacementState::ReplacementDone, + ); + + assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); + + // 3) Delete the disk + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // The volume should be soft-deleted now. The region snapshot replacement + // swapped out the region snapshot from the snapshot volume to the temporary + // volume for later deletion, but has not actually deleted that temporary + // volume yet, so the count will not have gone to 0. 
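What the assertions just below check is the difference between a soft-deleted and a hard-deleted volume: deleting the disk sets `time_deleted` on its volume record but keeps the row around while the region snapshots it references are still being accounted for, and only once those references are gone can the record be removed entirely. The following is a rough, self-contained sketch of that life cycle; the fields are hypothetical, and the real accounting lives in the volume tables.

```rust
// Toy model of the volume life cycle checked by this test: deleting a disk
// soft-deletes its volume (time_deleted set, references retained); the row
// is only hard-deleted once the resources it referenced are released.
use std::time::Instant;

struct VolumeRecord {
    time_deleted: Option<Instant>,
    // Crucible resources (regions / region snapshots) still accounted to
    // this volume in this simplified model.
    outstanding_references: usize,
}

#[derive(Debug)]
enum VolumeState {
    Live,
    SoftDeleted,
    ReadyForHardDelete,
}

fn classify(volume: &VolumeRecord) -> VolumeState {
    match (volume.time_deleted, volume.outstanding_references) {
        (None, _) => VolumeState::Live,
        (Some(_), 0) => VolumeState::ReadyForHardDelete,
        (Some(_), _) => VolumeState::SoftDeleted,
    }
}

fn main() {
    let soft_deleted = VolumeRecord {
        time_deleted: Some(Instant::now()),
        outstanding_references: 2,
    };
    assert!(matches!(classify(&soft_deleted), VolumeState::SoftDeleted));
}
```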
+ + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + assert!(volume.unwrap().time_deleted.is_some()); + + // 4) region snapshot replacement garbage collect will delete the temporary + // volume with the stashed reference to the region snapshot, bringing the + // reference count to zero. + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_garbage_collection", + ) + .await; + + // Assert the region snapshot was deleted. + assert!(datastore + .region_snapshot_get(dataset.id(), region.id(), snapshot.identity.id) + .await + .unwrap() + .is_none()); + + // Assert that the disk's volume is still only soft-deleted, because the two + // other associated region snapshots still exist. + let volume = datastore.volume_get(db_disk.volume_id).await.unwrap(); + assert!(volume.is_some()); + + // Check on the old region id - it should not be deleted + let maybe_region = + datastore.get_region_optional(region.id()).await.unwrap(); + + eprintln!("old_region_id: {:?}", &maybe_region); + assert!(maybe_region.is_some()); + + // But the new region id will be! + let maybe_region = datastore + .get_region_optional(region_replacements[0].new_region_id.unwrap()) + .await + .unwrap(); + + eprintln!("new region id: {:?}", &maybe_region); + assert!(maybe_region.is_none()); + + // The region_replacement drive task should invoke the drive saga now, which + // will skip over all notification steps and transition the request to + // ReplacementDone + + let last_background_task = + activate_background_task(&internal_client, "region_replacement_driver") + .await; + + assert!(match last_background_task.last { + LastResult::Completed(last_result_completed) => { + match serde_json::from_value::( + last_result_completed.details, + ) { + Err(e) => { + eprintln!("{e}"); + false + } + + Ok(v) => !v.drive_invoked_ok.is_empty(), + } + } + + _ => { + false + } + }); + + // wait for the drive saga to complete here + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let replacement_request_id = region_replacements[0].id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionReplacementState::ReplacementDone { + // The saga transitioned the request ok + Ok(()) + } else if state == RegionReplacementState::Driving { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else if state == RegionReplacementState::Completing { + // The saga transitioned the request ok, and it's now being + // finished by the region replacement finish saga + Ok(()) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // After the region snapshot replacement process runs to completion, there + // should be no more crucible resources left. Run the "region snapshot + // replacement step" background task until there's nothing left, then the + // "region snapshot replacement finish", then make sure there are no + // crucible resources left. 
+ + let mut count = 0; + loop { + let actions_taken = + run_region_snapshot_replacement_step(&internal_client).await; + + if actions_taken == 0 { + break; + } + + count += 1; + + if count > 20 { + assert!(false); + } + } + + let _ = activate_background_task( + &internal_client, + "region_snapshot_replacement_finish", + ) + .await; + + // Ensure the region snapshot replacement request went to Complete + + wait_for_condition( + || { + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let request_id = region_snapshot_replacements[0].id; + + async move { + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = region_snapshot_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + // Any other state is not expected + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + // Delete the snapshot + + let snapshot_url = get_snapshot_url("snapshot"); + NexusRequest::object_delete(client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // and now there should be no higher level resources left + + let disks_url = get_disks_url(); + assert_eq!( + collection_list::(&client, &disks_url).await.len(), + 0 + ); + + let snapshots_url = get_snapshots_url(); + assert_eq!( + collection_list::(&client, &snapshots_url).await.len(), + 0 + ); + + // Make sure that all the background tasks can run to completion. + + run_replacement_tasks_to_completion(&internal_client).await; + + // The disk volume should be deleted by the snapshot delete: wait until this + // happens + + wait_for_condition( + || { + let datastore = datastore.clone(); + let volume_id = db_disk.volume_id; + + async move { + let volume = datastore.volume_get(volume_id).await.unwrap(); + if volume.is_none() { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("disk volume deleted"); + + // There should be no more crucible resources left. Don't just check for + // `crucible_resources_deleted` here! We have set one of the physical disk + // policies to expunged, so Nexus will not attempt to clean up any resources + // on that physical disk. + + disk_test.remove_zpool(db_zpool.id()).await; + + // Now, assert that all crucible resources are cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} + +mod region_snapshot_replacement { + use super::*; + + #[derive(Debug)] + struct ExpectedEndState(pub RegionSnapshotReplacementState); + + #[derive(Debug)] + struct ExpectedIntermediateState(pub RegionSnapshotReplacementState); + + #[derive(Debug)] + struct ExpectedStartState(pub RegionSnapshotReplacementState); + + pub(super) struct DeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, + snapshot_socket_addr: SocketAddr, + } + + impl<'a> DeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. 
This is required for + // region and region snapshot replacement to have somewhere to move + // the data. + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk, a snapshot of that disk, and a disk from that + // snapshot + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot") + .await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Manually create the region snapshot replacement request for the + // first allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = datastore + .get_allocated_regions(db_disk.volume_id) + .await + .unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let region_snapshot = datastore + .region_snapshot_get( + region.dataset_id(), + region.id(), + snapshot.identity.id, + ) + .await + .expect("found region snapshot without error") + .unwrap(); + + let replacement_request_id = datastore + .create_region_snapshot_replacement_request( + &opctx, + ®ion_snapshot, + ) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_snapshot_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Requested, + ); + + // Assert two volumes reference the snapshot addr + + let snapshot_socket_addr = + region_snapshot.snapshot_addr.parse().unwrap(); + + let volumes = datastore + .find_volumes_referencing_socket_addr( + &opctx, + snapshot_socket_addr, + ) + .await + .unwrap(); + + assert_eq!(volumes.len(), 2); + + // Validate that they are snapshot and disk from snapshot + + let volumes_set: HashSet = + volumes.into_iter().map(|v| v.id()).collect(); + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + assert!(volumes_set.contains(&db_snapshot.volume_id)); + assert!(volumes_set.contains(&db_disk_from_snapshot.volume_id)); + + DeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), + replacement_request_id, + snapshot_socket_addr, + } + } + + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } + + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } + + pub async fn delete_the_snapshot(&self) { + let snapshot_url = get_snapshot_url("snapshot"); + 
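The `find_volumes_referencing_socket_addr` assertions in `new` above hinge on the fact that a volume's construction request names the downstairs targets it uses, so "which volumes still reference this region snapshot's address" can be answered by walking construction requests and collecting targets. The sketch below shows such a walk over a simplified construction request; it is not the real `VolumeConstructionRequest` type or the datastore query.

```rust
// Simplified construction request: just enough structure to show the
// recursive target walk that an address-reference lookup relies on.
enum Vcr {
    Volume { sub_volumes: Vec<Vcr>, read_only_parent: Option<Box<Vcr>> },
    Region { targets: Vec<String> },
}

fn collect_targets(vcr: &Vcr, out: &mut Vec<String>) {
    match vcr {
        Vcr::Volume { sub_volumes, read_only_parent } => {
            for sub in sub_volumes {
                collect_targets(sub, out);
            }
            if let Some(parent) = read_only_parent {
                collect_targets(parent, out);
            }
        }
        Vcr::Region { targets } => out.extend(targets.iter().cloned()),
    }
}

fn references_addr(vcr: &Vcr, addr: &str) -> bool {
    let mut targets = Vec::new();
    collect_targets(vcr, &mut targets);
    targets.iter().any(|t| t.as_str() == addr)
}

fn main() {
    let snapshot_addr = "[fd00:1122:3344::1]:19001".to_string();
    let vcr = Vcr::Volume {
        sub_volumes: vec![Vcr::Region {
            targets: vec![snapshot_addr.clone()],
        }],
        read_only_parent: None,
    };
    assert!(references_addr(&vcr, &snapshot_addr));
}
```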
NexusRequest::object_delete(&self.client, &snapshot_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + } + + pub async fn delete_the_disk_from_snapshot(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + } + + /// Make sure: + /// + /// - all region snapshot replacement related background tasks run to + /// completion + /// - this harness' region snapshot replacement request has transitioned + /// to Complete + /// - there are no more volumes that reference the request's region + /// snapshot + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. + + run_replacement_tasks_to_completion(&self.internal_client).await; + + // Assert the request is in state Complete + + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + // The saga transitioned the request ok + Ok(()) + } else { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(10), + ) + .await + .expect("request transitioned to expected state"); + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + RegionSnapshotReplacementState::Complete, + ); + + // Assert no volumes are referencing the snapshot address + + let volumes = self + .datastore + .find_volumes_referencing_socket_addr( + &self.opctx(), + self.snapshot_socket_addr, + ) + .await + .unwrap(); + + if !volumes.is_empty() { + eprintln!("{:?}", volumes); + } + + assert!(volumes.is_empty()); + } + + /// Assert no Crucible resources are leaked + pub async fn assert_no_crucible_resources_leaked(&self) { + assert!(self.disk_test.crucible_resources_deleted().await); + } + + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + expected_start_state: ExpectedStartState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else if state == expected_start_state.0 { + // The saga hasn't started yet + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request 
transitioned to expected state"); + + // Assert the request state + + let region_snapshot_replacement = self + .datastore + .get_region_snapshot_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_snapshot_replacement.replacement_state, + expected_end_state.0, + ); + } + + /// Run the "region snapshot replacement" task to transition the request + /// to ReplacementDone. + pub async fn transition_request_to_replacement_done(&self) { + // Activate the "region snapshot replacement start" background task + + run_region_snapshot_replacement_start(&self.internal_client).await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. + + self.wait_for_request_state( + ExpectedEndState( + RegionSnapshotReplacementState::ReplacementDone, + ), + ExpectedIntermediateState( + RegionSnapshotReplacementState::Allocating, + ), + ExpectedStartState(RegionSnapshotReplacementState::Requested), + ) + .await; + } + + /// Run the "region snapshot replacement garbage collection" task to + /// transition the request to Running. + pub async fn transition_request_to_running(&self) { + // Activate the "region snapshot replacement garbage collection" + // background task + + run_region_snapshot_replacement_garbage_collection( + &self.internal_client, + ) + .await; + + // The activation above could only have started the associated saga, + // so wait until the request is in state Running. + + self.wait_for_request_state( + ExpectedEndState(RegionSnapshotReplacementState::Running), + ExpectedIntermediateState( + RegionSnapshotReplacementState::DeletingOldVolume, + ), + ExpectedStartState( + RegionSnapshotReplacementState::ReplacementDone, + ), + ) + .await; + } + + /// Manually create a region snapshot replacement step for the disk + /// created from the snapshot + pub async fn create_manual_region_snapshot_replacement_step(&self) { + let disk_url = get_disk_url("disk-from-snapshot"); + + let disk_from_snapshot: external::Disk = + NexusRequest::object_get(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body() + .unwrap(); + + let (.., db_disk_from_snapshot) = + LookupPath::new(&self.opctx(), &self.datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap(); + + let result = self + .datastore + .create_region_snapshot_replacement_step( + &self.opctx(), + self.replacement_request_id, + db_disk_from_snapshot.volume_id, + ) + .await + .unwrap(); + + match result { + InsertStepResult::Inserted { .. } => {} + + _ => { + assert!(false, "bad result from create_region_snapshot_replacement_step"); + } + } + } + } +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted and still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: delete + // the snapshot, then finish the test. 
+ + test_harness.delete_the_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have its snapshot deleted, and the snapshot's source disk can be deleted, +/// and the request will still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_2( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - the only thing that will remain is the disk-from-snap that was created + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Requested" can +/// have everything be deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_requested_3( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - delete the snapshot + // - delete the snapshot's source disk + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "ReplacementDone" +/// can have its snapshot deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_replacement_done( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement request in state "Running" +/// can have its snapshot deleted, and the request will still transition to +/// Complete +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_state_running( + cptestctx: 
&ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - delete the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.delete_the_snapshot().await; + test_harness.delete_the_disk().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk_from_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} + +/// Assert that a region snapshot replacement step can have its associated +/// volume deleted and still transition to VolumeDeleted +#[nexus_test] +async fn test_delete_volume_region_snapshot_replacement_step( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = + region_snapshot_replacement::DeletedVolumeTest::new(cptestctx).await; + + // The request leaves the above `new` function in state Requested: + // - transition the request to "ReplacementDone" + // - transition the request to "Running" + // - manually create a region snapshot replacement step for the disk created + // from the snapshot + // - delete the disk created from the snapshot + // - finally, call finish_test + + test_harness.transition_request_to_replacement_done().await; + test_harness.transition_request_to_running().await; + + test_harness.create_manual_region_snapshot_replacement_step().await; + test_harness.delete_the_disk_from_snapshot().await; + + test_harness.finish_test().await; + + // Delete all the non-deleted resources + + test_harness.delete_the_disk().await; + test_harness.delete_the_snapshot().await; + + // Assert there are no more Crucible resources + + test_harness.assert_no_crucible_resources_leaked().await; +} diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 6a9ce28389..f0eb294e58 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -3764,12 +3764,13 @@ impl TestReadOnlyRegionReferenceUsage { // read-only regions should never be returned by find_deleted_volume_regions pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { - let deleted_volume_regions = + let freed_crucible_resources = self.datastore.find_deleted_volume_regions().await.unwrap(); - assert!(!deleted_volume_regions + assert!(!freed_crucible_resources + .datasets_and_regions .into_iter() - .any(|(_, r, _)| r.id() == self.region.id())); + .any(|(_, r)| r.id() == self.region.id())); } pub async fn create_first_volume_region_in_rop(&self) { diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index cf3d652587..bee2f56c34 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -37,6 +37,7 @@ pub struct LookupRegionPortStatus { pub struct RegionSnapshotReplacementStartStatus { pub requests_created_ok: Vec, pub start_invoked_ok: Vec, + pub requests_completed_ok: Vec, pub errors: Vec, } @@ -55,13 +56,14 @@ pub struct RegionSnapshotReplacementStepStatus { pub step_records_created_ok: Vec, pub step_garbage_collect_invoked_ok: Vec, pub step_invoked_ok: Vec, + pub 
step_set_volume_deleted_ok: Vec, pub errors: Vec, } /// The status of a `region_snapshot_replacement_finish` background task activation #[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] pub struct RegionSnapshotReplacementFinishStatus { - pub records_set_to_done: Vec, + pub finish_invoked_ok: Vec, pub errors: Vec, } diff --git a/schema/crdb/add-completing-and-new-region-volume/up01.sql b/schema/crdb/add-completing-and-new-region-volume/up01.sql new file mode 100644 index 0000000000..6a973eb3c3 --- /dev/null +++ b/schema/crdb/add-completing-and-new-region-volume/up01.sql @@ -0,0 +1 @@ +ALTER TYPE omicron.public.region_snapshot_replacement_state ADD VALUE IF NOT EXISTS 'completing' AFTER 'complete'; diff --git a/schema/crdb/add-completing-and-new-region-volume/up02.sql b/schema/crdb/add-completing-and-new-region-volume/up02.sql new file mode 100644 index 0000000000..42c0028ff5 --- /dev/null +++ b/schema/crdb/add-completing-and-new-region-volume/up02.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot_replacement ADD COLUMN IF NOT EXISTS new_region_volume_id UUID; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 75b7dbaf08..67b8ab0041 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4427,7 +4427,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.region_snapshot_replacement_state AS EN 'replacement_done', 'deleting_old_volume', 'running', - 'complete' + 'complete', + 'completing' ); CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( @@ -4445,7 +4446,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot_replacement ( replacement_state omicron.public.region_snapshot_replacement_state NOT NULL, - operating_saga_id UUID + operating_saga_id UUID, + + new_region_volume_id UUID ); CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_by_state on omicron.public.region_snapshot_replacement (replacement_state); @@ -4694,7 +4697,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '116.0.0', NULL) + (TRUE, NOW(), NOW(), '117.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 5cf0c3ee96b35c3151f33ea1707dba2539260ed4 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 19 Dec 2024 13:35:24 -0500 Subject: [PATCH 8/9] Fix the build break, and expand the comment (#7284) --- nexus/tests/integration_tests/crucible_replacements.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index c301372d5f..e84c8a0614 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -689,12 +689,14 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume( assert_eq!(expunged_region_snapshot.snapshot_id, snapshot.identity.id); } - // Either one or two regions can be returned, depending on if the snapshot - // destination volume was allocated on to the physical disk that was - // expunged. + // Either one or two read/write regions will be returned: + // + // - one for the disk, and + // - one for the snapshot destination volume, depending on if it was + // allocated on to the physical disk that was expunged. 
let expunged_regions = datastore - .find_regions_on_expunged_physical_disks(&opctx) + .find_read_write_regions_on_expunged_physical_disks(&opctx) .await .unwrap(); From c5dc8963709268a93ce03ed509b61b762f70b0df Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Thu, 19 Dec 2024 11:06:21 -0800 Subject: [PATCH 9/9] Relax ordering constraints in oximeter counter tests (#7262) - Allow server and collection-task-side counters to differ by at most one in tests checking that we track requests in the task. - Fixes #7255 --- oximeter/collector/src/agent.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index e924cb2ee3..7e28831fa0 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -745,9 +745,9 @@ mod tests { let count = stats.collections.datum.value() as usize; assert!(count != 0); - assert_eq!( - count, - collection_count.load(Ordering::SeqCst), + let server_count = collection_count.load(Ordering::SeqCst); + assert!( + count == server_count || count - 1 == server_count, "number of collections reported by the collection \ task differs from the number reported by the empty \ producer server itself" @@ -892,9 +892,16 @@ mod tests { assert_eq!(stats.collections.datum.value(), 0); assert!(count != 0); - assert_eq!( - count, - collection_count.load(Ordering::SeqCst), + + // The server may have handled a request that we've not yet recorded on + // our collection task side, so we allow the server count to be greater + // than our own. But since the collection task is single-threaded, it + // cannot ever be more than _one_ greater than our count, since we + // should increment that counter before making another request to the + // server. + let server_count = collection_count.load(Ordering::SeqCst); + assert!( + count == server_count || count - 1 == server_count, "number of collections reported by the collection \ task differs from the number reported by the always-ded \ producer server itself"
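
The relaxed check in the oximeter tests above reduces to a single relation: the two counters are equal, or the value the tests call `count` is exactly one ahead of `server_count`. Below is a minimal standalone sketch of that relation; the helper name `counts_agree_within_one` and the `main` driver are illustrative only and are not part of the patch.

    /// Mirrors the asserted relationship from the tests above: the two
    /// counters match, or `count` is exactly one ahead of `server_count`.
    /// `checked_sub` keeps the comparison well-defined when `count` is zero.
    fn counts_agree_within_one(count: usize, server_count: usize) -> bool {
        count == server_count || count.checked_sub(1) == Some(server_count)
    }

    fn main() {
        // Both sides have recorded the same number of collections.
        assert!(counts_agree_within_one(3, 3));

        // `count` leads by one, e.g. a request was counted on one side
        // before the other side recorded it.
        assert!(counts_agree_within_one(4, 3));

        // A gap of two, or a lead in the other direction, still fails.
        assert!(!counts_agree_within_one(5, 3));
        assert!(!counts_agree_within_one(3, 4));
    }

Note that `checked_sub` makes the sketch total for `count == 0`, whereas the raw `count - 1` in the tests is guarded by the earlier `assert!(count != 0)`.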