From 383d87c09bc60bbe6e73c825cdff776b6f9dbd34 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 1 Oct 2024 01:46:24 +0000 Subject: [PATCH] wait for associated sagas to transition the requests github runners are slow, but this revealed some race conditions with the replacement tests also refactor tests to use common harness --- .../crucible_replacements.rs | 718 ++++++++---------- 1 file changed, 312 insertions(+), 406 deletions(-) diff --git a/nexus/tests/integration_tests/crucible_replacements.rs b/nexus/tests/integration_tests/crucible_replacements.rs index 16ce892528..2f5317e249 100644 --- a/nexus/tests/integration_tests/crucible_replacements.rs +++ b/nexus/tests/integration_tests/crucible_replacements.rs @@ -9,6 +9,7 @@ use nexus_db_model::PhysicalDiskPolicy; use nexus_db_model::RegionReplacementState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::lookup::LookupPath; +use nexus_db_queries::db::DataStore; use nexus_test_utils::background::*; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; @@ -16,11 +17,18 @@ use nexus_test_utils::resource_helpers::create_default_ip_pool; use nexus_test_utils::resource_helpers::create_disk; use nexus_test_utils::resource_helpers::create_project; use nexus_test_utils_macros::nexus_test; +use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::GenericUuid; +use slog::Logger; +use std::sync::Arc; use uuid::Uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; + +type DiskTest<'a> = + nexus_test_utils::resource_helpers::DiskTest<'a, omicron_nexus::Server>; + type DiskTestBuilder<'a> = nexus_test_utils::resource_helpers::DiskTestBuilder< 'a, omicron_nexus::Server, @@ -110,92 +118,297 @@ async fn test_region_replacement_does_not_create_freed_region( assert!(datastore.find_deleted_volume_regions().await.unwrap().is_empty()); } -/// Assert that a region replacement request in state "Requested" can have its -/// volume deleted and still transition to Complete -#[nexus_test] -async fn test_delete_volume_region_replacement_state_requested( - cptestctx: &ControlPlaneTestContext, -) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - - // Create four zpools, each with one dataset. This is required for region - // and region snapshot replacement to have somewhere to move the data. 
- let sled_id = cptestctx.first_sled(); - let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(sled_id) - .with_zpool_count(4) - .build() - .await; - - // Create a disk - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, "disk").await; - - // Manually create the region replacement request +struct RegionReplacementDeletedVolumeTest<'a> { + log: Logger, + datastore: Arc, + disk_test: DiskTest<'a>, + client: ClientTestContext, + internal_client: ClientTestContext, + replacement_request_id: Uuid, +} - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() +#[derive(Debug)] +struct ExpectedEndState(pub RegionReplacementState); + +#[derive(Debug)] +struct ExpectedIntermediateState(pub RegionReplacementState); + +impl<'a> RegionReplacementDeletedVolumeTest<'a> { + pub async fn new(cptestctx: &'a ControlPlaneTestContext) -> Self { + let nexus = &cptestctx.server.server_context().nexus; + + // Create four zpools, each with one dataset. This is required for + // region and region snapshot replacement to have somewhere to move the + // data. + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let datastore = nexus.datastore().clone(); + + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + // Create a disk + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + // Manually create the region replacement request for the first + // allocated region of that disk + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap(); + + assert_eq!(db_disk.id(), disk.identity.id); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + let (_, region) = &disk_allocated_regions[0]; + + let replacement_request_id = datastore + .create_region_replacement_request_for_region(&opctx, ®ion) + .await + .unwrap(); + + // Assert the request is in state Requested + + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Requested, + ); + + RegionReplacementDeletedVolumeTest { + log: cptestctx.logctx.log.new(o!()), + datastore, + disk_test, + client: client.clone(), + internal_client: internal_client.clone(), + replacement_request_id, + } + } + + pub fn opctx(&self) -> OpContext { + OpContext::for_tests(self.log.clone(), self.datastore.clone()) + } + + pub async fn delete_the_disk(&self) { + let disk_url = get_disk_url("disk"); + NexusRequest::object_delete(&self.client, &disk_url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + } + + /// Make sure: + /// + /// - all region replacement related background tasks run to completion + /// - this harness' region replacement request has transitioned to Complete + /// - no Crucible resources are leaked + pub async fn finish_test(&self) { + // Make sure that all the background tasks can run to completion. 
+ + run_replacement_tasks_to_completion(&self.internal_client).await; + + // Assert the request is in state Complete + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + assert_eq!( + region_replacement.replacement_state, + RegionReplacementState::Complete, + ); + + // Assert there are no more Crucible resources + + assert!(self.disk_test.crucible_resources_deleted().await); + } + + async fn wait_for_request_state( + &self, + expected_end_state: ExpectedEndState, + expected_intermediate_state: ExpectedIntermediateState, + ) { + wait_for_condition( + || { + let datastore = self.datastore.clone(); + let opctx = self.opctx(); + let replacement_request_id = self.replacement_request_id; + + async move { + let region_replacement = datastore + .get_region_replacement_request_by_id( + &opctx, + replacement_request_id, + ) + .await + .unwrap(); + + let state = region_replacement.replacement_state; + + if state == expected_end_state.0 { + // The saga transitioned the request ok + Ok(()) + } else if state == expected_intermediate_state.0 { + // The saga is still running + Err(CondCheckError::<()>::NotYet) + } else { + // Any other state is not expected + panic!("unexpected state {state:?}!"); + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) .await - .unwrap(); + .expect("request transitioned to expected state"); - assert_eq!(db_disk.id(), disk.identity.id); + // Assert the request state - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); + assert_eq!(region_replacement.replacement_state, expected_end_state.0); + } - // Assert the request is in state Requested + /// Run the "region replacement" task to transition the request to Running. + pub async fn transition_request_to_running(&self) { + // Activate the "region replacement" background task - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); + run_region_replacement(&self.internal_client).await; - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); + // The activation above could only have started the associated saga, so + // wait until the request is in state Running. - // Delete the disk + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Allocating), + ) + .await; + } - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); + /// Call the region replacement drive task to attach the associated volume + /// to the simulated pantry, ostensibly for reconciliation + pub async fn attach_request_volume_to_pantry(&self) { + // Run the "region replacement driver" task to attach the associated + // volume to the simulated pantry. - // Make sure that all the background tasks can run to completion. 
+ run_region_replacement_driver(&self.internal_client).await; - run_replacement_tasks_to_completion(&internal_client).await; + // The activation above could only have started the associated saga, so + // wait until the request is in the expected end state. - // Assert the request is in state Complete + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::Running), + ExpectedIntermediateState(RegionReplacementState::Driving), + ) + .await; - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); + // Additionally, assert that the drive saga recorded that it sent the + // attachment request to the simulated pantry + + let most_recent_step = self + .datastore + .current_region_replacement_request_step( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap() + .unwrap(); + + assert!(most_recent_step.pantry_address().is_some()); + } + + /// Manually activate the background attachment for the request volume + pub async fn manually_activate_attached_volume( + &self, + cptestctx: &'a ControlPlaneTestContext, + ) { + let pantry = + cptestctx.sled_agent.pantry_server.as_ref().unwrap().pantry.clone(); + + let region_replacement = self + .datastore + .get_region_replacement_request_by_id( + &self.opctx(), + self.replacement_request_id, + ) + .await + .unwrap(); + + pantry + .activate_background_attachment( + region_replacement.volume_id.to_string(), + ) + .await + .unwrap(); + } + + /// Transition request to ReplacementDone via the region replacement drive + /// saga + pub async fn transition_request_to_replacement_done(&self) { + // Run the "region replacement driver" task + + run_region_replacement_driver(&self.internal_client).await; + + // The activation above could only have started the associated saga, so + // wait until the request is in the expected end state. + + self.wait_for_request_state( + ExpectedEndState(RegionReplacementState::ReplacementDone), + ExpectedIntermediateState(RegionReplacementState::Driving), + ) + .await; + } +} - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); +/// Assert that a region replacement request in state "Requested" can have its +/// volume deleted and still transition to Complete +#[nexus_test] +async fn test_delete_volume_region_replacement_state_requested( + cptestctx: &ControlPlaneTestContext, +) { + let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; + + // The request leaves the `new` function in state Requested: delete the + // disk, then finish the test. - // Assert there are no more Crucible resources + test_harness.delete_the_disk().await; - assert!(disk_test.crucible_resources_deleted().await); + test_harness.finish_test().await; } /// Assert that a region replacement request in state "Running" can have its @@ -204,100 +417,17 @@ async fn test_delete_volume_region_replacement_state_requested( async fn test_delete_volume_region_replacement_state_running( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - - // Create four zpools, each with one dataset. This is required for region - // and region snapshot replacement to have somewhere to move the data. 
- let sled_id = cptestctx.first_sled(); - let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(sled_id) - .with_zpool_count(4) - .build() - .await; - - // Create a disk - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, "disk").await; - - // Manually create the region replacement request - - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() - .await - .unwrap(); - - assert_eq!(db_disk.id(), disk.identity.id); - - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; - - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); - - // Assert the request is in state Requested - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); - - // Run the "region replacement" task to transition the request to Running. - - run_region_replacement(&internal_client).await; + let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Running, - ); - - // Delete the disk + // The request leaves the `new` function in state Requested: + // - transition the request to "Running" + // - delete the disk, then finish the test. - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); - - // Make sure that all the background tasks can run to completion. + test_harness.transition_request_to_running().await; - run_replacement_tasks_to_completion(&internal_client).await; + test_harness.delete_the_disk().await; - // Assert the request is in state Complete - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); - - // Assert there are no more Crucible resources - - assert!(disk_test.crucible_resources_deleted().await); + test_harness.finish_test().await; } /// Assert that a region replacement request in state "Running" that has @@ -307,123 +437,20 @@ async fn test_delete_volume_region_replacement_state_running( async fn test_delete_volume_region_replacement_state_running_on_pantry( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - - // Create four zpools, each with one dataset. This is required for region - // and region snapshot replacement to have somewhere to move the data. 
- let sled_id = cptestctx.first_sled(); - let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(sled_id) - .with_zpool_count(4) - .build() - .await; - - // Create a disk - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, "disk").await; - - // Manually create the region replacement request - - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() - .await - .unwrap(); - - assert_eq!(db_disk.id(), disk.identity.id); - - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; - - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); - - // Assert the request is in state Requested - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); - - // Run the "region replacement" task to transition the request to Running. - - run_region_replacement(&internal_client).await; + let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Running, - ); - - // Run the "region replacement driver" task to attach the associated volume - // to the simulated pantry - - run_region_replacement_driver(&internal_client).await; - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Running, - ); - - let most_recent_step = datastore - .current_region_replacement_request_step(&opctx, region_replacement.id) - .await - .unwrap() - .unwrap(); - - assert!(most_recent_step.pantry_address().is_some()); - - // Delete the disk - - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); + // The request leaves the `new` function in state Requested: + // - transition the request to "Running" + // - call the drive task to attach the volume to the simulated pantry + // - delete the disk, then finish the test. - // Make sure that all the background tasks can run to completion. 
+ test_harness.transition_request_to_running().await; - run_replacement_tasks_to_completion(&internal_client).await; + test_harness.attach_request_volume_to_pantry().await; - // Assert the request is in state Complete + test_harness.delete_the_disk().await; - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); - - // Assert there are no more Crucible resources - - assert!(disk_test.crucible_resources_deleted().await); + test_harness.finish_test().await; } /// Assert that a region replacement request in state "ReplacementDone" can have @@ -432,146 +459,25 @@ async fn test_delete_volume_region_replacement_state_running_on_pantry( async fn test_delete_volume_region_replacement_state_replacement_done( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - let client = &cptestctx.external_client; - let internal_client = &cptestctx.internal_client; - - // Create four zpools, each with one dataset. This is required for region - // and region snapshot replacement to have somewhere to move the data. - let sled_id = cptestctx.first_sled(); - let disk_test = DiskTestBuilder::new(&cptestctx) - .on_specific_sled(sled_id) - .with_zpool_count(4) - .build() - .await; - - // Create a disk - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, "disk").await; - - // Manually create the region replacement request - - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk.identity.id) - .fetch() - .await - .unwrap(); - - assert_eq!(db_disk.id(), disk.identity.id); - - let disk_allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - let (_, region) = &disk_allocated_regions[0]; - - let replacement_request_id = datastore - .create_region_replacement_request_for_region(&opctx, ®ion) - .await - .unwrap(); - - // Assert the request is in state Requested - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Requested, - ); - - // Run the "region replacement" task to transition the request to Running. + let test_harness = RegionReplacementDeletedVolumeTest::new(cptestctx).await; - run_region_replacement(&internal_client).await; + // The request leaves the `new` function in state Requested: + // - transition the request to "Running" + // - call the drive task to attach the volume to the simulated pantry + // - simulate that the volume activated ok + // - call the drive task again, which will observe that activation and + // transition the request to "ReplacementDone" + // - delete the disk, then finish the test. 
- let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Running, - ); - - // Run the "region replacement driver" task to attach the associated volume - // to the simulated pantry - - run_region_replacement_driver(&internal_client).await; - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); + test_harness.transition_request_to_running().await; - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Running, - ); + test_harness.attach_request_volume_to_pantry().await; - let most_recent_step = datastore - .current_region_replacement_request_step(&opctx, region_replacement.id) - .await - .unwrap() - .unwrap(); - - assert!(most_recent_step.pantry_address().is_some()); - - // Manually activate the background attachment - - let pantry = &cptestctx.sled_agent.pantry_server.as_ref().unwrap().pantry; - pantry - .activate_background_attachment( - region_replacement.volume_id.to_string(), - ) - .await - .unwrap(); - - // Run the "region replacement driver" task again, this time it should - // transition the request to ReplacementDone - - run_region_replacement_driver(&internal_client).await; - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); - - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::ReplacementDone, - ); - - // Delete the disk - - let disk_url = get_disk_url("disk"); - NexusRequest::object_delete(client, &disk_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("failed to delete disk"); - - // Make sure that all the background tasks can run to completion. - - run_replacement_tasks_to_completion(&internal_client).await; - - // Assert the request is in state Complete - - let region_replacement = datastore - .get_region_replacement_request_by_id(&opctx, replacement_request_id) - .await - .unwrap(); + test_harness.manually_activate_attached_volume(&cptestctx).await; - assert_eq!( - region_replacement.replacement_state, - RegionReplacementState::Complete, - ); + test_harness.transition_request_to_replacement_done().await; - // Assert there are no more Crucible resources + test_harness.delete_the_disk().await; - assert!(disk_test.crucible_resources_deleted().await); + test_harness.finish_test().await; }
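
Aside (illustrative, not part of the patch): the core of the race fix is that activating a background task only kicks off the associated saga, so the refactored tests poll the request state with wait_for_condition instead of asserting it immediately after activation. Below is a minimal sketch of that polling idiom in isolation. The generic wait_for_state helper and its fetch_state closure are hypothetical stand-ins for the harness's wait_for_request_state method and its get_region_replacement_request_by_id lookup; the poll interval and timeout mirror the values used in the patch.

    use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
    use std::fmt::Debug;
    use std::future::Future;
    use std::time::Duration;

    /// Poll until `fetch_state` reports `end`, treating `intermediate` as
    /// "the saga is still running" and panicking on anything else.
    async fn wait_for_state<S, F, Fut>(
        mut fetch_state: F,
        end: S,
        intermediate: S,
    ) where
        S: PartialEq + Debug + Copy,
        F: FnMut() -> Fut,
        Fut: Future<Output = S>,
    {
        wait_for_condition(
            || {
                // Build a fresh future for each poll attempt.
                let fut = fetch_state();

                async move {
                    let state = fut.await;

                    if state == end {
                        // The saga transitioned the request as expected.
                        Ok(())
                    } else if state == intermediate {
                        // The saga is still running; check again shortly.
                        Err(CondCheckError::<()>::NotYet)
                    } else {
                        // Any other state is a bug, so fail loudly rather
                        // than waiting for the timeout.
                        panic!("unexpected state {state:?}!");
                    }
                }
            },
            &Duration::from_millis(500),
            &Duration::from_secs(60),
        )
        .await
        .expect("request transitioned to expected state");
    }

Tolerating only the named intermediate state (rather than any state short of the goal) keeps the slow-runner race fix from masking genuine bugs: an unexpected transition panics immediately instead of burning the full 60-second timeout.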