From 51d475b820a89eaa746c26df33b1f9dfb2bd685e Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Tue, 12 Dec 2023 14:29:10 -0500
Subject: [PATCH 1/3] Rename disks when un-deleting and faulting

When un-deleting phantom disks and setting them to faulted, use a new
name that includes the disk's UUID: this ensures that even if a user
created another disk with the same name in the project, the phantom
disk can still be un-deleted and faulted, and eventually cleaned up.

Fixes #4673
---
 nexus/db-queries/src/db/datastore/disk.rs |  12 ++-
 nexus/tests/integration_tests/disks.rs    | 126 +++++++++++++++++++++-
 2 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs
index 94d950f86a..abbe09235d 100644
--- a/nexus/db-queries/src/db/datastore/disk.rs
+++ b/nexus/db-queries/src/db/datastore/disk.rs
@@ -657,7 +657,10 @@ impl DataStore {
     /// If the disk delete saga unwinds, then the disk should _not_ remain
     /// deleted: disk delete saga should be triggered again in order to fully
     /// complete, and the only way to do that is to un-delete the disk. Set it
-    /// to faulted to ensure that it won't be used.
+    /// to faulted to ensure that it won't be used. Use the disk's UUID as part
+    /// of its new name to ensure that even if a user created another disk that
+    /// shadows this "phantom" disk the original can still be un-deleted and
+    /// faulted.
     pub async fn project_undelete_disk_set_faulted_no_auth(
         &self,
         disk_id: &Uuid,
@@ -667,12 +670,19 @@ impl DataStore {

         let faulted = api::external::DiskState::Faulted.label();

+        // If only the UUID is used, you will hit "name cannot be a UUID to
+        // avoid ambiguity with IDs". Add a small prefix to avoid this, and use
+        // "deleted" to be unambiguous to the user about what they should do
+        // with this disk.
+        let new_name = format!("deleted-{disk_id}");
+
         let result = diesel::update(dsl::disk)
             .filter(dsl::time_deleted.is_not_null())
             .filter(dsl::id.eq(*disk_id))
             .set((
                 dsl::time_deleted.eq(None::<chrono::DateTime<chrono::Utc>>),
                 dsl::disk_state.eq(faulted),
+                dsl::name.eq(new_name),
             ))
             .check_if_exists::<Disk>(*disk_id)
             .execute_and_check(&conn)
diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs
index 807c054b64..a0c8241ca6 100644
--- a/nexus/tests/integration_tests/disks.rs
+++ b/nexus/tests/integration_tests/disks.rs
@@ -1245,7 +1245,7 @@ async fn test_disk_virtual_provisioning_collection(
 async fn test_disk_virtual_provisioning_collection_failed_delete(
     cptestctx: &ControlPlaneTestContext,
 ) {
-    // Confirm that there's a panic deleting a project if a disk deletion fails
+    // Confirm that there's no panic deleting a project if a disk deletion fails
     let client = &cptestctx.external_client;
     let nexus = &cptestctx.server.apictx().nexus;
     let datastore = nexus.datastore();
@@ -1373,6 +1373,130 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
         .expect("unexpected failure deleting project");
 }

+#[nexus_test]
+async fn test_phantom_disk_rename(cptestctx: &ControlPlaneTestContext) {
+    // Confirm that phantom disks are renamed when they are un-deleted and
+    // faulted
+
+    let client = &cptestctx.external_client;
+    let nexus = &cptestctx.server.apictx().nexus;
+    let datastore = nexus.datastore();
+
+    let _disk_test = DiskTest::new(&cptestctx).await;
+
+    populate_ip_pool(&client, "default", None).await;
+    let _project_id1 = create_project(client, PROJECT_NAME).await.identity.id;
+
+    // Create a 1 GB disk
+    let disk_size = ByteCount::from_gibibytes_u32(1);
+    let disks_url = get_disks_url();
+    let disk_one = params::DiskCreate {
+        identity: IdentityMetadataCreateParams {
+            name: "disk-one".parse().unwrap(),
+            description: String::from("sells rainsticks"),
+        },
+        disk_source: params::DiskSource::Blank {
+            block_size: params::BlockSize::try_from(512).unwrap(),
+        },
+        size: disk_size,
+    };
+
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::POST, &disks_url)
+            .body(Some(&disk_one))
+            .expect_status(Some(StatusCode::CREATED)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("unexpected failure creating 1 GiB disk");
+
+    let disk_url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);
+
+    // Confirm it's there
+    let disk = disk_get(&client, &disk_url).await;
+    assert_eq!(disk.state, DiskState::Detached);
+
+    let original_disk_id = disk.identity.id;
+
+    // Now, request disk delete
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::DELETE, &disk_url)
+            .expect_status(Some(StatusCode::NO_CONTENT)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("unexpected failure deleting 1 GiB disk");
+
+    // It's gone!
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::GET, &disk_url)
+            .expect_status(Some(StatusCode::NOT_FOUND)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("unexpected success finding 1 GiB disk");
+
+    // Create a new disk with the same name
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::POST, &disks_url)
+            .body(Some(&disk_one))
+            .expect_status(Some(StatusCode::CREATED)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("unexpected failure creating 1 GiB disk");
+
+    // Confirm it's there
+    let disk = disk_get(&client, &disk_url).await;
+    assert_eq!(disk.state, DiskState::Detached);
+
+    // Confirm it's not the same disk
+    let new_disk_id = disk.identity.id;
+    assert_ne!(original_disk_id, new_disk_id);
+
+    // Un-delete the original and set it to faulted
+    datastore
+        .project_undelete_disk_set_faulted_no_auth(&original_disk_id)
+        .await
+        .unwrap();
+
+    // The original disk is now faulted
+    let disk_url = format!(
+        "/v1/disks/deleted-{}?project={}",
+        original_disk_id, PROJECT_NAME
+    );
+    let disk = disk_get(&client, &disk_url).await;
+    assert_eq!(disk.state, DiskState::Faulted);
+
+    // Make sure original can still be deleted
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::DELETE, &disk_url)
+            .expect_status(Some(StatusCode::NO_CONTENT)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("unexpected failure deleting 1 GiB disk");
+
+    // Make sure new can be deleted too
+    let disk_url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);
+    let disk = disk_get(&client, &disk_url).await;
+    assert_eq!(disk.state, DiskState::Detached);
+
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::DELETE, &disk_url)
+            .expect_status(Some(StatusCode::NO_CONTENT)),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .expect("unexpected failure deleting 1 GiB disk");
+}
+
 // Test disk size accounting
 #[nexus_test]
 async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) {

From 33bca15ec97b26ee928d06a47ca51ff5f42fe945 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Thu, 14 Dec 2023 12:27:50 -0500
Subject: [PATCH 2/3] handle disk rename in test

---
 nexus/tests/integration_tests/disks.rs | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs
index a0c8241ca6..a7c9c99509 100644
--- a/nexus/tests/integration_tests/disks.rs
+++ b/nexus/tests/integration_tests/disks.rs
@@ -1271,6 +1271,7 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
         },
         size: disk_size,
     };
+
     NexusRequest::new(
         RequestBuilder::new(client, Method::POST, &disks_url)
             .body(Some(&disk_one))
@@ -1281,6 +1282,11 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
     .await
     .expect("unexpected failure creating 1 GiB disk");

+    // Get the disk
+    let disk_url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);
+    let disk = disk_get(&client, &disk_url).await;
+    assert_eq!(disk.state, DiskState::Detached);
+
     // Assert correct virtual provisioning collection numbers
     let virtual_provisioning_collection = datastore
         .virtual_provisioning_collection_get(&opctx, project_id1)
@@ -1302,8 +1308,6 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
         .await;

     // Delete the disk - expect this to fail
-    let disk_url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);
-
     NexusRequest::new(
         RequestBuilder::new(client, Method::DELETE, &disk_url)
             .expect_status(Some(StatusCode::INTERNAL_SERVER_ERROR)),
@@ -1323,7 +1327,12 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
             disk_size
         );

-    // And the disk is now faulted
+    // And the disk is now faulted. The name will have changed due to the
+    // "undelete and fault" function.
+    let disk_url = format!(
+        "/v1/disks/deleted-{}?project={}",
+        disk.identity.id, PROJECT_NAME
+    );
     let disk = disk_get(&client, &disk_url).await;
     assert_eq!(disk.state, DiskState::Faulted);

From 1ac073e8a023fa0ea4789513deb293960db08da0 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Thu, 14 Dec 2023 13:10:49 -0500
Subject: [PATCH 3/3] customers could technically cause disk delete unwinds to get stuck

---
 nexus/db-queries/src/db/datastore/disk.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/nexus/db-queries/src/db/datastore/disk.rs b/nexus/db-queries/src/db/datastore/disk.rs
index abbe09235d..2055287e62 100644
--- a/nexus/db-queries/src/db/datastore/disk.rs
+++ b/nexus/db-queries/src/db/datastore/disk.rs
@@ -661,6 +661,16 @@ impl DataStore {
     /// of its new name to ensure that even if a user created another disk that
     /// shadows this "phantom" disk the original can still be un-deleted and
     /// faulted.
+    ///
+    /// It's worth pointing out that it's possible that the user created a
+    /// disk, then used that disk's ID to make a new disk with the same name
+    /// as this function would have picked when undeleting the original disk.
+    /// In the event that the original disk's delete saga unwound, this would
+    /// cause that unwind to fail at this step, and would cause a stuck saga
+    /// that requires manual intervention. The fixes as part of addressing
+    /// issue 3866 should greatly reduce the number of disk delete sagas that
+    /// unwind, but this possibility does exist. To any customer reading this:
+    /// please don't name your disks `deleted-{another disk's id}` :)
     pub async fn project_undelete_disk_set_faulted_no_auth(
         &self,
         disk_id: &Uuid,
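
A note on the naming scheme used throughout the series, for anyone poking at it
outside of Nexus: the standalone sketch below is not part of the patches; it
assumes only the `uuid` crate (with its `v4` feature) and assumes the usual
63-character cap on Nexus resource names. It shows the shape of the
`deleted-{disk_id}` value that `project_undelete_disk_set_faulted_no_auth` now
writes, why it always fits in a name column, and why it is never a bare UUID.

    use uuid::Uuid;

    /// Build the name a phantom disk gets when it is un-deleted and faulted.
    /// Mirrors the `format!("deleted-{disk_id}")` call added in PATCH 1.
    fn phantom_disk_name(disk_id: &Uuid) -> String {
        format!("deleted-{disk_id}")
    }

    fn main() {
        let disk_id = Uuid::new_v4();
        let name = phantom_disk_name(&disk_id);

        // A hyphenated UUID renders as 36 characters, so the new name is
        // always 8 + 36 = 44 characters: under the assumed 63-character name
        // cap, and never a bare UUID, so it avoids the "name cannot be a
        // UUID" rejection mentioned in the comment added by PATCH 1.
        assert_eq!(name.len(), "deleted-".len() + 36);
        assert!(name.len() <= 63);
        println!("{name}");
    }

In the patches themselves the same string is handed to `dsl::name.eq(new_name)`,
so the rename happens in the same UPDATE that clears `time_deleted` and sets
the disk state to faulted.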