Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename disks when un-deleting and faulting #4681

Merged
merged 4 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion nexus/db-queries/src/db/datastore/disk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,20 @@ impl DataStore {
/// If the disk delete saga unwinds, then the disk should _not_ remain
/// deleted: disk delete saga should be triggered again in order to fully
/// complete, and the only way to do that is to un-delete the disk. Set it
/// to faulted to ensure that it won't be used.
/// to faulted to ensure that it won't be used. Use the disk's UUID as part
/// of its new name to ensure that even if a user created another disk that
/// shadows this "phantom" disk the original can still be un-deleted and
/// faulted.
Comment on lines +660 to +663
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth calling out - no matter how silly - that like, technically the user could be using this name for another too. Seems unlikely though!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're not wrong! added something in 1ac073e

///
/// It's worth pointing out that it's possible that the user created a disk,
/// then used that disk's ID to make a new disk with the same name as this
/// function would have picked when undeleting the original disk. In the
/// event that the original disk's delete saga unwound, this would cause
/// that unwind to fail at this step, and would cause a stuck saga that
/// requires manual intervention. The fixes as part of addressing issue 3866
/// should greatly reduce the number of disk delete sagas that unwind, but
/// this possibility does exist. To any customer reading this: please don't
/// name your disks `deleted-{another disk's id}` :)
pub async fn project_undelete_disk_set_faulted_no_auth(
&self,
disk_id: &Uuid,
Expand All @@ -667,12 +680,19 @@ impl DataStore {

let faulted = api::external::DiskState::Faulted.label();

// If only the UUID is used, you will hit "name cannot be a UUID to
// avoid ambiguity with IDs". Add a small prefix to avoid this, and use
        // "deleted" to be unambiguous to the user about what they should do
// with this disk.
let new_name = format!("deleted-{disk_id}");

let result = diesel::update(dsl::disk)
.filter(dsl::time_deleted.is_not_null())
.filter(dsl::id.eq(*disk_id))
.set((
dsl::time_deleted.eq(None::<DateTime<Utc>>),
dsl::disk_state.eq(faulted),
dsl::name.eq(new_name),
))
.check_if_exists::<Disk>(*disk_id)
.execute_and_check(&conn)
Expand Down
141 changes: 137 additions & 4 deletions nexus/tests/integration_tests/disks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1245,7 +1245,7 @@ async fn test_disk_virtual_provisioning_collection(
async fn test_disk_virtual_provisioning_collection_failed_delete(
cptestctx: &ControlPlaneTestContext,
) {
// Confirm that there's a panic deleting a project if a disk deletion fails
// Confirm that there's no panic deleting a project if a disk deletion fails
let client = &cptestctx.external_client;
let nexus = &cptestctx.server.apictx().nexus;
let datastore = nexus.datastore();
Expand All @@ -1271,6 +1271,7 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
},
size: disk_size,
};

NexusRequest::new(
RequestBuilder::new(client, Method::POST, &disks_url)
.body(Some(&disk_one))
Expand All @@ -1281,6 +1282,11 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
.await
.expect("unexpected failure creating 1 GiB disk");

// Get the disk
let disk_url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);
let disk = disk_get(&client, &disk_url).await;
assert_eq!(disk.state, DiskState::Detached);

// Assert correct virtual provisioning collection numbers
let virtual_provisioning_collection = datastore
.virtual_provisioning_collection_get(&opctx, project_id1)
Expand All @@ -1302,8 +1308,6 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
.await;

// Delete the disk - expect this to fail
let disk_url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);

NexusRequest::new(
RequestBuilder::new(client, Method::DELETE, &disk_url)
.expect_status(Some(StatusCode::INTERNAL_SERVER_ERROR)),
Expand All @@ -1323,7 +1327,12 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
disk_size
);

// And the disk is now faulted
// And the disk is now faulted. The name will have changed due to the
// "undelete and fault" function.
let disk_url = format!(
"/v1/disks/deleted-{}?project={}",
disk.identity.id, PROJECT_NAME
);
let disk = disk_get(&client, &disk_url).await;
assert_eq!(disk.state, DiskState::Faulted);

Expand Down Expand Up @@ -1373,6 +1382,130 @@ async fn test_disk_virtual_provisioning_collection_failed_delete(
.expect("unexpected failure deleting project");
}

#[nexus_test]
async fn test_phantom_disk_rename(cptestctx: &ControlPlaneTestContext) {
    // Exercise the "phantom disk" path: when a deleted disk is un-deleted
    // and marked faulted, it must reappear under a new, ID-derived name so
    // it cannot collide with a later disk that reused the original name.

    let client = &cptestctx.external_client;
    let nexus = &cptestctx.server.apictx().nexus;
    let datastore = nexus.datastore();

    let _disk_test = DiskTest::new(&cptestctx).await;

    populate_ip_pool(&client, "default", None).await;
    let _project_id1 = create_project(client, PROJECT_NAME).await.identity.id;

    // Parameters for a blank 1 GiB disk.
    let disk_size = ByteCount::from_gibibytes_u32(1);
    let collection_url = get_disks_url();
    let disk_params = params::DiskCreate {
        identity: IdentityMetadataCreateParams {
            name: "disk-one".parse().unwrap(),
            description: String::from("sells rainsticks"),
        },
        disk_source: params::DiskSource::Blank {
            block_size: params::BlockSize::try_from(512).unwrap(),
        },
        size: disk_size,
    };

    // Create the disk.
    NexusRequest::new(
        RequestBuilder::new(client, Method::POST, &collection_url)
            .body(Some(&disk_params))
            .expect_status(Some(StatusCode::CREATED)),
    )
    .authn_as(AuthnMode::PrivilegedUser)
    .execute()
    .await
    .expect("unexpected failure creating 1 GiB disk");

    let url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);

    // It should exist in the detached state; remember its ID.
    let fetched = disk_get(&client, &url).await;
    assert_eq!(fetched.state, DiskState::Detached);

    let original_disk_id = fetched.identity.id;

    // Delete the disk.
    NexusRequest::new(
        RequestBuilder::new(client, Method::DELETE, &url)
            .expect_status(Some(StatusCode::NO_CONTENT)),
    )
    .authn_as(AuthnMode::PrivilegedUser)
    .execute()
    .await
    .expect("unexpected failure deleting 1 GiB disk");

    // Fetching by the old name must now return 404.
    NexusRequest::new(
        RequestBuilder::new(client, Method::GET, &url)
            .expect_status(Some(StatusCode::NOT_FOUND)),
    )
    .authn_as(AuthnMode::PrivilegedUser)
    .execute()
    .await
    .expect("unexpected success finding 1 GiB disk");

    // Reuse the same name for a brand-new disk.
    NexusRequest::new(
        RequestBuilder::new(client, Method::POST, &collection_url)
            .body(Some(&disk_params))
            .expect_status(Some(StatusCode::CREATED)),
    )
    .authn_as(AuthnMode::PrivilegedUser)
    .execute()
    .await
    .expect("unexpected failure creating 1 GiB disk");

    // The replacement exists, detached, and is a different object than
    // the original.
    let fetched = disk_get(&client, &url).await;
    assert_eq!(fetched.state, DiskState::Detached);

    let new_disk_id = fetched.identity.id;
    assert_ne!(original_disk_id, new_disk_id);

    // Simulate a delete-saga unwind: un-delete the original directly in
    // the datastore and set it to faulted.
    datastore
        .project_undelete_disk_set_faulted_no_auth(&original_disk_id)
        .await
        .unwrap();

    // The original is back, faulted, under its ID-derived name.
    let url = format!(
        "/v1/disks/deleted-{}?project={}",
        original_disk_id, PROJECT_NAME
    );
    let fetched = disk_get(&client, &url).await;
    assert_eq!(fetched.state, DiskState::Faulted);

    // The renamed original can still be deleted...
    NexusRequest::new(
        RequestBuilder::new(client, Method::DELETE, &url)
            .expect_status(Some(StatusCode::NO_CONTENT)),
    )
    .authn_as(AuthnMode::PrivilegedUser)
    .execute()
    .await
    .expect("unexpected failure deleting 1 GiB disk");

    // ...and so can the replacement that took over its name.
    let url = format!("/v1/disks/{}?project={}", "disk-one", PROJECT_NAME);
    let fetched = disk_get(&client, &url).await;
    assert_eq!(fetched.state, DiskState::Detached);

    NexusRequest::new(
        RequestBuilder::new(client, Method::DELETE, &url)
            .expect_status(Some(StatusCode::NO_CONTENT)),
    )
    .authn_as(AuthnMode::PrivilegedUser)
    .execute()
    .await
    .expect("unexpected failure deleting 1 GiB disk");
}

// Test disk size accounting
#[nexus_test]
async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) {
Expand Down
Loading