Skip to content

Commit

Permalink
destroy both VMMs from the update saga
Browse files Browse the repository at this point in the history
As per [this comment][1] from @gjcolombo, we've determined that the
update saga *should* be responsible for cleaning up sled resource
allocations if handling an update where the active *or* target VMM has
been `Destroyed`. This commit updates the saga to do that, turning the
destroyed VMM cleanup actions into a subsaga so that we can,
potentially, perform it for both VMMs.

Releasing sled resources and marking VMMs as deleted is performed after
the saga has written back the instance record with both VMMs unlinked,
and dropped the lock. This ensures we don't leave "dangling pointers" in
the instance record, by having a foreign key into the VMM table for a
VMM that no longer exists. Since writing back the instance record
releases the lock, this means that cleaning up resource allocations
happens outside the lock. This way, sled resources are deallocated
immediately, to prevent sled resource exhaustion from rapidly churning
failed migrations, but we also release the update lock so that another
update saga can start.

Resources owned by the *instance* --- the oximeter producer and virtual
provisioning resources --- are deallocated within the lock if and only
if the instance has no active VMM. VMMs are cleaned up when they are
destroyed regardless of whether the instance has an active VMM (e.g.
destroyed targets are deleted when a migration fails, and destroyed
source VMMs are deleted when a migration succeeds, but the instance's
virtual resources are not deallocated in either case).

[1]: #5749 (comment)
  • Loading branch information
hawkw committed Jul 3, 2024
1 parent a5f7c91 commit c95fac7
Show file tree
Hide file tree
Showing 3 changed files with 285 additions and 180 deletions.
160 changes: 69 additions & 91 deletions nexus/src/app/sagas/instance_update/destroyed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,88 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::NexusActionContext;
use super::RealParams;
use super::DESTROYED_VMM_ID;
use super::{
declare_saga_actions, ActionRegistry, DagBuilder, NexusActionContext,
NexusSaga, SagaInitError,
};
use crate::app::sagas::ActionError;
use nexus_db_queries::authn;
use omicron_common::api::external::Error;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::InstanceUuid;
use omicron_uuid_kinds::PropolisUuid;
use serde::{Deserialize, Serialize};

pub(super) async fn siu_destroyed_release_sled_resources(
// destroy VMM subsaga: input parameters

#[derive(Debug, Deserialize, Serialize)]
pub(super) struct Params {
/// Authentication context to use to fetch the instance's current state from
/// the database.
pub(super) serialized_authn: authn::saga::Serialized,

/// Instance UUID of the instance being updated. This is only just used
/// for logging, so we just use the instance ID here instead of serializing
/// a whole instance record.
pub(super) instance_id: InstanceUuid,

/// UUID of the VMM to destroy.
pub(super) vmm_id: PropolisUuid,
}

// destroy VMM subsaga: actions

declare_saga_actions! {
destroy_vmm;

// Deallocate physical sled resources reserved for the destroyed VMM, as it
// is no longer using them.
RELEASE_SLED_RESOURCES -> "release_sled_resources" {
+ siu_destroyed_release_sled_resources
}

// Mark the VMM record as deleted.
MARK_VMM_DELETED -> "mark_vmm_deleted" {
+ siu_destroyed_mark_vmm_deleted
}
}

// destroy VMM subsaga: definition

#[derive(Debug)]
pub(super) struct SagaDestroyVmm;
impl NexusSaga for SagaDestroyVmm {
const NAME: &'static str = "destroy-vmm";
type Params = Params;

fn register_actions(registry: &mut ActionRegistry) {
destroy_vmm_register_actions(registry)
}

fn make_saga_dag(
_: &Self::Params,
mut builder: DagBuilder,
) -> Result<steno::Dag, SagaInitError> {
builder.append(release_sled_resources_action());
builder.append(mark_vmm_deleted_action());
Ok(builder.build()?)
}
}

async fn siu_destroyed_release_sled_resources(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let RealParams { ref serialized_authn, ref authz_instance, .. } =
sagactx.saga_params::<RealParams>()?;
let vmm_id = sagactx.lookup::<PropolisUuid>(DESTROYED_VMM_ID)?;
let Params { ref serialized_authn, instance_id, vmm_id, .. } =
sagactx.saga_params::<Params>()?;

let opctx =
crate::context::op_context_for_saga_action(&sagactx, serialized_authn);

info!(
osagactx.log(),
"instance update (active VMM destroyed): deallocating sled resource reservation";
"instance_id" => %authz_instance.id(),
"instance_id" => %instance_id,
"propolis_id" => %vmm_id,
"instance_update" => %"VMM destroyed",
);
Expand All @@ -44,100 +102,20 @@ pub(super) async fn siu_destroyed_release_sled_resources(
.map_err(ActionError::action_failed)
}

pub(super) async fn siu_destroyed_release_virtual_provisioning(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let RealParams { ref serialized_authn, ref authz_instance, state, .. } =
sagactx.saga_params::<RealParams>()?;

let vmm_id = sagactx.lookup::<PropolisUuid>(DESTROYED_VMM_ID)?;
let instance = state.instance;
let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id());

let opctx =
crate::context::op_context_for_saga_action(&sagactx, serialized_authn);

let result = osagactx
.datastore()
.virtual_provisioning_collection_delete_instance(
&opctx,
instance_id,
instance.project_id,
i64::from(instance.ncpus.0 .0),
instance.memory,
)
.await;
match result {
Ok(deleted) => {
info!(
osagactx.log(),
"instance update (VMM destroyed): deallocated virtual \
provisioning resources";
"instance_id" => %instance_id,
"propolis_id" => %vmm_id,
"records_deleted" => ?deleted,
"instance_update" => %"VMM destroyed",
);
}
// Necessary for idempotency --- the virtual provisioning resources may
// have been deleted already, that's fine.
Err(Error::ObjectNotFound { .. }) => {
// TODO(eliza): it would be nice if we could distinguish
// between errors returned by
// `virtual_provisioning_collection_delete_instance` where
// the instance ID was not found, and errors where the
// generation number was too low...
info!(
osagactx.log(),
"instance update (VMM destroyed): virtual provisioning \
record not found; perhaps it has already been deleted?";
"instance_id" => %instance_id,
"propolis_id" => %vmm_id,
"instance_update" => %"VMM destroyed",
);
}
Err(err) => return Err(ActionError::action_failed(err)),
};

Ok(())
}

pub(super) async fn siu_destroyed_unassign_oximeter_producer(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let RealParams { ref serialized_authn, ref authz_instance, .. } =
sagactx.saga_params::<RealParams>()?;

let opctx =
crate::context::op_context_for_saga_action(&sagactx, serialized_authn);

crate::app::oximeter::unassign_producer(
osagactx.datastore(),
osagactx.log(),
&opctx,
&authz_instance.id(),
)
.await
.map_err(ActionError::action_failed)
}

pub(super) async fn siu_destroyed_mark_vmm_deleted(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let RealParams { ref authz_instance, ref serialized_authn, .. } =
sagactx.saga_params::<RealParams>()?;
let vmm_id = sagactx.lookup::<PropolisUuid>(DESTROYED_VMM_ID)?;
let Params { ref serialized_authn, instance_id, vmm_id, .. } =
sagactx.saga_params::<Params>()?;

let opctx =
crate::context::op_context_for_saga_action(&sagactx, serialized_authn);

info!(
osagactx.log(),
"instance update (VMM destroyed): marking VMM record deleted";
"instance_id" => %authz_instance.id(),
"instance_id" => %instance_id,
"propolis_id" => %vmm_id,
"instance_update" => %"VMM destroyed",
);
Expand Down
Loading

0 comments on commit c95fac7

Please sign in to comment.