Skip to content

Commit

Permalink
fix lock generations getting eaten
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkw committed Jun 3, 2024
1 parent 23b2496 commit 8e7a1ef
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 51 deletions.
42 changes: 20 additions & 22 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,24 @@ pub struct Instance {

#[diesel(embed)]
pub runtime_state: InstanceRuntimeState,

/// A UUID identifying the saga currently holding the update lock on this
/// instance. If this is [`None`] the instance is not locked. Otherwise, if
/// this is [`Some`], the instance is locked by the saga owning this UUID.
/// Note that this is not (presently) the UUID *of* the locking saga, but
/// rather, a UUID *generated by* that saga. Therefore, it may not be
/// useable to look up which saga holds the lock.
///
/// This field is guarded by the instance's `updater_gen`
#[diesel(column_name = updater_id)]
pub updater_id: Option<Uuid>,

/// The generation number for the updater lock. This is updated whenever the
/// lock is acquired or released, and is used in attempts to set the
/// `updater_id` field to ensure that the snapshot which indicated that the
/// lock was not held is still valid when setting the lock ID.
#[diesel(column_name = updater_gen)]
pub updater_gen: Generation,
}

impl Instance {
Expand Down Expand Up @@ -87,6 +105,8 @@ impl Instance {
hostname: params.hostname.to_string(),
boot_on_fault: false,
runtime_state,
updater_id: None,
updater_gen: Generation::new(),
}
}

Expand Down Expand Up @@ -179,24 +199,6 @@ pub struct InstanceRuntimeState {
/// This field is guarded by the instance's `gen`.
#[diesel(column_name = migration_id)]
pub migration_id: Option<Uuid>,

/// A UUID identifying the saga currently holding the update lock on this
/// instance. If this is [`None`] the instance is not locked. Otherwise, if
/// this is [`Some`], the instance is locked by the saga owning this UUID.
/// Note that this is not (presently) the UUID *of* the locking saga, but
/// rather, a UUID *generated by* that saga. Therefore, it may not be
/// useable to look up which saga holds the lock.
///
/// This field is guarded by the instance's `updater_gen`
#[diesel(column_name = updater_id)]
pub updater_id: Option<Uuid>,

/// The generation number for the updater lock. This is updated whenever the
/// lock is acquired or released, and is used in attempts to set the
/// `updater_id` field to ensure that the snapshot which indicated that the
/// lock was not held is still valid when setting the lock ID.
#[diesel(column_name = updater_gen)]
pub updater_gen: Generation,
}

impl InstanceRuntimeState {
Expand All @@ -208,8 +210,6 @@ impl InstanceRuntimeState {
dst_propolis_id: None,
migration_id: None,
gen: Generation::new(),
updater_gen: Generation::new(),
updater_id: None,
}
}
}
Expand All @@ -233,8 +233,6 @@ impl From<omicron_common::api::internal::nexus::InstanceRuntimeState>
propolis_id: state.propolis_id,
dst_propolis_id: state.dst_propolis_id,
migration_id: state.migration_id,
updater_gen: Generation::new(),
updater_id: None,
}
}
}
Expand Down
67 changes: 40 additions & 27 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ pub struct InstanceAndVmms {
/// when the lock is released.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct UpdaterLock {
saga_lock_id: Uuid,
locked_gen: Generation,
pub saga_lock_id: Uuid,
pub locked_gen: Generation,
}

/// Errors returned by [`DataStore::instance_updater_lock`].
Expand Down Expand Up @@ -721,22 +721,21 @@ impl DataStore {
// *same* instance at the same time. So, idempotency is probably more
// important than handling that extremely unlikely edge case.
let mut did_lock = false;
let mut locked_gen = instance.updater_gen;
loop {
match instance.runtime_state.updater_id {
match instance.updater_id {
// If the `updater_id` field is not null and the ID equals this
// saga's ID, we already have the lock. We're done here!
Some(lock_id) if lock_id == saga_lock_id => {
slog::info!(
slog::debug!(
&opctx.log,
"instance updater lock acquired!";
"instance_id" => %instance_id,
"saga_id" => %saga_lock_id,
"already_locked" => !did_lock,
"locked_gen" => ?locked_gen,
);
return Ok(UpdaterLock {
saga_lock_id,
locked_gen: instance.runtime_state.updater_gen,
});
return Ok(UpdaterLock { saga_lock_id, locked_gen });
}
// The `updater_id` field is set, but it's not our ID. The instance
// is locked by a different saga, so give up.
Expand All @@ -756,7 +755,8 @@ impl DataStore {
}

// Okay, now attempt to acquire the lock
let current_gen = instance.runtime_state.updater_gen;
let current_gen = instance.updater_gen;
locked_gen = Generation(current_gen.0.next());
slog::debug!(
&opctx.log,
"attempting to acquire instance updater lock";
Expand All @@ -779,7 +779,7 @@ impl DataStore {
// of a non-distributed, single-process mutex.
.filter(dsl::updater_gen.eq(current_gen))
.set((
dsl::updater_gen.eq(dsl::updater_gen + 1),
dsl::updater_gen.eq(locked_gen),
dsl::updater_id.eq(Some(saga_lock_id)),
))
.check_if_exists::<Instance>(instance_id)
Expand Down Expand Up @@ -819,14 +819,15 @@ impl DataStore {
use db::schema::instance::dsl;

let instance_id = authz_instance.id();
let new_gen = Generation(locked_gen.0.next());

let result = diesel::update(dsl::instance)
.filter(dsl::time_deleted.is_null())
.filter(dsl::id.eq(instance_id))
.filter(dsl::updater_gen.eq(locked_gen))
.filter(dsl::updater_id.eq(parent_id))
.set((
dsl::updater_gen.eq(dsl::updater_gen + 1),
dsl::updater_gen.eq(new_gen),
dsl::updater_id.eq(Some(child_lock_id)),
))
.check_if_exists::<Instance>(instance_id)
Expand All @@ -845,27 +846,30 @@ impl DataStore {
match result {
// If we updated the record, the lock has been released! Return
// `Ok(true)` to indicate that we released the lock successfully.
UpdateAndQueryResult {
status: UpdateStatus::Updated,
ref found,
} => Ok(UpdaterLock {
saga_lock_id: child_lock_id,
locked_gen: found.runtime_state.updater_gen,
}),
UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => {
slog::info!(
&opctx.log,
"inherited lock from {parent_id} to {child_lock_id}";
"instance_id" => %instance_id,
"locked_gen" => ?new_gen,
"parent_gen" => ?locked_gen,
);
Ok(UpdaterLock {
saga_lock_id: child_lock_id,
locked_gen: new_gen,
})
}
// The generation has advanced past the generation at which the
// lock was held. This means that we have already released the
// lock was held. This means that we have already inherited the
// lock. Return `Ok(false)` here for idempotency.
UpdateAndQueryResult {
status: UpdateStatus::NotUpdatedButExists,
ref found,
} if found.runtime_state.updater_id == Some(child_lock_id) => {
debug_assert_eq!(
found.runtime_state.updater_gen,
Generation(locked_gen.0.next()),
);
} if found.updater_id == Some(child_lock_id) => {
debug_assert_eq!(found.updater_gen, new_gen,);
Ok(UpdaterLock {
saga_lock_id: child_lock_id,
locked_gen: found.runtime_state.updater_gen,
locked_gen: new_gen,
})
}
// The instance exists, but the lock ID doesn't match our lock ID.
Expand Down Expand Up @@ -937,13 +941,22 @@ impl DataStore {
UpdateAndQueryResult {
status: UpdateStatus::NotUpdatedButExists,
ref found,
} if found.runtime_state.updater_gen > locked_gen => Ok(false),
} if found.updater_gen > locked_gen => Ok(false),
// The instance exists, but the lock ID doesn't match our lock ID.
// This means we were trying to release a lock we never held, whcih
// is almost certainly a programmer error.
UpdateAndQueryResult { ref found, .. } => {
match found.runtime_state.updater_id {
match found.updater_id {
Some(lock_holder) => {
slog::error!(
&opctx.log,
"attempted to release a lock held by another saga";
"instance_id" => %instance_id,
"saga_id" => %saga_lock_id,
"lock_holder" => %lock_holder,
"found_gen" => ?found.updater_gen,
"locked_gen" => ?locked_gen,
);
debug_assert_ne!(lock_holder, saga_lock_id);
Err(Error::internal_error(
"attempted to release a lock held by another saga! this is a bug!",
Expand Down
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/instance_update/destroyed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use nexus_db_model::Instance;
use nexus_db_model::InstanceRuntimeState;
use nexus_db_queries::authn;
use nexus_db_queries::authz;
use nexus_db_queries::db::datastore::instance;
use omicron_common::api::external;
use omicron_common::api::external::Error;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -249,7 +250,6 @@ async fn siud_update_instance(
) -> Result<(), ActionError> {
let Params { ref authz_instance, ref vmm_id, instance, .. } =
sagactx.saga_params::<Params>()?;

let osagactx = sagactx.user_data();
let new_runtime = InstanceRuntimeState {
propolis_id: None,
Expand Down
9 changes: 8 additions & 1 deletion nexus/src/app/sagas/instance_update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,18 @@ async fn unlock_instance_inner(
"lock" => ?lock,
);

osagactx
let did_unlock = osagactx
.datastore()
.instance_updater_unlock(&opctx, authz_instance, lock)
.await
.map_err(ActionError::action_failed)?;

slog::info!(
osagactx.log(),
"instance update: unlocked instance";
"instance_id" => %authz_instance.id(),
"did_unlock" => ?did_unlock,
);

Ok(())
}

0 comments on commit 8e7a1ef

Please sign in to comment.