Skip to content

Commit

Permalink
Remove updater lock from instance runtime state
Browse files Browse the repository at this point in the history
In #5831, I added the `updater_gen` and `updater_id` fields for the
instance-updater lock to the `InstanceRuntimeState` struct, based purely
on vibes: "They change at runtime, right? Therefore, they ought to be
'runtime state'...". It turns out that this is actually Super Ultra
Wrong, because any query which writes an `InstanceRuntimeState` without
paying attention to the lock fields can inadvertantly clobber them,
either releasing the lock or setting it back to a previous lock holder.

These fields should be on the `Instance` struct instead, to avoid this
kind of thing. This commit moves them to the correct place. I figured it
would be nice to land this separately from #5749, purely for the sake of
keeping that diff small.
  • Loading branch information
hawkw committed Jun 12, 2024
1 parent 9d20b90 commit 3cb42cb
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 27 deletions.
43 changes: 21 additions & 22 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,24 @@ pub struct Instance {

#[diesel(embed)]
pub runtime_state: InstanceRuntimeState,

/// The generation number for the updater lock. This is updated whenever the
/// lock is acquired or released, and is used in attempts to set the
/// `updater_id` field to ensure that the snapshot which indicated that the
/// lock was not held is still valid when setting the lock ID.
#[diesel(column_name = updater_gen)]
pub updater_gen: Generation,

/// A UUID identifying the saga currently holding the update lock on this
/// instance. If this is [`None`] the instance is not locked. Otherwise, if
/// this is [`Some`], the instance is locked by the saga owning this UUID.
/// Note that this is not (presently) the UUID *of* the locking saga, but
/// rather, a UUID *generated by* that saga. Therefore, it may not be
/// useable to look up which saga holds the lock.
///
/// This field is guarded by the instance's `updater_gen`
#[diesel(column_name = updater_id)]
pub updater_id: Option<Uuid>,
}

impl Instance {
Expand Down Expand Up @@ -85,6 +103,9 @@ impl Instance {
hostname: params.hostname.to_string(),
boot_on_fault: false,
runtime_state,

updater_gen: Generation::new(),
updater_id: None,
}
}

Expand Down Expand Up @@ -171,24 +192,6 @@ pub struct InstanceRuntimeState {
#[diesel(column_name = migration_id)]
pub migration_id: Option<Uuid>,

/// A UUID identifying the saga currently holding the update lock on this
/// instance. If this is [`None`] the instance is not locked. Otherwise, if
/// this is [`Some`], the instance is locked by the saga owning this UUID.
/// Note that this is not (presently) the UUID *of* the locking saga, but
/// rather, a UUID *generated by* that saga. Therefore, it may not be
/// useable to look up which saga holds the lock.
///
/// This field is guarded by the instance's `updater_gen`
#[diesel(column_name = updater_id)]
pub updater_id: Option<Uuid>,

/// The generation number for the updater lock. This is updated whenever the
/// lock is acquired or released, and is used in attempts to set the
/// `updater_id` field to ensure that the snapshot which indicated that the
/// lock was not held is still valid when setting the lock ID.
#[diesel(column_name = updater_gen)]
pub updater_gen: Generation,

/// The "internal" state of this instance. The instance's externally-visible
/// state may be delegated to the instance's active VMM, if it has one.
///
Expand All @@ -206,8 +209,6 @@ impl InstanceRuntimeState {
dst_propolis_id: None,
migration_id: None,
gen: Generation::new(),
updater_gen: Generation::new(),
updater_id: None,
}
}
}
Expand All @@ -231,8 +232,6 @@ impl From<omicron_common::api::internal::nexus::InstanceRuntimeState>
propolis_id: state.propolis_id,
dst_propolis_id: state.dst_propolis_id,
migration_id: state.migration_id,
updater_gen: Generation::new(),
updater_id: None,
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,7 @@ impl DataStore {
// important than handling that extremely unlikely edge case.
let mut did_lock = false;
loop {
match instance.runtime_state.updater_id {
match instance.updater_id {
// If the `updater_id` field is not null and the ID equals this
// saga's ID, we already have the lock. We're done here!
Some(lock_id) if lock_id == saga_lock_id => {
Expand All @@ -659,7 +659,7 @@ impl DataStore {
);
return Ok(UpdaterLock {
saga_lock_id,
locked_gen: instance.runtime_state.updater_gen,
locked_gen: instance.updater_gen,
});
}
// The `updater_id` field is set, but it's not our ID. The instance
Expand All @@ -680,7 +680,7 @@ impl DataStore {
}

// Okay, now attempt to acquire the lock
let current_gen = instance.runtime_state.updater_gen;
let current_gen = instance.updater_gen;
slog::debug!(
&opctx.log,
"attempting to acquire instance updater lock";
Expand Down Expand Up @@ -795,12 +795,12 @@ impl DataStore {
UpdateAndQueryResult {
status: UpdateStatus::NotUpdatedButExists,
ref found,
} if found.runtime_state.updater_gen > locked_gen => Ok(false),
} if found.updater_gen > locked_gen => Ok(false),
// The instance exists, but the lock ID doesn't match our lock ID.
// This means we were trying to release a lock we never held, whcih
// is almost certainly a programmer error.
UpdateAndQueryResult { ref found, .. } => {
match found.runtime_state.updater_id {
match found.updater_id {
Some(lock_holder) => {
debug_assert_ne!(lock_holder, saga_lock_id);
Err(Error::internal_error(
Expand Down

0 comments on commit 3cb42cb

Please sign in to comment.