From 8e7a1ef4e485cea3f388f1c82666ecf33b727760 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 3 Jun 2024 15:23:28 -0700 Subject: [PATCH] fix lock generations getting eaten --- nexus/db-model/src/instance.rs | 42 ++++++------ nexus/db-queries/src/db/datastore/instance.rs | 67 +++++++++++-------- .../app/sagas/instance_update/destroyed.rs | 2 +- nexus/src/app/sagas/instance_update/mod.rs | 9 ++- 4 files changed, 69 insertions(+), 51 deletions(-) diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index 8f110aff714..5332aedb4a6 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -58,6 +58,24 @@ pub struct Instance { #[diesel(embed)] pub runtime_state: InstanceRuntimeState, + + /// A UUID identifying the saga currently holding the update lock on this + /// instance. If this is [`None`] the instance is not locked. Otherwise, if + /// this is [`Some`], the instance is locked by the saga owning this UUID. + /// Note that this is not (presently) the UUID *of* the locking saga, but + /// rather, a UUID *generated by* that saga. Therefore, it may not be + /// useable to look up which saga holds the lock. + /// + /// This field is guarded by the instance's `updater_gen` + #[diesel(column_name = updater_id)] + pub updater_id: Option, + + /// The generation number for the updater lock. This is updated whenever the + /// lock is acquired or released, and is used in attempts to set the + /// `updater_id` field to ensure that the snapshot which indicated that the + /// lock was not held is still valid when setting the lock ID. + #[diesel(column_name = updater_gen)] + pub updater_gen: Generation, } impl Instance { @@ -87,6 +105,8 @@ impl Instance { hostname: params.hostname.to_string(), boot_on_fault: false, runtime_state, + updater_id: None, + updater_gen: Generation::new(), } } @@ -179,24 +199,6 @@ pub struct InstanceRuntimeState { /// This field is guarded by the instance's `gen`. #[diesel(column_name = migration_id)] pub migration_id: Option, - - /// A UUID identifying the saga currently holding the update lock on this - /// instance. If this is [`None`] the instance is not locked. Otherwise, if - /// this is [`Some`], the instance is locked by the saga owning this UUID. - /// Note that this is not (presently) the UUID *of* the locking saga, but - /// rather, a UUID *generated by* that saga. Therefore, it may not be - /// useable to look up which saga holds the lock. - /// - /// This field is guarded by the instance's `updater_gen` - #[diesel(column_name = updater_id)] - pub updater_id: Option, - - /// The generation number for the updater lock. This is updated whenever the - /// lock is acquired or released, and is used in attempts to set the - /// `updater_id` field to ensure that the snapshot which indicated that the - /// lock was not held is still valid when setting the lock ID. - #[diesel(column_name = updater_gen)] - pub updater_gen: Generation, } impl InstanceRuntimeState { @@ -208,8 +210,6 @@ impl InstanceRuntimeState { dst_propolis_id: None, migration_id: None, gen: Generation::new(), - updater_gen: Generation::new(), - updater_id: None, } } } @@ -233,8 +233,6 @@ impl From propolis_id: state.propolis_id, dst_propolis_id: state.dst_propolis_id, migration_id: state.migration_id, - updater_gen: Generation::new(), - updater_id: None, } } } diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 315d1ee6c8f..ec2f4c05593 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -159,8 +159,8 @@ pub struct InstanceAndVmms { /// when the lock is released. #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct UpdaterLock { - saga_lock_id: Uuid, - locked_gen: Generation, + pub saga_lock_id: Uuid, + pub locked_gen: Generation, } /// Errors returned by [`DataStore::instance_updater_lock`]. @@ -721,22 +721,21 @@ impl DataStore { // *same* instance at the same time. So, idempotency is probably more // important than handling that extremely unlikely edge case. let mut did_lock = false; + let mut locked_gen = instance.updater_gen; loop { - match instance.runtime_state.updater_id { + match instance.updater_id { // If the `updater_id` field is not null and the ID equals this // saga's ID, we already have the lock. We're done here! Some(lock_id) if lock_id == saga_lock_id => { - slog::info!( + slog::debug!( &opctx.log, "instance updater lock acquired!"; "instance_id" => %instance_id, "saga_id" => %saga_lock_id, "already_locked" => !did_lock, + "locked_gen" => ?locked_gen, ); - return Ok(UpdaterLock { - saga_lock_id, - locked_gen: instance.runtime_state.updater_gen, - }); + return Ok(UpdaterLock { saga_lock_id, locked_gen }); } // The `updater_id` field is set, but it's not our ID. The instance // is locked by a different saga, so give up. @@ -756,7 +755,8 @@ impl DataStore { } // Okay, now attempt to acquire the lock - let current_gen = instance.runtime_state.updater_gen; + let current_gen = instance.updater_gen; + locked_gen = Generation(current_gen.0.next()); slog::debug!( &opctx.log, "attempting to acquire instance updater lock"; @@ -779,7 +779,7 @@ impl DataStore { // of a non-distributed, single-process mutex. .filter(dsl::updater_gen.eq(current_gen)) .set(( - dsl::updater_gen.eq(dsl::updater_gen + 1), + dsl::updater_gen.eq(locked_gen), dsl::updater_id.eq(Some(saga_lock_id)), )) .check_if_exists::(instance_id) @@ -819,6 +819,7 @@ impl DataStore { use db::schema::instance::dsl; let instance_id = authz_instance.id(); + let new_gen = Generation(locked_gen.0.next()); let result = diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) @@ -826,7 +827,7 @@ impl DataStore { .filter(dsl::updater_gen.eq(locked_gen)) .filter(dsl::updater_id.eq(parent_id)) .set(( - dsl::updater_gen.eq(dsl::updater_gen + 1), + dsl::updater_gen.eq(new_gen), dsl::updater_id.eq(Some(child_lock_id)), )) .check_if_exists::(instance_id) @@ -845,27 +846,30 @@ impl DataStore { match result { // If we updated the record, the lock has been released! Return // `Ok(true)` to indicate that we released the lock successfully. - UpdateAndQueryResult { - status: UpdateStatus::Updated, - ref found, - } => Ok(UpdaterLock { - saga_lock_id: child_lock_id, - locked_gen: found.runtime_state.updater_gen, - }), + UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { + slog::info!( + &opctx.log, + "inherited lock from {parent_id} to {child_lock_id}"; + "instance_id" => %instance_id, + "locked_gen" => ?new_gen, + "parent_gen" => ?locked_gen, + ); + Ok(UpdaterLock { + saga_lock_id: child_lock_id, + locked_gen: new_gen, + }) + } // The generation has advanced past the generation at which the - // lock was held. This means that we have already released the + // lock was held. This means that we have already inherited the // lock. Return `Ok(false)` here for idempotency. UpdateAndQueryResult { status: UpdateStatus::NotUpdatedButExists, ref found, - } if found.runtime_state.updater_id == Some(child_lock_id) => { - debug_assert_eq!( - found.runtime_state.updater_gen, - Generation(locked_gen.0.next()), - ); + } if found.updater_id == Some(child_lock_id) => { + debug_assert_eq!(found.updater_gen, new_gen,); Ok(UpdaterLock { saga_lock_id: child_lock_id, - locked_gen: found.runtime_state.updater_gen, + locked_gen: new_gen, }) } // The instance exists, but the lock ID doesn't match our lock ID. @@ -937,13 +941,22 @@ impl DataStore { UpdateAndQueryResult { status: UpdateStatus::NotUpdatedButExists, ref found, - } if found.runtime_state.updater_gen > locked_gen => Ok(false), + } if found.updater_gen > locked_gen => Ok(false), // The instance exists, but the lock ID doesn't match our lock ID. // This means we were trying to release a lock we never held, whcih // is almost certainly a programmer error. UpdateAndQueryResult { ref found, .. } => { - match found.runtime_state.updater_id { + match found.updater_id { Some(lock_holder) => { + slog::error!( + &opctx.log, + "attempted to release a lock held by another saga"; + "instance_id" => %instance_id, + "saga_id" => %saga_lock_id, + "lock_holder" => %lock_holder, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); debug_assert_ne!(lock_holder, saga_lock_id); Err(Error::internal_error( "attempted to release a lock held by another saga! this is a bug!", diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 4455dddb2e2..e9ca7d4ea66 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -12,6 +12,7 @@ use nexus_db_model::Instance; use nexus_db_model::InstanceRuntimeState; use nexus_db_queries::authn; use nexus_db_queries::authz; +use nexus_db_queries::db::datastore::instance; use omicron_common::api::external; use omicron_common::api::external::Error; use serde::{Deserialize, Serialize}; @@ -249,7 +250,6 @@ async fn siud_update_instance( ) -> Result<(), ActionError> { let Params { ref authz_instance, ref vmm_id, instance, .. } = sagactx.saga_params::()?; - let osagactx = sagactx.user_data(); let new_runtime = InstanceRuntimeState { propolis_id: None, diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index d8ae65a18cb..35a6ea4c708 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -206,11 +206,18 @@ async fn unlock_instance_inner( "lock" => ?lock, ); - osagactx + let did_unlock = osagactx .datastore() .instance_updater_unlock(&opctx, authz_instance, lock) .await .map_err(ActionError::action_failed)?; + slog::info!( + osagactx.log(), + "instance update: unlocked instance"; + "instance_id" => %authz_instance.id(), + "did_unlock" => ?did_unlock, + ); + Ok(()) }