From 5831dc67ee5c1d3eeddd82ac97eda23dcf625f02 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Tue, 6 Aug 2024 13:24:02 -0400
Subject: [PATCH] Snapshot creation races with region replacement (#6213)

Prevent the snapshot creation saga from racing with the region
replacement saga by having the snapshot creation saga create a volume
repair record, locking the volume for the duration of the snapshot
creation.

There's a large comment in the new `ssc_take_volume_lock` talking about
the implications of these two sagas racing.
---
 nexus/db-queries/src/db/datastore/mod.rs      |   1 +
 .../src/db/datastore/region_replacement.rs    |  30 ++--
 .../src/db/datastore/volume_repair.rs         |  85 +++++++++++
 .../app/sagas/region_replacement_finish.rs    |   2 +-
 nexus/src/app/sagas/snapshot_create.rs        | 144 ++++++++++++++++++
 5 files changed, 242 insertions(+), 20 deletions(-)
 create mode 100644 nexus/db-queries/src/db/datastore/volume_repair.rs

diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index 2540790477..b6696ee60d 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -103,6 +103,7 @@ mod v2p_mapping;
 mod virtual_provisioning_collection;
 mod vmm;
 mod volume;
+mod volume_repair;
 mod vpc;
 mod zpool;
 
diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs
index 46baf1fd16..d87356e40f 100644
--- a/nexus/db-queries/src/db/datastore/region_replacement.rs
+++ b/nexus/db-queries/src/db/datastore/region_replacement.rs
@@ -16,7 +16,6 @@ use crate::db::model::RegionReplacementState;
 use crate::db::model::RegionReplacementStep;
 use crate::db::model::UpstairsRepairNotification;
 use crate::db::model::UpstairsRepairNotificationType;
-use crate::db::model::VolumeRepair;
 use crate::db::pagination::paginated;
 use crate::db::pagination::Paginator;
 use crate::db::update_and_check::UpdateAndCheck;
@@ -57,13 +56,8 @@ impl DataStore {
             .await?
             .transaction_async(|conn| async move {
                 use db::schema::region_replacement::dsl;
-                use db::schema::volume_repair::dsl as volume_repair_dsl;
 
-                diesel::insert_into(volume_repair_dsl::volume_repair)
-                    .values(VolumeRepair {
-                        volume_id: request.volume_id,
-                        repair_id: request.id,
-                    })
+                Self::volume_repair_insert_query(request.volume_id, request.id)
                     .execute_async(&conn)
                     .await?;
 
@@ -667,7 +661,7 @@ impl DataStore {
     pub async fn set_region_replacement_complete(
         &self,
         opctx: &OpContext,
-        region_replacement_id: Uuid,
+        request: RegionReplacement,
         operating_saga_id: Uuid,
     ) -> Result<(), Error> {
         type TxnError = TransactionError<Error>;
@@ -675,19 +669,17 @@ impl DataStore {
         self.pool_connection_authorized(opctx)
             .await?
             .transaction_async(|conn| async move {
-                use db::schema::volume_repair::dsl as volume_repair_dsl;
-
-                diesel::delete(
-                    volume_repair_dsl::volume_repair
-                        .filter(volume_repair_dsl::repair_id.eq(region_replacement_id))
-                )
-                .execute_async(&conn)
-                .await?;
+                Self::volume_repair_delete_query(
+                    request.volume_id,
+                    request.id,
+                )
+                .execute_async(&conn)
+                .await?;
 
                 use db::schema::region_replacement::dsl;
 
                 let result = diesel::update(dsl::region_replacement)
-                    .filter(dsl::id.eq(region_replacement_id))
+                    .filter(dsl::id.eq(request.id))
                     .filter(
                         dsl::replacement_state.eq(RegionReplacementState::Completing),
                     )
@@ -696,7 +688,7 @@ impl DataStore {
                         dsl::replacement_state.eq(RegionReplacementState::Complete),
                         dsl::operating_saga_id.eq(Option::<Uuid>::None),
                     ))
-                    .check_if_exists::<RegionReplacement>(region_replacement_id)
+                    .check_if_exists::<RegionReplacement>(request.id)
                     .execute_and_check(&conn)
                     .await?;
 
@@ -713,7 +705,7 @@ impl DataStore {
                 } else {
                     Err(TxnError::CustomError(Error::conflict(format!(
                         "region replacement {} set to {:?} (operating saga id {:?})",
-                        region_replacement_id,
+                        request.id,
                         record.replacement_state,
                         record.operating_saga_id,
                     ))))
diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs
new file mode 100644
index 0000000000..5230e60e3e
--- /dev/null
+++ b/nexus/db-queries/src/db/datastore/volume_repair.rs
@@ -0,0 +1,85 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! [`DataStore`] methods on [`VolumeRepair`]s.
+
+use super::DataStore;
+use crate::db;
+use crate::db::datastore::OpContext;
+use crate::db::datastore::RunnableQuery;
+use crate::db::error::public_error_from_diesel;
+use crate::db::error::ErrorHandler;
+use crate::db::model::VolumeRepair;
+use async_bb8_diesel::AsyncRunQueryDsl;
+use diesel::prelude::*;
+use diesel::result::Error as DieselError;
+use omicron_common::api::external::Error;
+use uuid::Uuid;
+
+impl DataStore {
+    pub(super) fn volume_repair_insert_query(
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> impl RunnableQuery<VolumeRepair> {
+        use db::schema::volume_repair::dsl;
+
+        diesel::insert_into(dsl::volume_repair)
+            .values(VolumeRepair { volume_id, repair_id })
+    }
+
+    pub async fn volume_repair_lock(
+        &self,
+        opctx: &OpContext,
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> Result<(), Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+        Self::volume_repair_insert_query(volume_id, repair_id)
+            .execute_async(&*conn)
+            .await
+            .map(|_| ())
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    pub(super) fn volume_repair_delete_query(
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> impl RunnableQuery<VolumeRepair> {
+        use db::schema::volume_repair::dsl;
+
+        diesel::delete(
+            dsl::volume_repair
+                .filter(dsl::volume_id.eq(volume_id))
+                .filter(dsl::repair_id.eq(repair_id)),
+        )
+    }
+
+    pub async fn volume_repair_unlock(
+        &self,
+        opctx: &OpContext,
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> Result<(), Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+        Self::volume_repair_delete_query(volume_id, repair_id)
+            .execute_async(&*conn)
+            .await
+            .map(|_| ())
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    pub async fn volume_repair_get(
+        conn: &async_bb8_diesel::Connection<db::DbConnection>,
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> Result<VolumeRepair, DieselError> {
+        use db::schema::volume_repair::dsl;
+
+        dsl::volume_repair
+            .filter(dsl::repair_id.eq(repair_id))
+            .filter(dsl::volume_id.eq(volume_id))
+            .first_async::<VolumeRepair>(conn)
+            .await
+    }
+}
diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs
index e17f3405a0..8d8e75ea91 100644
--- a/nexus/src/app/sagas/region_replacement_finish.rs
+++ b/nexus/src/app/sagas/region_replacement_finish.rs
@@ -196,7 +196,7 @@ async fn srrf_update_request_record(
     // record to 'Complete' and clear the operating saga id. There is no undo
     // step for this, it should succeed idempotently.
     datastore
-        .set_region_replacement_complete(&opctx, params.request.id, saga_id)
+        .set_region_replacement_complete(&opctx, params.request, saga_id)
         .await
         .map_err(ActionError::action_failed)?;
 
diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs
index 5a8313229a..1d6903fa61 100644
--- a/nexus/src/app/sagas/snapshot_create.rs
+++ b/nexus/src/app/sagas/snapshot_create.rs
@@ -137,6 +137,10 @@ pub(crate) struct Params {
 // snapshot create saga: actions
 declare_saga_actions! {
     snapshot_create;
+    TAKE_VOLUME_LOCK -> "volume_lock" {
+        + ssc_take_volume_lock
+        - ssc_take_volume_lock_undo
+    }
     REGIONS_ALLOC -> "datasets_and_regions" {
         + ssc_alloc_regions
         - ssc_alloc_regions_undo
     }
@@ -200,6 +204,9 @@ declare_saga_actions! {
     FINALIZE_SNAPSHOT_RECORD -> "finalized_snapshot" {
         + ssc_finalize_snapshot_record
     }
+    RELEASE_VOLUME_LOCK -> "volume_unlock" {
+        + ssc_release_volume_lock
+    }
 }
 
 // snapshot create saga: definition
@@ -237,6 +244,14 @@ impl NexusSaga for SagaSnapshotCreate {
             ACTION_GENERATE_ID.as_ref(),
         ));
 
+        builder.append(Node::action(
+            "lock_id",
+            "GenerateLockId",
+            ACTION_GENERATE_ID.as_ref(),
+        ));
+
+        builder.append(take_volume_lock_action());
+
         // (DB) Allocate region space for snapshot to store blocks post-scrub
         builder.append(regions_alloc_action());
         // (Sleds) Reaches out to each dataset, and ensures the regions exist
@@ -291,6 +306,8 @@ impl NexusSaga for SagaSnapshotCreate {
             builder.append(detach_disk_from_pantry_action());
         }
 
+        builder.append(release_volume_lock_action());
+
         Ok(builder.build()?)
     }
 }
@@ -301,6 +318,106 @@ async fn ssc_noop(_sagactx: NexusActionContext) -> Result<(), ActionError> {
     Ok(())
 }
 
+async fn ssc_take_volume_lock(
+    sagactx: NexusActionContext,
+) -> Result<(), ActionError> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+    let lock_id = sagactx.lookup::<Uuid>("lock_id")?;
+
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
+
+    // Without a lock on the volume, this saga can race with the region
+    // replacement saga, causing (at least!) two problems:
+    //
+    // - the request that this saga sends to a propolis to create a snapshot
+    //   races with the vcr_replace request that the region replacement drive
+    //   saga sends. Imagining that the region set for a disk's volume is [A, B,
+    //   C] before either saga starts, then the following scenarios can occur:
+    //
+    //   1. the snapshot request lands before the vcr_replace request, meaning
+    //      snapshots are created for the A, B, and C regions, but the region
+    //      set was then modified to [A, B, D]: this means that two of the three
+    //      regions have associated snapshots, and this will cause this saga to
+    //      unwind when it checks that the associated region snapshots were
+    //      created ok. as it's currently written, this saga could also error
+    //      during unwind, as it's trying to clean up the snapshots for A, B,
+    //      and C, and it could see a "Not Found" when querying C for the
+    //      snapshot.
+    //
+    //   2. the vcr_replace request lands before the snapshot request, meaning
+    //      snapshots would be created for the A, B, and D regions. this is a
+    //      problem because D is new (having been allocated to replace C), and
+    //      the upstairs could be performing live repair. taking a snapshot of
+    //      an upstairs during live repair means either:
+    //
+    //      a. the Upstairs will reject it, causing this saga to unwind ok
+    //
+    //      b. the Upstairs will allow it, meaning any read-only Upstairs that
+    //         uses the associated snapshots ([A', B', D']) will detect that
+    //         reconciliation is required, not be able to perform reconciliation
+    //         because it is read-only, and panic. note: accepting and
+    //         performing a snapshot during live repair is almost certainly a
+    //         bug in Crucible, not Nexus!
+    //
+    //      if the upstairs is _not_ performing live repair yet, then the
+    //      snapshot could succeed. This means each of the A, B, and D regions
+    //      will have an associated snapshot, but the D snapshot is of a blank
+    //      region! the same problem will occur as 2b: a read-only Upstairs that
+    //      uses those snapshots as targets will panic because the data doesn't
+    //      match and it can't perform reconciliation.
+    //
+    // - get_allocated_regions will change during the execution of this saga,
+    //   due to region replacement(s) occurring.
+    //
+    // With a lock on the volume, the snapshot creation and region replacement
+    // drive sagas are serialized. Note this does mean that region replacement
+    // is blocked by a snapshot being created, and snapshot creation is blocked
+    // by region replacement.
+
+    let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
+        .disk_id(params.disk_id)
+        .fetch()
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    osagactx
+        .datastore()
+        .volume_repair_lock(&opctx, disk.volume_id, lock_id)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(())
+}
+
+async fn ssc_take_volume_lock_undo(
+    sagactx: NexusActionContext,
+) -> anyhow::Result<()> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+    let lock_id = sagactx.lookup::<Uuid>("lock_id")?;
+
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
+
+    let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
+        .disk_id(params.disk_id)
+        .fetch()
+        .await?;
+
+    osagactx
+        .datastore()
+        .volume_repair_unlock(&opctx, disk.volume_id, lock_id)
+        .await?;
+
+    Ok(())
+}
+
 async fn ssc_alloc_regions(
     sagactx: NexusActionContext,
 ) -> Result<Vec<(db::model::Dataset, db::model::Region)>, ActionError> {
@@ -1538,6 +1655,33 @@ async fn ssc_finalize_snapshot_record(
     Ok(snapshot)
 }
 
+async fn ssc_release_volume_lock(
+    sagactx: NexusActionContext,
+) -> Result<(), ActionError> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+    let lock_id = sagactx.lookup::<Uuid>("lock_id")?;
+
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
+
+    let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
+        .disk_id(params.disk_id)
+        .fetch()
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    osagactx
+        .datastore()
+        .volume_repair_unlock(&opctx, disk.volume_id, lock_id)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(())
+}
+
 // helper functions
 
 /// Create a Snapshot VolumeConstructionRequest by copying a disk's
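
How the repair record acts as a lock: this approach relies on the volume_repair
table admitting at most one row per volume, so the second of two concurrent
lock attempts fails at insert time instead of both succeeding. That constraint
is an assumption inferred from the usage above, not something visible in these
hunks. The following standalone Rust sketch models that behaviour with an
in-memory map standing in for the real Diesel/CockroachDB table; the
VolumeRepairTable type and its lock/unlock methods are illustrative stand-ins
for the volume_repair_lock / volume_repair_unlock datastore methods added by
the patch.

    // Minimal model of "insert a row keyed by volume_id to lock the volume".
    use std::collections::HashMap;
    use std::sync::Mutex;

    // Stand-in for uuid::Uuid so the sketch has no external dependencies.
    type Uuid = u128;

    #[derive(Debug)]
    struct VolumeLockedError;

    #[derive(Default)]
    struct VolumeRepairTable {
        // volume_id -> repair_id; at most one repair record per volume.
        rows: Mutex<HashMap<Uuid, Uuid>>,
    }

    impl VolumeRepairTable {
        // Analogue of volume_repair_lock: insert a record, failing if the
        // volume already has one.
        fn lock(&self, volume_id: Uuid, repair_id: Uuid) -> Result<(), VolumeLockedError> {
            let mut rows = self.rows.lock().unwrap();
            if rows.contains_key(&volume_id) {
                return Err(VolumeLockedError);
            }
            rows.insert(volume_id, repair_id);
            Ok(())
        }

        // Analogue of volume_repair_unlock: delete the record for this
        // (volume_id, repair_id) pair.
        fn unlock(&self, volume_id: Uuid, repair_id: Uuid) {
            let mut rows = self.rows.lock().unwrap();
            if rows.get(&volume_id) == Some(&repair_id) {
                rows.remove(&volume_id);
            }
        }
    }

    fn main() {
        let table = VolumeRepairTable::default();
        let volume = 1;

        // The snapshot creation saga takes the lock first...
        table.lock(volume, 100).expect("snapshot saga locks the volume");

        // ...so the region replacement saga's attempt fails instead of both
        // sagas operating on the same volume at once.
        assert!(table.lock(volume, 200).is_err());

        // Once the snapshot saga releases the lock, replacement can proceed.
        table.unlock(volume, 100);
        table.lock(volume, 200).expect("replacement saga locks after release");
        println!("lock/unlock sequence behaved as expected");
    }

Whichever saga loses the race sees an insert error and unwinds (or retries
later) rather than both proceeding against the same volume, which is the
serialization the commit message describes.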