Snapshot creation races with region replacement (#6213)
Prevent the snapshot creation saga from racing with the region
replacement saga by having the snapshot creation saga create a volume
repair record, locking the volume for the duration of the snapshot
creation.

There's a large comment in the new `ssc_take_volume_lock` that discusses
the implications of these two sagas racing.
jmpesp authored Aug 6, 2024
1 parent 8f5232a commit 5831dc6
Showing 5 changed files with 242 additions and 20 deletions.
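
The "volume repair record" can act as a lock only if the volume_repair table admits at most one record per volume (for example, with volume_id as its primary key); that constraint is assumed here, but it is what makes the insert in volume_repair_lock below mutually exclusive. A minimal, self-contained sketch of the idea, with an in-memory map standing in for the database table — VolumeLockTable and its methods are illustrative stand-ins, not part of the Nexus datastore:

use std::collections::HashMap;
use std::sync::Mutex;

use uuid::Uuid;

// In-memory stand-in for the volume_repair table: at most one entry per
// volume, so holding an entry is equivalent to holding the volume lock.
struct VolumeLockTable {
    rows: Mutex<HashMap<Uuid, Uuid>>, // volume_id -> repair/lock id
}

impl VolumeLockTable {
    // "Insert the repair record": fails if some other saga already holds the
    // lock for this volume, which is what serializes snapshot creation and
    // region replacement.
    fn lock(&self, volume_id: Uuid, lock_id: Uuid) -> Result<(), String> {
        let mut rows = self.rows.lock().unwrap();
        if rows.contains_key(&volume_id) {
            return Err(format!("volume {volume_id} is already locked"));
        }
        rows.insert(volume_id, lock_id);
        Ok(())
    }

    // "Delete the repair record": only the holder's (volume_id, lock_id)
    // pair is removed, so an unrelated unlock is a no-op.
    fn unlock(&self, volume_id: Uuid, lock_id: Uuid) {
        let mut rows = self.rows.lock().unwrap();
        if rows.get(&volume_id) == Some(&lock_id) {
            rows.remove(&volume_id);
        }
    }
}

fn main() {
    let table = VolumeLockTable { rows: Mutex::new(HashMap::new()) };
    let volume = Uuid::new_v4();
    let (snapshot_saga, replacement_saga) = (Uuid::new_v4(), Uuid::new_v4());

    assert!(table.lock(volume, snapshot_saga).is_ok());
    // The second saga cannot take the lock until the first releases it.
    assert!(table.lock(volume, replacement_saga).is_err());
    table.unlock(volume, snapshot_saga);
    assert!(table.lock(volume, replacement_saga).is_ok());
}
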
1 change: 1 addition & 0 deletions nexus/db-queries/src/db/datastore/mod.rs
@@ -103,6 +103,7 @@ mod v2p_mapping;
mod virtual_provisioning_collection;
mod vmm;
mod volume;
mod volume_repair;
mod vpc;
mod zpool;

30 changes: 11 additions & 19 deletions nexus/db-queries/src/db/datastore/region_replacement.rs
@@ -16,7 +16,6 @@ use crate::db::model::RegionReplacementState;
use crate::db::model::RegionReplacementStep;
use crate::db::model::UpstairsRepairNotification;
use crate::db::model::UpstairsRepairNotificationType;
use crate::db::model::VolumeRepair;
use crate::db::pagination::paginated;
use crate::db::pagination::Paginator;
use crate::db::update_and_check::UpdateAndCheck;
@@ -57,13 +56,8 @@ impl DataStore {
.await?
.transaction_async(|conn| async move {
use db::schema::region_replacement::dsl;
use db::schema::volume_repair::dsl as volume_repair_dsl;

diesel::insert_into(volume_repair_dsl::volume_repair)
.values(VolumeRepair {
volume_id: request.volume_id,
repair_id: request.id,
})
Self::volume_repair_insert_query(request.volume_id, request.id)
.execute_async(&conn)
.await?;

@@ -667,27 +661,25 @@ impl DataStore {
pub async fn set_region_replacement_complete(
&self,
opctx: &OpContext,
region_replacement_id: Uuid,
request: RegionReplacement,
operating_saga_id: Uuid,
) -> Result<(), Error> {
type TxnError = TransactionError<Error>;

self.pool_connection_authorized(opctx)
.await?
.transaction_async(|conn| async move {
use db::schema::volume_repair::dsl as volume_repair_dsl;

diesel::delete(
volume_repair_dsl::volume_repair
.filter(volume_repair_dsl::repair_id.eq(region_replacement_id))
)
.execute_async(&conn)
.await?;
Self::volume_repair_delete_query(
request.volume_id,
request.id,
)
.execute_async(&conn)
.await?;

use db::schema::region_replacement::dsl;

let result = diesel::update(dsl::region_replacement)
.filter(dsl::id.eq(region_replacement_id))
.filter(dsl::id.eq(request.id))
.filter(
dsl::replacement_state.eq(RegionReplacementState::Completing),
)
@@ -696,7 +688,7 @@
dsl::replacement_state.eq(RegionReplacementState::Complete),
dsl::operating_saga_id.eq(Option::<Uuid>::None),
))
.check_if_exists::<RegionReplacement>(region_replacement_id)
.check_if_exists::<RegionReplacement>(request.id)
.execute_and_check(&conn)
.await?;

@@ -713,7 +705,7 @@
} else {
Err(TxnError::CustomError(Error::conflict(format!(
"region replacement {} set to {:?} (operating saga id {:?})",
region_replacement_id,
request.id,
record.replacement_state,
record.operating_saga_id,
))))
85 changes: 85 additions & 0 deletions nexus/db-queries/src/db/datastore/volume_repair.rs
@@ -0,0 +1,85 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! [`DataStore`] methods on [`VolumeRepair`]s.

use super::DataStore;
use crate::db;
use crate::db::datastore::OpContext;
use crate::db::datastore::RunnableQuery;
use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::model::VolumeRepair;
use async_bb8_diesel::AsyncRunQueryDsl;
use diesel::prelude::*;
use diesel::result::Error as DieselError;
use omicron_common::api::external::Error;
use uuid::Uuid;

impl DataStore {
pub(super) fn volume_repair_insert_query(
volume_id: Uuid,
repair_id: Uuid,
) -> impl RunnableQuery<VolumeRepair> {
use db::schema::volume_repair::dsl;

diesel::insert_into(dsl::volume_repair)
.values(VolumeRepair { volume_id, repair_id })
}

pub async fn volume_repair_lock(
&self,
opctx: &OpContext,
volume_id: Uuid,
repair_id: Uuid,
) -> Result<(), Error> {
let conn = self.pool_connection_authorized(opctx).await?;
Self::volume_repair_insert_query(volume_id, repair_id)
.execute_async(&*conn)
.await
.map(|_| ())
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

pub(super) fn volume_repair_delete_query(
volume_id: Uuid,
repair_id: Uuid,
) -> impl RunnableQuery<VolumeRepair> {
use db::schema::volume_repair::dsl;

diesel::delete(
dsl::volume_repair
.filter(dsl::volume_id.eq(volume_id))
.filter(dsl::repair_id.eq(repair_id)),
)
}

pub async fn volume_repair_unlock(
&self,
opctx: &OpContext,
volume_id: Uuid,
repair_id: Uuid,
) -> Result<(), Error> {
let conn = self.pool_connection_authorized(opctx).await?;
Self::volume_repair_delete_query(volume_id, repair_id)
.execute_async(&*conn)
.await
.map(|_| ())
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

pub async fn volume_repair_get(
conn: &async_bb8_diesel::Connection<db::DbConnection>,
volume_id: Uuid,
repair_id: Uuid,
) -> Result<VolumeRepair, DieselError> {
use db::schema::volume_repair::dsl;

dsl::volume_repair
.filter(dsl::repair_id.eq(repair_id))
.filter(dsl::volume_id.eq(volume_id))
.first_async::<VolumeRepair>(conn)
.await
}
}
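
The methods above treat insertion of a VolumeRepair row as taking the lock, which relies on the volume_repair table allowing only one row per volume. A plausible shape for that table, written as a Diesel table! sketch for illustration (an assumption, not copied from the Omicron schema):

diesel::table! {
    // One row per volume: with volume_id as the primary key, a second
    // concurrent volume_repair_lock() insert conflicts and surfaces as an
    // error instead of creating a duplicate lock.
    volume_repair (volume_id) {
        volume_id -> Uuid,
        repair_id -> Uuid,
    }
}

Under that constraint, whichever saga inserts its row first holds the volume; the other saga's volume_repair_lock call returns an error and that saga unwinds rather than operating on a volume whose region set may be changing.
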
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/region_replacement_finish.rs
@@ -196,7 +196,7 @@ async fn srrf_update_request_record(
// record to 'Complete' and clear the operating saga id. There is no undo
// step for this, it should succeed idempotently.
datastore
.set_region_replacement_complete(&opctx, params.request.id, saga_id)
.set_region_replacement_complete(&opctx, params.request, saga_id)
.await
.map_err(ActionError::action_failed)?;

144 changes: 144 additions & 0 deletions nexus/src/app/sagas/snapshot_create.rs
@@ -137,6 +137,10 @@ pub(crate) struct Params {
// snapshot create saga: actions
declare_saga_actions! {
snapshot_create;
TAKE_VOLUME_LOCK -> "volume_lock" {
+ ssc_take_volume_lock
- ssc_take_volume_lock_undo
}
REGIONS_ALLOC -> "datasets_and_regions" {
+ ssc_alloc_regions
- ssc_alloc_regions_undo
@@ -200,6 +204,9 @@ declare_saga_actions! {
FINALIZE_SNAPSHOT_RECORD -> "finalized_snapshot" {
+ ssc_finalize_snapshot_record
}
RELEASE_VOLUME_LOCK -> "volume_unlock" {
+ ssc_release_volume_lock
}
}

// snapshot create saga: definition
@@ -237,6 +244,14 @@ impl NexusSaga for SagaSnapshotCreate {
ACTION_GENERATE_ID.as_ref(),
));

builder.append(Node::action(
"lock_id",
"GenerateLockId",
ACTION_GENERATE_ID.as_ref(),
));

builder.append(take_volume_lock_action());

// (DB) Allocate region space for snapshot to store blocks post-scrub
builder.append(regions_alloc_action());
// (Sleds) Reaches out to each dataset, and ensures the regions exist
@@ -291,6 +306,8 @@ impl NexusSaga for SagaSnapshotCreate {
builder.append(detach_disk_from_pantry_action());
}

builder.append(release_volume_lock_action());

Ok(builder.build()?)
}
}
@@ -301,6 +318,106 @@ async fn ssc_noop(_sagactx: NexusActionContext) -> Result<(), ActionError> {
Ok(())
}

async fn ssc_take_volume_lock(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let lock_id = sagactx.lookup::<Uuid>("lock_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

// Without a lock on the volume, this saga can race with the region
// replacement saga, causing (at least!) two problems:
//
// - the request that this saga sends to a propolis to create a snapshot
// races with the vcr_replace request that the region replacement drive
// saga sends. Imagining that the region set for a disk's volume is [A, B,
// C] before either saga starts, then the following scenarios can occur:
//
// 1. the snapshot request lands before the vcr_replace request, meaning
// snapshots are created for the A, B, and C regions, but the region
// set was then modified to [A, B, D]: this means that two of the three
// regions have associated snapshots, and this will cause this saga to
// unwind when it checks that the associated region snapshots were
// created ok. as it's currently written, this saga could also error
// during unwind, as it's trying to clean up the snapshots for A, B,
// and C, and it could see a "Not Found" when querying C for the
// snapshot.
//
// 2. the vcr_replace request lands before the snapshot request, meaning
// snapshots would be created for the A, B, and D regions. this is a
// problem because D is new (having been allocated to replace C), and
// the upstairs could be performing live repair. taking a snapshot of
// an upstairs during live repair means either:
//
// a. the Upstairs will reject it, causing this saga to unwind ok
//
// b. the Upstairs will allow it, meaning any read-only Upstairs that
// uses the associated snapshots ([A', B', D']) will detect that
// reconciliation is required, not be able to perform reconciliation
// because it is read-only, and panic. note: accepting and
// performing a snapshot during live repair is almost certainly a
// bug in Crucible, not Nexus!
//
// if the upstairs is _not_ performing live repair yet, then the
// snapshot could succeed. This means each of the A, B, and D regions
// will have an associated snapshot, but the D snapshot is of a blank
// region! the same problem as in 2b will occur: a read-only Upstairs that
// uses those snapshots as targets will panic because the data doesn't
// match and it can't perform reconciliation.
//
// - get_allocated_regions will change during the execution of this saga,
// due to region replacement(s) occurring.
//
// With a lock on the volume, the snapshot creation and region replacement
// drive sagas are serialized. Note this does mean that region replacement
// is blocked by a snapshot being created, and snapshot creation is blocked
// by region replacement.

let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
.disk_id(params.disk_id)
.fetch()
.await
.map_err(ActionError::action_failed)?;

osagactx
.datastore()
.volume_repair_lock(&opctx, disk.volume_id, lock_id)
.await
.map_err(ActionError::action_failed)?;

Ok(())
}

async fn ssc_take_volume_lock_undo(
sagactx: NexusActionContext,
) -> anyhow::Result<()> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let lock_id = sagactx.lookup::<Uuid>("lock_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
.disk_id(params.disk_id)
.fetch()
.await?;

osagactx
.datastore()
.volume_repair_unlock(&opctx, disk.volume_id, lock_id)
.await?;

Ok(())
}

async fn ssc_alloc_regions(
sagactx: NexusActionContext,
) -> Result<Vec<(db::model::Dataset, db::model::Region)>, ActionError> {
@@ -1538,6 +1655,33 @@ async fn ssc_finalize_snapshot_record(
Ok(snapshot)
}

async fn ssc_release_volume_lock(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let lock_id = sagactx.lookup::<Uuid>("lock_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
.disk_id(params.disk_id)
.fetch()
.await
.map_err(ActionError::action_failed)?;

osagactx
.datastore()
.volume_repair_unlock(&opctx, disk.volume_id, lock_id)
.await
.map_err(ActionError::action_failed)?;

Ok(())
}

// helper functions

/// Create a Snapshot VolumeConstructionRequest by copying a disk's