Snapshot creation races with region replacement
Prevent the snapshot creation saga from racing with the region
replacement saga by having the snapshot creation saga create a volume
repair record, locking the volume for the duration of the snapshot
creation.

There's a large comment in the new `ssc_take_volume_lock` describing
the implications of these two sagas racing.
jmpesp committed Aug 2, 2024
1 parent 0224eb8 commit fcaf48b
Showing 5 changed files with 242 additions and 20 deletions.
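
The lock itself is just a row in the `volume_repair` table: the region replacement request path already inserts one inside its transaction, and this commit makes the snapshot creation saga take the same kind of record before touching the volume. The sketch below is a minimal, in-memory model of the assumed semantics, namely that the table admits at most one repair record per volume (presumably enforced by a constraint on `volume_id`), so whichever saga inserts its record first holds the lock and any other saga gets a conflict until the row is deleted. The `VolumeRepairTable` type and its `lock`/`unlock` helpers are illustrative stand-ins, not the Diesel-backed datastore methods added in this commit.

```rust
use std::collections::HashMap;

// Stand-in for uuid::Uuid so the sketch is self-contained.
type Uuid = u128;

#[derive(Debug)]
enum LockError {
    // Another repair (snapshot creation or region replacement) already
    // holds the lock on this volume.
    Conflict { held_by: Uuid },
}

#[derive(Default)]
struct VolumeRepairTable {
    // volume_id -> repair_id; one row per volume, mirroring the assumption
    // that the real table rejects a second record for the same volume.
    rows: HashMap<Uuid, Uuid>,
}

impl VolumeRepairTable {
    // Taking the lock is inserting the row; an existing row means some
    // other saga holds the lock.
    fn lock(&mut self, volume_id: Uuid, repair_id: Uuid) -> Result<(), LockError> {
        match self.rows.get(&volume_id) {
            Some(&held_by) => Err(LockError::Conflict { held_by }),
            None => {
                self.rows.insert(volume_id, repair_id);
                Ok(())
            }
        }
    }

    // Releasing the lock deletes by (volume_id, repair_id), so a saga only
    // releases a lock it actually took, and unlock is idempotent.
    fn unlock(&mut self, volume_id: Uuid, repair_id: Uuid) {
        if self.rows.get(&volume_id) == Some(&repair_id) {
            self.rows.remove(&volume_id);
        }
    }
}

fn main() {
    let mut table = VolumeRepairTable::default();
    let volume_id = 1;
    let snapshot_lock_id = 100;
    let replacement_lock_id = 200;

    // Snapshot creation takes the lock first...
    assert!(table.lock(volume_id, snapshot_lock_id).is_ok());

    // ...so region replacement conflicts and cannot proceed on this volume.
    if let Err(LockError::Conflict { held_by }) =
        table.lock(volume_id, replacement_lock_id)
    {
        println!("volume {volume_id} is locked by repair {held_by}");
    }

    // Once the snapshot saga releases the lock, replacement can take it.
    table.unlock(volume_id, snapshot_lock_id);
    assert!(table.lock(volume_id, replacement_lock_id).is_ok());
}
```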
1 change: 1 addition & 0 deletions nexus/db-queries/src/db/datastore/mod.rs
@@ -103,6 +103,7 @@ mod v2p_mapping;
mod virtual_provisioning_collection;
mod vmm;
mod volume;
mod volume_repair;
mod vpc;
mod zpool;

30 changes: 11 additions & 19 deletions nexus/db-queries/src/db/datastore/region_replacement.rs
@@ -16,7 +16,6 @@ use crate::db::model::RegionReplacementState;
use crate::db::model::RegionReplacementStep;
use crate::db::model::UpstairsRepairNotification;
use crate::db::model::UpstairsRepairNotificationType;
use crate::db::model::VolumeRepair;
use crate::db::pagination::paginated;
use crate::db::pagination::Paginator;
use crate::db::update_and_check::UpdateAndCheck;
@@ -57,13 +56,8 @@ impl DataStore {
.await?
.transaction_async(|conn| async move {
use db::schema::region_replacement::dsl;
use db::schema::volume_repair::dsl as volume_repair_dsl;

diesel::insert_into(volume_repair_dsl::volume_repair)
.values(VolumeRepair {
volume_id: request.volume_id,
repair_id: request.id,
})
Self::volume_repair_insert_query(request.volume_id, request.id)
.execute_async(&conn)
.await?;

@@ -667,27 +661,25 @@ impl DataStore {
pub async fn set_region_replacement_complete(
&self,
opctx: &OpContext,
region_replacement_id: Uuid,
request: RegionReplacement,
operating_saga_id: Uuid,
) -> Result<(), Error> {
type TxnError = TransactionError<Error>;

self.pool_connection_authorized(opctx)
.await?
.transaction_async(|conn| async move {
use db::schema::volume_repair::dsl as volume_repair_dsl;

diesel::delete(
volume_repair_dsl::volume_repair
.filter(volume_repair_dsl::repair_id.eq(region_replacement_id))
)
.execute_async(&conn)
.await?;
Self::volume_repair_delete_query(
request.volume_id,
request.id,
)
.execute_async(&conn)
.await?;

use db::schema::region_replacement::dsl;

let result = diesel::update(dsl::region_replacement)
.filter(dsl::id.eq(region_replacement_id))
.filter(dsl::id.eq(request.id))
.filter(
dsl::replacement_state.eq(RegionReplacementState::Completing),
)
@@ -696,7 +688,7 @@
dsl::replacement_state.eq(RegionReplacementState::Complete),
dsl::operating_saga_id.eq(Option::<Uuid>::None),
))
.check_if_exists::<RegionReplacement>(region_replacement_id)
.check_if_exists::<RegionReplacement>(request.id)
.execute_and_check(&conn)
.await?;

@@ -713,7 +705,7 @@
} else {
Err(TxnError::CustomError(Error::conflict(format!(
"region replacement {} set to {:?} (operating saga id {:?})",
region_replacement_id,
request.id,
record.replacement_state,
record.operating_saga_id,
))))
85 changes: 85 additions & 0 deletions nexus/db-queries/src/db/datastore/volume_repair.rs
@@ -0,0 +1,85 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! [`DataStore`] methods on [`VolumeRepair`]s.

use super::DataStore;
use crate::db;
use crate::db::datastore::OpContext;
use crate::db::datastore::RunnableQuery;
use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::model::VolumeRepair;
use async_bb8_diesel::AsyncRunQueryDsl;
use diesel::prelude::*;
use diesel::result::Error as DieselError;
use omicron_common::api::external::Error;
use uuid::Uuid;

impl DataStore {
pub(super) fn volume_repair_insert_query(
volume_id: Uuid,
repair_id: Uuid,
) -> impl RunnableQuery<VolumeRepair> {
use db::schema::volume_repair::dsl;

diesel::insert_into(dsl::volume_repair)
.values(VolumeRepair { volume_id, repair_id })
}

pub async fn volume_repair_lock(
&self,
opctx: &OpContext,
volume_id: Uuid,
repair_id: Uuid,
) -> Result<(), Error> {
let conn = self.pool_connection_authorized(opctx).await?;
Self::volume_repair_insert_query(volume_id, repair_id)
.execute_async(&*conn)
.await
.map(|_| ())
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

pub(super) fn volume_repair_delete_query(
volume_id: Uuid,
repair_id: Uuid,
) -> impl RunnableQuery<VolumeRepair> {
use db::schema::volume_repair::dsl;

diesel::delete(
dsl::volume_repair
.filter(dsl::volume_id.eq(volume_id))
.filter(dsl::repair_id.eq(repair_id)),
)
}

pub async fn volume_repair_unlock(
&self,
opctx: &OpContext,
volume_id: Uuid,
repair_id: Uuid,
) -> Result<(), Error> {
let conn = self.pool_connection_authorized(opctx).await?;
Self::volume_repair_delete_query(volume_id, repair_id)
.execute_async(&*conn)
.await
.map(|_| ())
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

pub async fn volume_repair_get(
conn: &async_bb8_diesel::Connection<db::DbConnection>,
volume_id: Uuid,
repair_id: Uuid,
) -> Result<VolumeRepair, DieselError> {
use db::schema::volume_repair::dsl;

dsl::volume_repair
.filter(dsl::repair_id.eq(repair_id))
.filter(dsl::volume_id.eq(volume_id))
.first_async::<VolumeRepair>(conn)
.await
}
}
2 changes: 1 addition & 1 deletion nexus/src/app/sagas/region_replacement_finish.rs
@@ -196,7 +196,7 @@ async fn srrf_update_request_record(
// record to 'Complete' and clear the operating saga id. There is no undo
// step for this, it should succeed idempotently.
datastore
.set_region_replacement_complete(&opctx, params.request.id, saga_id)
.set_region_replacement_complete(&opctx, params.request, saga_id)
.await
.map_err(ActionError::action_failed)?;

144 changes: 144 additions & 0 deletions nexus/src/app/sagas/snapshot_create.rs
@@ -137,6 +137,10 @@ pub(crate) struct Params {
// snapshot create saga: actions
declare_saga_actions! {
snapshot_create;
TAKE_VOLUME_LOCK -> "volume_lock" {
+ ssc_take_volume_lock
- ssc_take_volume_lock_undo
}
REGIONS_ALLOC -> "datasets_and_regions" {
+ ssc_alloc_regions
- ssc_alloc_regions_undo
@@ -200,6 +204,9 @@ declare_saga_actions! {
FINALIZE_SNAPSHOT_RECORD -> "finalized_snapshot" {
+ ssc_finalize_snapshot_record
}
RELEASE_VOLUME_LOCK -> "volume_unlock" {
+ ssc_release_volume_lock
}
}

// snapshot create saga: definition
@@ -237,6 +244,14 @@ impl NexusSaga for SagaSnapshotCreate {
ACTION_GENERATE_ID.as_ref(),
));

builder.append(Node::action(
"lock_id",
"GenerateLockId",
ACTION_GENERATE_ID.as_ref(),
));

builder.append(take_volume_lock_action());

// (DB) Allocate region space for snapshot to store blocks post-scrub
builder.append(regions_alloc_action());
// (Sleds) Reaches out to each dataset, and ensures the regions exist
@@ -291,6 +306,8 @@ impl NexusSaga for SagaSnapshotCreate {
builder.append(detach_disk_from_pantry_action());
}

builder.append(release_volume_lock_action());

Ok(builder.build()?)
}
}
@@ -301,6 +318,106 @@ async fn ssc_noop(_sagactx: NexusActionContext) -> Result<(), ActionError> {
Ok(())
}

async fn ssc_take_volume_lock(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let lock_id = sagactx.lookup::<Uuid>("lock_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

// Without a lock on the volume, this saga can race with the region
// replacement saga, causing (at least!) two problems:
//
// - the request that this saga sends to a propolis to create a snapshot
// races with the vcr_replace request that the region replacement drive
// saga sends. Imagining that the region set for a disk's volume is [A, B,
// C] before either saga starts, then the following scenarios can occur:
//
// 1. the snapshot request lands before the vcr_replace request, meaning
// snapshots are created for the A, B, and C regions, but the region
// set was then modified to [A, B, D]: this means that two of the three
// regions have associated snapshots, and this will cause this saga to
// unwind when it checks that the associated region snapshots were
// created ok. as it's currently written, this saga could also error
// during unwind, as it's trying to clean up the snapshots for A, B,
// and C, and it could see a "Not Found" when querying C for the
// snapshot.
//
// 2. the vcr_replace request lands before the snapshot request, meaning
// snapshots would be created for the A, B, and D regions. this is a
// problem because D is new (having been allocated to replace C), and
// the upstairs could be performing live repair. taking a snapshot of
// an upstairs during live repair means either:
//
// a. the Upstairs will reject it, causing this saga to unwind ok
//
// b. the Upstairs will allow it, meaning any read-only Upstairs that
// uses the associated snapshots ([A', B', D']) will detect that
// reconciliation is required, not be able to perform reconciliation
// because it is read-only, and panic. note: accepting and
// performing a snapshot during live repair is almost certainly a
// bug in Crucible, not Nexus!
//
// if the upstairs is _not_ performing live repair yet, then the
// snapshot could succeed. This means each of the A, B, and D regions
// will have an associated snapshot, but the D snapshot is of a blank
// region! the same problem will occur as 2b: a read-only Upstairs that
// uses those snapshots as targets will panic because the data doesn't
// match and it can't perform reconciliation.
//
// - get_allocated_regions will change during the execution of this saga,
// due to region replacement(s) occurring.
//
// With a lock on the volume, the snapshot creation and region replacement
// drive sagas are serialized. Note this does mean that region replacement
// is blocked by a snapshot being created, and snapshot creation is blocked
// by region replacement.

let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
.disk_id(params.disk_id)
.fetch()
.await
.map_err(ActionError::action_failed)?;

osagactx
.datastore()
.volume_repair_lock(&opctx, disk.volume_id, lock_id)
.await
.map_err(ActionError::action_failed)?;

Ok(())
}

async fn ssc_take_volume_lock_undo(
sagactx: NexusActionContext,
) -> anyhow::Result<()> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let lock_id = sagactx.lookup::<Uuid>("lock_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
.disk_id(params.disk_id)
.fetch()
.await?;

osagactx
.datastore()
.volume_repair_unlock(&opctx, disk.volume_id, lock_id)
.await?;

Ok(())
}

async fn ssc_alloc_regions(
sagactx: NexusActionContext,
) -> Result<Vec<(db::model::Dataset, db::model::Region)>, ActionError> {
@@ -1538,6 +1655,33 @@ async fn ssc_finalize_snapshot_record(
Ok(snapshot)
}

async fn ssc_release_volume_lock(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let lock_id = sagactx.lookup::<Uuid>("lock_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
.disk_id(params.disk_id)
.fetch()
.await
.map_err(ActionError::action_failed)?;

osagactx
.datastore()
.volume_repair_unlock(&opctx, disk.volume_id, lock_id)
.await
.map_err(ActionError::action_failed)?;

Ok(())
}

// helper functions

/// Create a Snapshot VolumeConstructionRequest by copying a disk's
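
One detail worth calling out in the saga wiring above: `take_volume_lock` is appended before region allocation and carries an undo action that deletes the repair record, while `release_volume_lock` is appended last on the success path (and needs no undo), so the lock is dropped whether the saga completes or unwinds partway through. The sketch below is a simplified model of that behavior, not the steno crate's actual API: forward actions run in order, and when one fails, the undo actions of every previously completed node run in reverse.

```rust
// Simplified saga executor: run forward actions in order; on failure, run
// the undo actions of already-completed nodes in reverse order.
struct Action {
    name: &'static str,
    forward: fn() -> Result<(), String>,
    undo: Option<fn()>,
}

fn run_saga(actions: &[Action]) -> Result<(), String> {
    let mut completed: Vec<&Action> = Vec::new();
    for action in actions {
        println!("running {}", action.name);
        match (action.forward)() {
            Ok(()) => completed.push(action),
            Err(e) => {
                // Unwind: undo completed nodes, most recent first.
                for done in completed.iter().rev() {
                    if let Some(undo) = done.undo {
                        println!("undoing {}", done.name);
                        undo();
                    }
                }
                return Err(e);
            }
        }
    }
    Ok(())
}

fn main() {
    let actions = [
        Action {
            name: "take_volume_lock",
            forward: || {
                println!("  volume_repair row inserted (lock held)");
                Ok(())
            },
            undo: Some(|| println!("  volume_repair row deleted (lock released)")),
        },
        Action {
            name: "regions_alloc",
            // Simulate a mid-saga failure, e.g. region allocation failing.
            forward: || Err("regions_alloc failed".to_string()),
            undo: None,
        },
        Action {
            name: "release_volume_lock",
            forward: || {
                println!("  volume_repair row deleted (lock released)");
                Ok(())
            },
            undo: None,
        },
    ];

    // The failure triggers the unwind path, which still releases the lock.
    assert!(run_saga(&actions).is_err());
}
```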
