From 5831dc67ee5c1d3eeddd82ac97eda23dcf625f02 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Tue, 6 Aug 2024 13:24:02 -0400
Subject: [PATCH] Snapshot creation races with region replacement (#6213)

Prevent the snapshot creation saga from racing with the region
replacement saga by having the snapshot creation saga create a volume
repair record, locking the volume for the duration of the snapshot
creation.

There's a large comment in the new `ssc_take_volume_lock` talking about
the implications of these two sagas racing.
---
 nexus/db-queries/src/db/datastore/mod.rs      |   1 +
 .../src/db/datastore/region_replacement.rs    |  30 ++--
 .../src/db/datastore/volume_repair.rs         |  85 +++++++++++
 .../app/sagas/region_replacement_finish.rs    |   2 +-
 nexus/src/app/sagas/snapshot_create.rs        | 144 ++++++++++++++++++
 5 files changed, 242 insertions(+), 20 deletions(-)
 create mode 100644 nexus/db-queries/src/db/datastore/volume_repair.rs

diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index 2540790477..b6696ee60d 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -103,6 +103,7 @@ mod v2p_mapping;
 mod virtual_provisioning_collection;
 mod vmm;
 mod volume;
+mod volume_repair;
 mod vpc;
 mod zpool;
 
diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs
index 46baf1fd16..d87356e40f 100644
--- a/nexus/db-queries/src/db/datastore/region_replacement.rs
+++ b/nexus/db-queries/src/db/datastore/region_replacement.rs
@@ -16,7 +16,6 @@ use crate::db::model::RegionReplacementState;
 use crate::db::model::RegionReplacementStep;
 use crate::db::model::UpstairsRepairNotification;
 use crate::db::model::UpstairsRepairNotificationType;
-use crate::db::model::VolumeRepair;
 use crate::db::pagination::paginated;
 use crate::db::pagination::Paginator;
 use crate::db::update_and_check::UpdateAndCheck;
@@ -57,13 +56,8 @@ impl DataStore {
             .await?
             .transaction_async(|conn| async move {
                 use db::schema::region_replacement::dsl;
-                use db::schema::volume_repair::dsl as volume_repair_dsl;
 
-                diesel::insert_into(volume_repair_dsl::volume_repair)
-                    .values(VolumeRepair {
-                        volume_id: request.volume_id,
-                        repair_id: request.id,
-                    })
+                Self::volume_repair_insert_query(request.volume_id, request.id)
                     .execute_async(&conn)
                     .await?;
 
@@ -667,7 +661,7 @@ impl DataStore {
     pub async fn set_region_replacement_complete(
         &self,
         opctx: &OpContext,
-        region_replacement_id: Uuid,
+        request: RegionReplacement,
         operating_saga_id: Uuid,
     ) -> Result<(), Error> {
         type TxnError = TransactionError<Error>;
@@ -675,19 +669,17 @@ impl DataStore {
         self.pool_connection_authorized(opctx)
             .await?
             .transaction_async(|conn| async move {
-                use db::schema::volume_repair::dsl as volume_repair_dsl;
-
-                diesel::delete(
-                    volume_repair_dsl::volume_repair
-                        .filter(volume_repair_dsl::repair_id.eq(region_replacement_id))
-                )
-                .execute_async(&conn)
-                .await?;
+                Self::volume_repair_delete_query(
+                    request.volume_id,
+                    request.id,
+                )
+                .execute_async(&conn)
+                .await?;
 
                 use db::schema::region_replacement::dsl;
 
                 let result = diesel::update(dsl::region_replacement)
-                    .filter(dsl::id.eq(region_replacement_id))
+                    .filter(dsl::id.eq(request.id))
                     .filter(
                         dsl::replacement_state.eq(RegionReplacementState::Completing),
                     )
@@ -696,7 +688,7 @@ impl DataStore {
                         dsl::replacement_state.eq(RegionReplacementState::Complete),
                         dsl::operating_saga_id.eq(Option::<Uuid>::None),
                     ))
-                    .check_if_exists::<RegionReplacement>(region_replacement_id)
+                    .check_if_exists::<RegionReplacement>(request.id)
                     .execute_and_check(&conn)
                     .await?;
 
@@ -713,7 +705,7 @@ impl DataStore {
                 } else {
                     Err(TxnError::CustomError(Error::conflict(format!(
                         "region replacement {} set to {:?} (operating saga id {:?})",
-                        region_replacement_id,
+                        request.id,
                         record.replacement_state,
                         record.operating_saga_id,
                     ))))
diff --git a/nexus/db-queries/src/db/datastore/volume_repair.rs b/nexus/db-queries/src/db/datastore/volume_repair.rs
new file mode 100644
index 0000000000..5230e60e3e
--- /dev/null
+++ b/nexus/db-queries/src/db/datastore/volume_repair.rs
@@ -0,0 +1,85 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! [`DataStore`] methods on [`VolumeRepair`]s.
+
+use super::DataStore;
+use crate::db;
+use crate::db::datastore::OpContext;
+use crate::db::datastore::RunnableQuery;
+use crate::db::error::public_error_from_diesel;
+use crate::db::error::ErrorHandler;
+use crate::db::model::VolumeRepair;
+use async_bb8_diesel::AsyncRunQueryDsl;
+use diesel::prelude::*;
+use diesel::result::Error as DieselError;
+use omicron_common::api::external::Error;
+use uuid::Uuid;
+
+impl DataStore {
+    pub(super) fn volume_repair_insert_query(
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> impl RunnableQuery<VolumeRepair> {
+        use db::schema::volume_repair::dsl;
+
+        diesel::insert_into(dsl::volume_repair)
+            .values(VolumeRepair { volume_id, repair_id })
+    }
+
+    pub async fn volume_repair_lock(
+        &self,
+        opctx: &OpContext,
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> Result<(), Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+        Self::volume_repair_insert_query(volume_id, repair_id)
+            .execute_async(&*conn)
+            .await
+            .map(|_| ())
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    pub(super) fn volume_repair_delete_query(
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> impl RunnableQuery<VolumeRepair> {
+        use db::schema::volume_repair::dsl;
+
+        diesel::delete(
+            dsl::volume_repair
+                .filter(dsl::volume_id.eq(volume_id))
+                .filter(dsl::repair_id.eq(repair_id)),
+        )
+    }
+
+    pub async fn volume_repair_unlock(
+        &self,
+        opctx: &OpContext,
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> Result<(), Error> {
+        let conn = self.pool_connection_authorized(opctx).await?;
+        Self::volume_repair_delete_query(volume_id, repair_id)
+            .execute_async(&*conn)
+            .await
+            .map(|_| ())
+            .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
+    }
+
+    pub async fn volume_repair_get(
+        conn: &async_bb8_diesel::Connection<db::DbConnection>,
+        volume_id: Uuid,
+        repair_id: Uuid,
+    ) -> Result<VolumeRepair, DieselError> {
+        use db::schema::volume_repair::dsl;
+
+        dsl::volume_repair
+            .filter(dsl::repair_id.eq(repair_id))
+            .filter(dsl::volume_id.eq(volume_id))
+            .first_async::<VolumeRepair>(conn)
+            .await
+    }
+}
diff --git a/nexus/src/app/sagas/region_replacement_finish.rs b/nexus/src/app/sagas/region_replacement_finish.rs
index e17f3405a0..8d8e75ea91 100644
--- a/nexus/src/app/sagas/region_replacement_finish.rs
+++ b/nexus/src/app/sagas/region_replacement_finish.rs
@@ -196,7 +196,7 @@ async fn srrf_update_request_record(
     // record to 'Complete' and clear the operating saga id. There is no undo
     // step for this, it should succeed idempotently.
     datastore
-        .set_region_replacement_complete(&opctx, params.request.id, saga_id)
+        .set_region_replacement_complete(&opctx, params.request, saga_id)
         .await
         .map_err(ActionError::action_failed)?;
 
diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs
index 5a8313229a..1d6903fa61 100644
--- a/nexus/src/app/sagas/snapshot_create.rs
+++ b/nexus/src/app/sagas/snapshot_create.rs
@@ -137,6 +137,10 @@ pub(crate) struct Params {
 // snapshot create saga: actions
 declare_saga_actions! {
     snapshot_create;
+    TAKE_VOLUME_LOCK -> "volume_lock" {
+        + ssc_take_volume_lock
+        - ssc_take_volume_lock_undo
+    }
     REGIONS_ALLOC -> "datasets_and_regions" {
         + ssc_alloc_regions
         - ssc_alloc_regions_undo
     }
@@ -200,6 +204,9 @@ declare_saga_actions! {
     FINALIZE_SNAPSHOT_RECORD -> "finalized_snapshot" {
         + ssc_finalize_snapshot_record
     }
+    RELEASE_VOLUME_LOCK -> "volume_unlock" {
+        + ssc_release_volume_lock
+    }
 }
 
 // snapshot create saga: definition
@@ -237,6 +244,14 @@ impl NexusSaga for SagaSnapshotCreate {
             ACTION_GENERATE_ID.as_ref(),
         ));
 
+        builder.append(Node::action(
+            "lock_id",
+            "GenerateLockId",
+            ACTION_GENERATE_ID.as_ref(),
+        ));
+
+        builder.append(take_volume_lock_action());
+
         // (DB) Allocate region space for snapshot to store blocks post-scrub
         builder.append(regions_alloc_action());
         // (Sleds) Reaches out to each dataset, and ensures the regions exist
@@ -291,6 +306,8 @@ impl NexusSaga for SagaSnapshotCreate {
             builder.append(detach_disk_from_pantry_action());
         }
 
+        builder.append(release_volume_lock_action());
+
         Ok(builder.build()?)
     }
 }
@@ -301,6 +318,106 @@ async fn ssc_noop(_sagactx: NexusActionContext) -> Result<(), ActionError> {
     Ok(())
 }
 
+async fn ssc_take_volume_lock(
+    sagactx: NexusActionContext,
+) -> Result<(), ActionError> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+    let lock_id = sagactx.lookup::<Uuid>("lock_id")?;
+
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
+
+    // Without a lock on the volume, this saga can race with the region
+    // replacement saga, causing (at least!) two problems:
+    //
+    // - the request that this saga sends to a propolis to create a snapshot
+    //   races with the vcr_replace request that the region replacement drive
+    //   saga sends. Imagining that the region set for a disk's volume is [A, B,
+    //   C] before either saga starts, then the following scenarios can occur:
+    //
+    //   1. the snapshot request lands before the vcr_replace request, meaning
+    //      snapshots are created for the A, B, and C regions, but the region
+    //      set was then modified to [A, B, D]: this means that two of the three
+    //      regions have associated snapshots, and this will cause this saga to
+    //      unwind when it checks that the associated region snapshots were
+    //      created ok. as it's currently written, this saga could also error
+    //      during unwind, as it's trying to clean up the snapshots for A, B,
+    //      and C, and it could see a "Not Found" when querying C for the
+    //      snapshot.
+    //
+    //   2. the vcr_replace request lands before the snapshot request, meaning
+    //      snapshots would be created for the A, B, and D regions. this is a
+    //      problem because D is new (having been allocated to replace C), and
+    //      the upstairs could be performing live repair. taking a snapshot of
+    //      an upstairs during live repair means either:
+    //
+    //      a. the Upstairs will reject it, causing this saga to unwind ok
+    //
+    //      b. the Upstairs will allow it, meaning any read-only Upstairs that
+    //         uses the associated snapshots ([A', B', D']) will detect that
+    //         reconciliation is required, not be able to perform reconciliation
+    //         because it is read-only, and panic. note: accepting and
+    //         performing a snapshot during live repair is almost certainly a
+    //         bug in Crucible, not Nexus!
+    //
+    //      if the upstairs is _not_ performing live repair yet, then the
+    //      snapshot could succeed. This means each of the A, B, and D regions
+    //      will have an associated snapshot, but the D snapshot is of a blank
+    //      region! the same problem will occur as 2b: a read-only Upstairs that
+    //      uses those snapshots as targets will panic because the data doesn't
+    //      match and it can't perform reconciliation.
+    //
+    // - get_allocated_regions will change during the execution of this saga,
+    //   due to region replacement(s) occurring.
+    //
+    // With a lock on the volume, the snapshot creation and region replacement
+    // drive sagas are serialized. Note this does mean that region replacement
+    // is blocked by a snapshot being created, and snapshot creation is blocked
+    // by region replacement.
+
+    let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
+        .disk_id(params.disk_id)
+        .fetch()
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    osagactx
+        .datastore()
+        .volume_repair_lock(&opctx, disk.volume_id, lock_id)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(())
+}
+
+async fn ssc_take_volume_lock_undo(
+    sagactx: NexusActionContext,
+) -> anyhow::Result<()> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+    let lock_id = sagactx.lookup::<Uuid>("lock_id")?;
+
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
+
+    let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
+        .disk_id(params.disk_id)
+        .fetch()
+        .await?;
+
+    osagactx
+        .datastore()
+        .volume_repair_unlock(&opctx, disk.volume_id, lock_id)
+        .await?;
+
+    Ok(())
+}
+
 async fn ssc_alloc_regions(
     sagactx: NexusActionContext,
 ) -> Result<Vec<(db::model::Dataset, db::model::Region)>, ActionError> {
@@ -1538,6 +1655,33 @@ async fn ssc_finalize_snapshot_record(
     Ok(snapshot)
 }
 
+async fn ssc_release_volume_lock(
+    sagactx: NexusActionContext,
+) -> Result<(), ActionError> {
+    let osagactx = sagactx.user_data();
+    let params = sagactx.saga_params::<Params>()?;
+    let lock_id = sagactx.lookup::<Uuid>("lock_id")?;
+
+    let opctx = crate::context::op_context_for_saga_action(
+        &sagactx,
+        &params.serialized_authn,
+    );
+
+    let (.., disk) = LookupPath::new(&opctx, &osagactx.datastore())
+        .disk_id(params.disk_id)
+        .fetch()
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    osagactx
+        .datastore()
+        .volume_repair_unlock(&opctx, disk.volume_id, lock_id)
+        .await
+        .map_err(ActionError::action_failed)?;
+
+    Ok(())
+}
+
 // helper functions
 
 /// Create a Snapshot VolumeConstructionRequest by copying a disk's
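
How the repair record acts as a lock: this approach relies on the volume_repair
table admitting at most one row per volume, so the second of two concurrent
lock attempts fails at insert time instead of both succeeding. That constraint
is an assumption inferred from the usage above, not something visible in these
hunks. The following standalone Rust sketch models that behaviour with an
in-memory map standing in for the real Diesel/CockroachDB table; the
VolumeRepairTable type and its lock/unlock methods are illustrative stand-ins
for the volume_repair_lock / volume_repair_unlock datastore methods added by
the patch.

    // Minimal model of "insert a row keyed by volume_id to lock the volume".
    use std::collections::HashMap;
    use std::sync::Mutex;

    // Stand-in for uuid::Uuid so the sketch has no external dependencies.
    type Uuid = u128;

    #[derive(Debug)]
    struct VolumeLockedError;

    #[derive(Default)]
    struct VolumeRepairTable {
        // volume_id -> repair_id; at most one repair record per volume.
        rows: Mutex<HashMap<Uuid, Uuid>>,
    }

    impl VolumeRepairTable {
        // Analogue of volume_repair_lock: insert a record, failing if the
        // volume already has one.
        fn lock(&self, volume_id: Uuid, repair_id: Uuid) -> Result<(), VolumeLockedError> {
            let mut rows = self.rows.lock().unwrap();
            if rows.contains_key(&volume_id) {
                return Err(VolumeLockedError);
            }
            rows.insert(volume_id, repair_id);
            Ok(())
        }

        // Analogue of volume_repair_unlock: delete the record for this
        // (volume_id, repair_id) pair.
        fn unlock(&self, volume_id: Uuid, repair_id: Uuid) {
            let mut rows = self.rows.lock().unwrap();
            if rows.get(&volume_id) == Some(&repair_id) {
                rows.remove(&volume_id);
            }
        }
    }

    fn main() {
        let table = VolumeRepairTable::default();
        let volume = 1;

        // The snapshot creation saga takes the lock first...
        table.lock(volume, 100).expect("snapshot saga locks the volume");

        // ...so the region replacement saga's attempt fails instead of both
        // sagas operating on the same volume at once.
        assert!(table.lock(volume, 200).is_err());

        // Once the snapshot saga releases the lock, replacement can proceed.
        table.unlock(volume, 100);
        table.lock(volume, 200).expect("replacement saga locks after release");
        println!("lock/unlock sequence behaved as expected");
    }

Whichever saga loses the race sees an insert error and unwinds (or retries
later) rather than both proceeding against the same volume, which is the
serialization the commit message describes.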