diff --git a/nexus/db-model/src/region_replacement.rs b/nexus/db-model/src/region_replacement.rs
index a04710f53d..9ae64d6d38 100644
--- a/nexus/db-model/src/region_replacement.rs
+++ b/nexus/db-model/src/region_replacement.rs
@@ -111,6 +111,10 @@ impl std::str::FromStr for RegionReplacementState {
 /// modification was committed to the database and will not change or be
 /// unwound.
 ///
+/// It's also possible to transition from Running to ReplacementDone if a
+/// "finish" notification is seen by the region replacement drive background
+/// task. This check is done before invoking the region replacement drive saga.
+///
 /// See also: RegionReplacementStep records
 #[derive(
     Queryable,
diff --git a/nexus/db-queries/src/db/datastore/region_replacement.rs b/nexus/db-queries/src/db/datastore/region_replacement.rs
index 56e73d2b2c..46baf1fd16 100644
--- a/nexus/db-queries/src/db/datastore/region_replacement.rs
+++ b/nexus/db-queries/src/db/datastore/region_replacement.rs
@@ -732,9 +732,18 @@ impl DataStore {
     }
 
     /// Nexus has been notified by an Upstairs (or has otherwised determined)
-    /// that a region replacement is done, so update the record. This may arrive
-    /// in the middle of a drive saga invocation, so do not filter on state or
-    /// operating saga id!
+    /// that a region replacement is done, so update the record. Filter on the
+    /// following:
+    ///
+    /// - operating saga id being None, as this happens outside of a saga and
+    /// should only transition the record if there isn't currently a lock.
+    ///
+    /// - the record being in the state "Running": this function is called when
+    /// a "finish" notification is seen, and that only happens after a region
+    /// replacement drive saga has invoked either a reconciliation or live
+    /// repair, and that has finished. The region replacement drive background
+    /// task will scan for these notifications and call this function if one
+    /// is seen.
     pub async fn mark_region_replacement_as_done(
         &self,
         opctx: &OpContext,
@@ -743,11 +752,12 @@ impl DataStore {
         use db::schema::region_replacement::dsl;
         let updated = diesel::update(dsl::region_replacement)
             .filter(dsl::id.eq(region_replacement_id))
-            .set((
+            .filter(dsl::operating_saga_id.is_null())
+            .filter(dsl::replacement_state.eq(RegionReplacementState::Running))
+            .set(
                 dsl::replacement_state
                     .eq(RegionReplacementState::ReplacementDone),
-                dsl::operating_saga_id.eq(Option::<Uuid>::None),
-            ))
+            )
             .check_if_exists::<RegionReplacement>(region_replacement_id)
             .execute_and_check(&*self.pool_connection_authorized(opctx).await?)
             .await;
@@ -858,7 +868,7 @@ mod test {
     async fn test_replacement_done_in_middle_of_drive_saga() {
         // If Nexus receives a notification that a repair has finished in the
         // middle of a drive saga, then make sure the replacement request state
-        // ends up as `ReplacementDone`.
+        // eventually ends up as `ReplacementDone`.
 
         let logctx = dev::test_setup_log(
             "test_replacement_done_in_middle_of_drive_saga",
@@ -880,7 +890,7 @@ mod test {
             .await
             .unwrap();
 
-        // Transition to Driving
+        // The drive saga will transition the record to Driving, locking it.
 
         let saga_id = Uuid::new_v4();
 
@@ -890,7 +900,39 @@ mod test {
             .unwrap();
 
         // Now, Nexus receives a notification that the repair has finished
-        // successfully
+        // successfully. A background task trying to mark the replacement as
+        // done should fail, as the record was locked by the saga.
+
+        datastore
+            .mark_region_replacement_as_done(&opctx, request.id)
+            .await
+            .unwrap_err();
+
+        // Ensure that the state is still Driving, and the operating saga id is
+        // set.
+
+        let actual_request = datastore
+            .get_region_replacement_request_by_id(&opctx, request.id)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            actual_request.replacement_state,
+            RegionReplacementState::Driving
+        );
+        assert_eq!(actual_request.operating_saga_id, Some(saga_id));
+
+        // The drive saga will finish, but won't transition the record to
+        // ReplacementDone because it didn't detect that one of the repair
+        // operations had finished ok.
+
+        datastore
+            .undo_set_region_replacement_driving(&opctx, request.id, saga_id)
+            .await
+            .unwrap();
+
+        // Now the region replacement drive background task wakes up again, and
+        // this time marks the record as replacement done successfully.
 
         datastore
             .mark_region_replacement_as_done(&opctx, request.id)
@@ -911,14 +953,73 @@ mod test {
         );
         assert_eq!(actual_request.operating_saga_id, None);
 
-        // The Drive saga will unwind when it tries to set the state back to
-        // Running.
+        db.cleanup().await.unwrap();
+        logctx.cleanup_successful();
+    }
+
+    #[tokio::test]
+    async fn test_replacement_done_in_middle_of_finish_saga() {
+        // If multiple Nexus instances are racing, don't let one mark a record
+        // as "ReplacementDone" if it's in the middle of the finish saga.
+
+        let logctx = dev::test_setup_log(
+            "test_replacement_done_in_middle_of_finish_saga",
+        );
+        let mut db = test_setup_database(&logctx.log).await;
+        let (opctx, datastore) = datastore_test(&logctx, &db).await;
+
+        let region_id = Uuid::new_v4();
+        let volume_id = Uuid::new_v4();
+
+        let request = {
+            let mut request = RegionReplacement::new(region_id, volume_id);
+            request.replacement_state = RegionReplacementState::ReplacementDone;
+            request
+        };
 
         datastore
-            .undo_set_region_replacement_driving(&opctx, request.id, saga_id)
+            .insert_region_replacement_request(&opctx, request.clone())
+            .await
+            .unwrap();
+
+        // The finish saga will transition the record to Completing, setting
+        // the operating saga id accordingly.
+
+        let saga_id = Uuid::new_v4();
+
+        datastore
+            .set_region_replacement_completing(&opctx, request.id, saga_id)
+            .await
+            .unwrap();
+
+        // Double check that another saga can't do this, because the first saga
+        // took the lock.
+
+        datastore
+            .set_region_replacement_completing(
+                &opctx,
+                request.id,
+                Uuid::new_v4(),
+            )
+            .await
+            .unwrap_err();
+
+        // mark_region_replacement_as_done is called due to a finish
+        // notification scan by the region replacement drive background task.
+        // This should fail as the saga took the lock on this record.
+
+        datastore
+            .mark_region_replacement_as_done(&opctx, request.id)
             .await
             .unwrap_err();
 
+        // The first saga then finishes and sets the record to Complete.
+
+        datastore
+            .set_region_replacement_complete(&opctx, request.id, saga_id)
+            .await
+            .unwrap();
+
         db.cleanup().await.unwrap();
         logctx.cleanup_successful();
     }
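The heart of this change is that mark_region_replacement_as_done now acts as a guarded compare-and-swap: the UPDATE only applies while operating_saga_id is NULL and the record is still Running, so a drive or finish saga that holds the lock simply causes the background task's attempt to return an error, and the task retries on a later activation. Below is a minimal, self-contained sketch of that guard in plain Rust. The State, RecordSketch, and mark_as_done names are illustrative stand-ins, not omicron types; the real code expresses the same condition as the diesel filters above, checked via check_if_exists / execute_and_check.

    /// Illustrative stand-ins for the real model types (not omicron code).
    #[derive(Debug, PartialEq)]
    enum State {
        Running,
        Driving,
        ReplacementDone,
    }

    struct RecordSketch {
        replacement_state: State,
        /// The real column is a nullable UUID; a plain integer stands in here.
        operating_saga_id: Option<u64>,
    }

    /// Mirrors the new WHERE clause: only move Running -> ReplacementDone when
    /// no saga holds the lock. Returns false otherwise, which callers treat as
    /// "not updated, retry on a later activation".
    fn mark_as_done(record: &mut RecordSketch) -> bool {
        if record.operating_saga_id.is_none()
            && record.replacement_state == State::Running
        {
            record.replacement_state = State::ReplacementDone;
            true
        } else {
            false
        }
    }

    fn main() {
        // Locked by a drive saga: the transition is refused, matching the
        // `unwrap_err()` expectations in the tests above.
        let mut locked = RecordSketch {
            replacement_state: State::Driving,
            operating_saga_id: Some(1),
        };
        assert!(!mark_as_done(&mut locked));
        assert_eq!(locked.replacement_state, State::Driving);

        // Unlocked and Running: the transition applies.
        let mut unlocked = RecordSketch {
            replacement_state: State::Running,
            operating_saga_id: None,
        };
        assert!(mark_as_done(&mut unlocked));
        assert_eq!(unlocked.replacement_state, State::ReplacementDone);
    }

Because the update is now conditional, this function no longer clears operating_saga_id itself (the removed dsl::operating_saga_id.eq(Option::<Uuid>::None) line): only the saga that took the lock releases it, and the background task just calls the function again once the lock is gone, which is the sequence the updated tests walk through.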