[#5333 5/6] Region snapshot replacement step #6350

Merged · 8 commits · Aug 26, 2024
Changes from 2 commits
41 changes: 41 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -36,6 +36,7 @@ use nexus_types::internal_api::background::LookupRegionPortStatus;
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus;
use nexus_types::inventory::BaseboardId;
use omicron_uuid_kinds::CollectionUuid;
use omicron_uuid_kinds::DemoSagaUuid;
@@ -1504,6 +1505,46 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
println!(" > {line}");
}

println!(" errors: {}", status.errors.len());
for line in &status.errors {
println!(" > {line}");
}
}
}
} else if name == "region_snapshot_replacement_step" {
match serde_json::from_value::<RegionSnapshotReplacementStepStatus>(
details.clone(),
) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),

Ok(status) => {
println!(
" total step records created ok: {}",
status.step_records_created_ok.len(),
);
for line in &status.step_records_created_ok {
println!(" > {line}");
}

println!(
" total step garbage collect saga invoked ok: {}",
status.step_garbage_collect_invoked_ok.len(),
);
for line in &status.step_garbage_collect_invoked_ok {
println!(" > {line}");
}

println!(
" total step saga invoked ok: {}",
status.step_invoked_ok.len(),
);
for line in &status.step_invoked_ok {
println!(" > {line}");
}

println!(" errors: {}", status.errors.len());
for line in &status.errors {
println!(" > {line}");
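
For reference, a minimal sketch of what the `RegionSnapshotReplacementStepStatus` type consumed above might look like; the field names and `Vec<String>` types are inferred from the printing code and are not copied from the real definition in nexus-types:

```rust
// Sketch only: inferred from the omdb printing code above, not the real
// definition in nexus_types::internal_api::background.
use serde::{Deserialize, Serialize};

#[derive(Debug, Default, Serialize, Deserialize)]
pub struct RegionSnapshotReplacementStepStatus {
    /// step records created during this activation
    pub step_records_created_ok: Vec<String>,
    /// step records for which the garbage collect saga was started
    pub step_garbage_collect_invoked_ok: Vec<String>,
    /// step records for which the step saga was started
    pub step_invoked_ok: Vec<String>,
    /// errors encountered while doing the above
    pub errors: Vec<String>,
}
```
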
15 changes: 15 additions & 0 deletions dev-tools/omdb/tests/env.out
@@ -135,6 +135,11 @@ task: "region_snapshot_replacement_start"
detect if region snapshots need replacement and begin the process


task: "region_snapshot_replacement_step"
detect what volumes were affected by a region snapshot replacement, and run
the step saga for them


task: "saga_recovery"
recovers sagas assigned to this Nexus

@@ -292,6 +297,11 @@ task: "region_snapshot_replacement_start"
detect if region snapshots need replacement and begin the process


task: "region_snapshot_replacement_step"
detect what volumes were affected by a region snapshot replacement, and run
the step saga for them


task: "saga_recovery"
recovers sagas assigned to this Nexus

@@ -436,6 +446,11 @@ task: "region_snapshot_replacement_start"
detect if region snapshots need replacement and begin the process


task: "region_snapshot_replacement_step"
detect what volumes were affected by a region snapshot replacement, and run
the step saga for them


task: "saga_recovery"
recovers sagas assigned to this Nexus

25 changes: 25 additions & 0 deletions dev-tools/omdb/tests/successes.out
@@ -336,6 +336,11 @@ task: "region_snapshot_replacement_start"
detect if region snapshots need replacement and begin the process


task: "region_snapshot_replacement_step"
detect what volumes were affected by a region snapshot replacement, and run
the step saga for them


task: "saga_recovery"
recovers sagas assigned to this Nexus

@@ -589,6 +594,16 @@ task: "region_snapshot_replacement_start"
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
warning: unknown background task: "region_snapshot_replacement_start" (don't know how to interpret details: Object {"errors": Array [], "requests_created_ok": Array [], "start_invoked_ok": Array []})

task: "region_snapshot_replacement_step"
configured period: every <REDACTED_DURATION>s
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total step records created ok: 0
total step garbage collect saga invoked ok: 0
total step saga invoked ok: 0
errors: 0

task: "saga_recovery"
configured period: every 10m
currently executing: no
@@ -995,6 +1010,16 @@ task: "region_snapshot_replacement_start"
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
warning: unknown background task: "region_snapshot_replacement_start" (don't know how to interpret details: Object {"errors": Array [], "requests_created_ok": Array [], "start_invoked_ok": Array []})

task: "region_snapshot_replacement_step"
configured period: every <REDACTED_DURATION>s
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total step records created ok: 0
total step garbage collect saga invoked ok: 0
total step saga invoked ok: 0
errors: 0

task: "saga_recovery"
configured period: every 10m
currently executing: no
16 changes: 16 additions & 0 deletions nexus-config/src/nexus_config.rs
@@ -396,6 +396,8 @@ pub struct BackgroundTaskConfig {
/// configuration for region snapshot replacement garbage collection
pub region_snapshot_replacement_garbage_collection:
RegionSnapshotReplacementGarbageCollectionConfig,
/// configuration for region snapshot replacement step task
pub region_snapshot_replacement_step: RegionSnapshotReplacementStepConfig,
}

#[serde_as]
@@ -648,6 +650,14 @@ pub struct RegionSnapshotReplacementGarbageCollectionConfig {
pub period_secs: Duration,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct RegionSnapshotReplacementStepConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
@@ -897,6 +907,7 @@ mod test {
lookup_region_port.period_secs = 60
region_snapshot_replacement_start.period_secs = 30
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30
[default_region_allocation_strategy]
type = "random"
seed = 0
@@ -1067,6 +1078,10 @@ mod test {
RegionSnapshotReplacementGarbageCollectionConfig {
period_secs: Duration::from_secs(30),
},
region_snapshot_replacement_step:
RegionSnapshotReplacementStepConfig {
period_secs: Duration::from_secs(30),
},
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -1145,6 +1160,7 @@ mod test {
lookup_region_port.period_secs = 60
region_snapshot_replacement_start.period_secs = 30
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30
[default_region_allocation_strategy]
type = "random"
"##,
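
As an aside on the `DurationSeconds<u64>` attribute used above, here is a small standalone sketch (assuming the `serde_with` and `toml` crates) of how a plain `period_secs = 30` in the TOML deserializes into a 30-second `Duration`:

```rust
// Standalone sketch, not code from this PR: shows how serde_with maps a
// plain integer in TOML onto std::time::Duration, as the config struct
// above does for region_snapshot_replacement_step.period_secs.
use serde::Deserialize;
use serde_with::{serde_as, DurationSeconds};
use std::time::Duration;

#[serde_as]
#[derive(Debug, Deserialize, PartialEq)]
struct StepConfig {
    #[serde_as(as = "DurationSeconds<u64>")]
    period_secs: Duration,
}

fn main() {
    let config: StepConfig =
        toml::from_str("period_secs = 30").expect("valid TOML");
    assert_eq!(config.period_secs, Duration::from_secs(30));
}
```
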
1 change: 1 addition & 0 deletions nexus/examples/config-second.toml
@@ -141,6 +141,7 @@ saga_recovery.period_secs = 600
lookup_region_port.period_secs = 60
region_snapshot_replacement_start.period_secs = 30
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30

[default_region_allocation_strategy]
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.
1 change: 1 addition & 0 deletions nexus/examples/config.toml
@@ -127,6 +127,7 @@ saga_recovery.period_secs = 600
lookup_region_port.period_secs = 60
region_snapshot_replacement_start.period_secs = 30
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30
Contributor:

Could region_snapshot_replacement_start, region_snapshot_replacement_garbage_collection, and region_snapshot_replacement_step all start at the same time? Like, do we need to consider thundering-herd type problems here and stagger these from each other a little? Or will the trigger to start each one come in at different times, so that keeping the periods the same ensures they don't collide?

Contributor Author:

As far as I know, they usually do start at the same time! :) I think we're ok - each task only operates on requests that are in a certain state, so in that sense the work is bounded and shouldn't turn into a thundering herd.
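
As a hypothetical illustration of that bounding (the state names and types below are invented for this sketch and are not the actual request schema), each task's query selects only requests in the one state it handles:

```rust
// Invented example: each background task filters to the single state it
// owns, so simultaneous activations don't all grab the same work.
#[derive(Clone, Copy, Debug, PartialEq)]
enum ReplacementState {
    Requested,       // picked up by the "start" task
    ReplacementDone, // picked up by the "step" task
    Complete,
}

struct ReplacementRequest {
    id: u32,
    state: ReplacementState,
}

fn requests_in_state(
    requests: &[ReplacementRequest],
    state: ReplacementState,
) -> Vec<&ReplacementRequest> {
    requests.iter().filter(|r| r.state == state).collect()
}

fn main() {
    let requests = vec![
        ReplacementRequest { id: 1, state: ReplacementState::Requested },
        ReplacementRequest { id: 2, state: ReplacementState::ReplacementDone },
    ];
    // The step task would only see request 2 here.
    let step_work = requests_in_state(&requests, ReplacementState::ReplacementDone);
    assert_eq!(step_work.len(), 1);
    assert_eq!(step_work[0].id, 2);
}
```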

Contributor:

I'm thinking about a sled being expunged and all 10 crucible zones and all the things
that could be on those 10 zones, etc. Making sure we can handle an outage like that.

Contributor Author:

Right now we only limit the number of operations that can happen at one time to a Volume, and don't currently limit anything otherwise. I'm not too concerned

Contributor Author:

(hit comment too early!) I'm not too concerned, but it will be quite a bit of traffic to have all those reconciliations, live repairs, and clones going on... on second thought maybe I am concerned haha.

I'm not sure what to do other than test it out and see, though. We might be able to come up with an upper bound on the number of regions and snapshots, maybe? Even if we do, we can't know what the non-expunge load on the rack will be at any given time.

What do you think? Do you think some sort of limit is a good idea?

Contributor:

It's hard to say if we should set a limit without knowing what load a repair might generate.

But, that being said, it is probably better to put in a limit that makes things slower but still able to finish, and then loosen that limit as we discover how much the rack can handle. The alternative would be to just let repair take as many resources as it wants and then let things break if it's too much? That second choice does not sound like something I would want to explain to a customer :)

We don't have anything yet that we can use as an overall gauge of rack "busyness". In addition, we don't even know the load an "average repair" generates. Maybe our first step would be to build the tools to determine what the current repair load is so we can measure its impact?
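
One hypothetical shape such a limit could take - purely a sketch, not part of this PR, with illustrative names - is a semaphore that caps how many step sagas a single activation has in flight at once:

```rust
// Illustrative sketch only (not from this PR): bound how many step sagas a
// single activation starts concurrently using a tokio semaphore.
use std::sync::Arc;
use tokio::sync::Semaphore;

const MAX_CONCURRENT_STEP_SAGAS: usize = 4;

async fn start_step_saga(volume_id: &str) {
    // stand-in for the real saga invocation
    println!("starting step saga for volume {volume_id}");
}

async fn run_step_sagas(volume_ids: Vec<String>) {
    let limit = Arc::new(Semaphore::new(MAX_CONCURRENT_STEP_SAGAS));
    let mut handles = Vec::new();

    for volume_id in volume_ids {
        // Wait for a free slot before spawning the next saga.
        let permit = limit.clone().acquire_owned().await.unwrap();
        handles.push(tokio::spawn(async move {
            start_step_saga(&volume_id).await;
            drop(permit); // release the slot once this saga has been started
        }));
    }

    for handle in handles {
        let _ = handle.await;
    }
}

#[tokio::main]
async fn main() {
    let volumes = (0..10).map(|i| format!("volume-{i}")).collect();
    run_step_sagas(volumes).await;
}
```

Whether such a cap should be per activation, per volume, or rack-wide is exactly the open question in this thread.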


[default_region_allocation_strategy]
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.
21 changes: 20 additions & 1 deletion nexus/src/app/background/init.rs
@@ -110,6 +110,7 @@ use super::tasks::region_replacement;
use super::tasks::region_replacement_driver;
use super::tasks::region_snapshot_replacement_garbage_collect::*;
use super::tasks::region_snapshot_replacement_start::*;
use super::tasks::region_snapshot_replacement_step::*;
use super::tasks::saga_recovery;
use super::tasks::service_firewall_rules;
use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker;
@@ -165,6 +166,7 @@ pub struct BackgroundTasks {
pub task_lookup_region_port: Activator,
pub task_region_snapshot_replacement_start: Activator,
pub task_region_snapshot_replacement_garbage_collection: Activator,
pub task_region_snapshot_replacement_step: Activator,

// Handles to activate background tasks that do not get used by Nexus
// at-large. These background tasks are implementation details as far as
@@ -249,6 +251,7 @@ impl BackgroundTasksInitializer {
task_region_snapshot_replacement_start: Activator::new(),
task_region_snapshot_replacement_garbage_collection: Activator::new(
),
task_region_snapshot_replacement_step: Activator::new(),

task_internal_dns_propagation: Activator::new(),
task_external_dns_propagation: Activator::new(),
@@ -312,6 +315,7 @@ impl BackgroundTasksInitializer {
task_lookup_region_port,
task_region_snapshot_replacement_start,
task_region_snapshot_replacement_garbage_collection,
task_region_snapshot_replacement_step,
// Add new background tasks here. Be sure to use this binding in a
// call to `Driver::register()` below. That's what actually wires
// up the Activator to the corresponding background task.
@@ -760,14 +764,29 @@ impl BackgroundTasksInitializer {
.region_snapshot_replacement_garbage_collection
.period_secs,
task_impl: Box::new(RegionSnapshotReplacementGarbageCollect::new(
datastore,
datastore.clone(),
sagas.clone(),
)),
opctx: opctx.child(BTreeMap::new()),
watchers: vec![],
activator: task_region_snapshot_replacement_garbage_collection,
});

driver.register(TaskDefinition {
name: "region_snapshot_replacement_step",
description:
"detect what volumes were affected by a region snapshot \
replacement, and run the step saga for them",
period: config.region_snapshot_replacement_step.period_secs,
task_impl: Box::new(RegionSnapshotReplacementFindAffected::new(
datastore,
sagas.clone(),
)),
opctx: opctx.child(BTreeMap::new()),
watchers: vec![],
activator: task_region_snapshot_replacement_step,
});

driver
}
}
1 change: 1 addition & 0 deletions nexus/src/app/background/tasks/mod.rs
@@ -27,6 +27,7 @@ pub mod region_replacement;
pub mod region_replacement_driver;
pub mod region_snapshot_replacement_garbage_collect;
pub mod region_snapshot_replacement_start;
pub mod region_snapshot_replacement_step;
pub mod saga_recovery;
pub mod service_firewall_rules;
pub mod sync_service_zone_nat;