[nexus] add background task for cleaning up abandoned VMMs (#5812)
**Note**: This change is part of the ongoing instance lifecycle management
work in PR #5749. It isn't strictly necessary on its own; it's a component of
the upcoming instance update saga. However, I thought it would be easier to
review if I factored this change out into a separate PR that can be reviewed
and merged on its own.

The instance update saga (see PR #5749) will only clean up after VMMs
whose IDs appear in an `instance` record. When a live migration finishes
(successfully or not), we want to allow a new migration to begin as soon
as possible, which means we have to unlink the “unused” side of the
migration --- the source if migration succeeded, or the target if it
failed --- from the instance, even though that VMM may not be fully
destroyed yet. Once this happens, the instance update saga will no
longer be able to clean up these VMMs, so we’ll need a separate task
that cleans up these "abandoned" VMMs in the background.

This branch introduces an `abandoned_vmm_reaper` background task that's
responsible for doing this. It queries the database to list VMMs which
are:

- in the `Destroyed` state
- not deleted yet (i.e. `time_deleted` IS NULL)
- not pointed to by their corresponding instances (neither the
  `active_propolis_id` nor the `target_propolis_id` equals the VMM's ID)
  
For any VMMs returned by this query, the `abandoned_vmm_reaper` task
will:
- remove the `sled_resource` reservation for that VMM
- set the `time_deleted` on the VMM record if it was not already set.

This cleanup process will be executed periodically in the background.
Eventually, the background task will also be explicitly triggered by the
instance update saga when it knows it has abandoned a VMM.
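
To make the control flow concrete, here is a minimal, self-contained sketch of
what one activation does with the VMMs returned by that query, mirroring the
two cleanup steps and the counters reported through omdb. The `VmmId`,
`ActivationResults`, and `reap_abandoned` names are placeholders for this
illustration, and the closures stand in for the real `DataStore` calls; this
is not the actual task code.

```rust
/// Counters mirroring the fields reported to omdb for this task.
#[derive(Debug, Default)]
struct ActivationResults {
    found: usize,
    vmms_deleted: usize,
    vmms_already_deleted: usize,
    sled_reservations_deleted: usize,
    error_count: usize,
    last_error: Option<String>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct VmmId(u128);

/// Process one batch of abandoned VMMs. `delete_reservation` stands in for
/// releasing the VMM's `sled_resource` reservation; `mark_deleted` stands in
/// for `vmm_mark_deleted` and reports whether this call set `time_deleted`.
fn reap_abandoned(
    abandoned: &[VmmId],
    mut delete_reservation: impl FnMut(VmmId) -> Result<(), String>,
    mut mark_deleted: impl FnMut(VmmId) -> Result<bool, String>,
) -> ActivationResults {
    let mut results =
        ActivationResults { found: abandoned.len(), ..Default::default() };
    for &vmm_id in abandoned {
        // Release the sled resources first; if anything fails before the
        // soft-delete, the VMM is still listed as abandoned on the next
        // activation and both steps can be retried.
        match delete_reservation(vmm_id) {
            Ok(()) => results.sled_reservations_deleted += 1,
            Err(e) => {
                results.error_count += 1;
                results.last_error = Some(format!("{vmm_id:?}: {e}"));
                continue;
            }
        }
        match mark_deleted(vmm_id) {
            Ok(true) => results.vmms_deleted += 1,
            Ok(false) => results.vmms_already_deleted += 1,
            Err(e) => {
                results.error_count += 1;
                results.last_error = Some(format!("{vmm_id:?}: {e}"));
            }
        }
    }
    results
}

fn main() {
    // Pretend VMM 2 had already been soft-deleted by another Nexus.
    let abandoned = vec![VmmId(1), VmmId(2)];
    let results =
        reap_abandoned(&abandoned, |_| Ok(()), |id| Ok(id == VmmId(1)));
    println!("{results:?}");
}
```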

As an aside, I noticed that the current implementation of
`DataStore::vmm_mark_deleted` unconditionally sets the
`time_deleted` field on a VMM record, even if it's already set. This is
"probably fine" for overall correctness: the VMM remains deleted, so the
operation is still idempotent-ish. But, it's not *great*, as it means
that any queries for VMMs deleted before a certain timestamp may not be
strictly correct, and we're updating the database more frequently than
we really need to. So, I've gone ahead and changed it to only set
`time_deleted` if the record's `time_deleted` is null, using
`check_if_exists` so that the method still returns `Ok` if the record
was already deleted --- the caller can inspect the returned `bool` to
determine whether or not they were the actual deleter, but the query
still doesn't fail.
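
For illustration, here's a tiny standalone model of that contract (the real
change below uses Diesel's `check_if_exists`/`execute_and_check` against the
`vmm` table; `VmmRecord` and this `mark_deleted` are just stand-ins): the
timestamp is only written while `time_deleted` is unset, an existing timestamp
is left alone, and the caller learns whether it was the actual deleter.

```rust
use std::time::SystemTime;

/// A stand-in for a row in the `vmm` table.
#[derive(Debug)]
struct VmmRecord {
    time_deleted: Option<SystemTime>,
}

/// Returns `Ok(true)` if this call set `time_deleted`, `Ok(false)` if the
/// record was already soft-deleted; only a missing record is an error.
fn mark_deleted(record: Option<&mut VmmRecord>) -> Result<bool, String> {
    let record = record.ok_or_else(|| "no such VMM".to_string())?;
    if record.time_deleted.is_some() {
        // Already deleted: keep the original timestamp so "deleted before T"
        // queries stay accurate, and skip the redundant write.
        return Ok(false);
    }
    record.time_deleted = Some(SystemTime::now());
    Ok(true)
}

fn main() {
    let mut vmm = VmmRecord { time_deleted: None };
    assert!(mark_deleted(Some(&mut vmm)).unwrap()); // we were the deleter
    assert!(!mark_deleted(Some(&mut vmm)).unwrap()); // already deleted
    println!("{vmm:?}");
}
```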
hawkw authored May 24, 2024
1 parent ed4dbf2 commit 27e6b34
Showing 12 changed files with 663 additions and 6 deletions.
53 changes: 53 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -996,6 +996,59 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
eprintln!(" unexpected return value from task: {:?}", val)
}
};
} else if name == "abandoned_vmm_reaper" {
#[derive(Deserialize)]
struct TaskSuccess {
/// total number of abandoned VMMs found
found: usize,

/// number of abandoned VMM records that were deleted
vmms_deleted: usize,

/// number of abandoned VMM records that were already deleted when
/// we tried to delete them.
vmms_already_deleted: usize,

/// sled resource reservations that were released
sled_reservations_deleted: usize,

/// number of errors that occurred during the activation
error_count: usize,

/// the last error that occurred during execution.
error: Option<String>,
}
match serde_json::from_value::<TaskSuccess>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(TaskSuccess {
found,
vmms_deleted,
vmms_already_deleted,
sled_reservations_deleted,
error_count,
error,
}) => {
if let Some(error) = error {
println!(" task did not complete successfully!");
println!(" total errors: {error_count}");
println!(" most recent error: {error}");
}

println!(" total abandoned VMMs found: {found}");
println!(" VMM records deleted: {vmms_deleted}");
println!(
" VMM records already deleted by another Nexus: {}",
vmms_already_deleted,
);
println!(
" sled resource reservations deleted: {}",
sled_reservations_deleted,
);
}
};
} else {
println!(
"warning: unknown background task: {:?} \
15 changes: 15 additions & 0 deletions dev-tools/omdb/tests/env.out
@@ -25,6 +25,11 @@ EXECUTING COMMAND: omdb ["nexus", "--nexus-internal-url", "http://127.0.0.1:REDA
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
@@ -140,6 +145,11 @@ EXECUTING COMMAND: omdb ["nexus", "background-tasks", "doc"]
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
@@ -242,6 +252,11 @@ EXECUTING COMMAND: omdb ["--dns-server", "[::1]:REDACTED_PORT", "nexus", "backgr
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
15 changes: 15 additions & 0 deletions dev-tools/omdb/tests/successes.out
@@ -202,6 +202,11 @@ EXECUTING COMMAND: omdb ["nexus", "background-tasks", "doc"]
termination: Exited(0)
---------------------------------------------
stdout:
task: "abandoned_vmm_reaper"
deletes sled reservations for VMMs that have been abandoned by their
instances


task: "bfd_manager"
Manages bidirectional fowarding detection (BFD) configuration on rack
switches
@@ -380,6 +385,16 @@ task: "blueprint_executor"
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
last completion reported error: no blueprint

task: "abandoned_vmm_reaper"
configured period: every 1m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by an explicit signal
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total abandoned VMMs found: 0
VMM records deleted: 0
VMM records already deleted by another Nexus: 0
sled resource reservations deleted: 0

task: "bfd_manager"
configured period: every 30s
currently executing: no
15 changes: 15 additions & 0 deletions nexus-config/src/nexus_config.rs
@@ -381,6 +381,8 @@ pub struct BackgroundTaskConfig {
pub service_firewall_propagation: ServiceFirewallPropagationConfig,
/// configuration for v2p mapping propagation task
pub v2p_mapping_propagation: V2PMappingPropagationConfig,
/// configuration for abandoned VMM reaper task
pub abandoned_vmm_reaper: AbandonedVmmReaperConfig,
}

#[serde_as]
@@ -549,6 +551,14 @@ pub struct V2PMappingPropagationConfig {
pub period_secs: Duration,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct AbandonedVmmReaperConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
@@ -788,6 +798,7 @@ mod test {
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
[default_region_allocation_strategy]
type = "random"
seed = 0
@@ -926,6 +937,9 @@ mod test {
v2p_mapping_propagation: V2PMappingPropagationConfig {
period_secs: Duration::from_secs(30)
},
abandoned_vmm_reaper: AbandonedVmmReaperConfig {
period_secs: Duration::from_secs(60),
}
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -995,6 +1009,7 @@ mod test {
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
[default_region_allocation_strategy]
type = "random"
"##,
77 changes: 72 additions & 5 deletions nexus/db-queries/src/db/datastore/vmm.rs
@@ -9,16 +9,21 @@ use crate::authz;
use crate::context::OpContext;
use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::model::InstanceState as DbInstanceState;
use crate::db::model::Vmm;
use crate::db::model::VmmRuntimeState;
use crate::db::pagination::paginated;
use crate::db::schema::vmm::dsl;
use crate::db::update_and_check::UpdateAndCheck;
use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use omicron_common::api::external::CreateResult;
use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::Error;
use omicron_common::api::external::InstanceState as ApiInstanceState;
use omicron_common::api::external::ListResultVec;
use omicron_common::api::external::LookupResult;
use omicron_common::api::external::LookupType;
use omicron_common::api::external::ResourceType;
@@ -50,9 +55,6 @@ impl DataStore {
opctx: &OpContext,
vmm_id: &Uuid,
) -> UpdateResult<bool> {
use crate::db::model::InstanceState as DbInstanceState;
use omicron_common::api::external::InstanceState as ApiInstanceState;

let valid_states = vec![
DbInstanceState::new(ApiInstanceState::Destroyed),
DbInstanceState::new(ApiInstanceState::Failed),
@@ -61,9 +63,15 @@
let updated = diesel::update(dsl::vmm)
.filter(dsl::id.eq(*vmm_id))
.filter(dsl::state.eq_any(valid_states))
.filter(dsl::time_deleted.is_null())
.set(dsl::time_deleted.eq(Utc::now()))
.execute_async(&*self.pool_connection_authorized(opctx).await?)
.check_if_exists::<Vmm>(*vmm_id)
.execute_and_check(&*self.pool_connection_authorized(opctx).await?)
.await
.map(|r| match r.status {
UpdateStatus::Updated => true,
UpdateStatus::NotUpdatedButExists => false,
})
.map_err(|e| {
public_error_from_diesel(
e,
@@ -74,7 +82,7 @@
)
})?;

Ok(updated != 0)
Ok(updated)
}

pub async fn vmm_fetch(
@@ -164,4 +172,63 @@

Ok(vmm)
}

/// Lists VMMs which have been abandoned by their instances after a
/// migration and are in need of cleanup.
///
/// A VMM is considered "abandoned" if (and only if):
///
/// - It is in the `Destroyed` state.
/// - It is not currently running an instance, and it is also not the
/// migration target of any instance (i.e. it is not pointed to by
/// any instance record's `active_propolis_id` and `target_propolis_id`
/// fields).
/// - It has not been deleted yet.
pub async fn vmm_list_abandoned(
&self,
opctx: &OpContext,
pagparams: &DataPageParams<'_, Uuid>,
) -> ListResultVec<Vmm> {
use crate::db::schema::instance::dsl as instance_dsl;
let destroyed = DbInstanceState::new(ApiInstanceState::Destroyed);
paginated(dsl::vmm, dsl::id, pagparams)
// In order to be considered "abandoned", a VMM must be:
// - in the `Destroyed` state
.filter(dsl::state.eq(destroyed))
// - not deleted yet
.filter(dsl::time_deleted.is_null())
// - not pointed to by any instance's `active_propolis_id` or
// `target_propolis_id`.
//
.left_join(
// Left join with the `instance` table on the VMM's instance ID, so
// that we can check if the instance pointed to by this VMM (if
// any exists) has this VMM pointed to by its
// `active_propolis_id` or `target_propolis_id` fields.
instance_dsl::instance
.on(instance_dsl::id.eq(dsl::instance_id)),
)
.filter(
dsl::id
.nullable()
.ne(instance_dsl::active_propolis_id)
// In SQL, *all* comparisons with NULL are `false`, even `!=
// NULL`, so we have to explicitly check for nulls here, or
// else VMMs whose instances have no `active_propolis_id`
// will not be considered abandoned (incorrectly).
.or(instance_dsl::active_propolis_id.is_null()),
)
.filter(
dsl::id
.nullable()
.ne(instance_dsl::target_propolis_id)
// As above, we must add this clause because SQL nulls have
// the most irritating behavior possible.
.or(instance_dsl::target_propolis_id.is_null()),
)
.select(Vmm::as_select())
.load_async(&*self.pool_connection_authorized(opctx).await?)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}
}
1 change: 1 addition & 0 deletions nexus/examples/config.toml
@@ -117,6 +117,7 @@ region_replacement.period_secs = 30
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60

[default_region_allocation_strategy]
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.