Detect and un-delete phantom disks #4547

Merged
merged 16 commits on Dec 4, 2023
24 changes: 20 additions & 4 deletions common/src/nexus_config.rs
@@ -339,6 +339,8 @@ pub struct BackgroundTaskConfig {
pub nat_cleanup: NatCleanupConfig,
/// configuration for inventory tasks
pub inventory: InventoryConfig,
/// configuration for phantom disks task
pub phantom_disks: PhantomDiskConfig,
}

#[serde_as]
@@ -386,7 +388,7 @@ pub struct NatCleanupConfig {
pub struct InventoryConfig {
/// period (in seconds) for periodic activations of this background task
///
/// Each activation fetches information about all harware and software in
/// Each activation fetches information about all hardware and software in
/// the system and inserts it into the database. This generates a moderate
/// amount of data.
#[serde_as(as = "DurationSeconds<u64>")]
@@ -405,6 +407,14 @@ pub struct InventoryConfig {
pub disable: bool,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct PhantomDiskConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}
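
For context, here is a minimal hypothetical sketch of what the `DurationSeconds<u64>` annotation does: an integer `period_secs` value in TOML deserializes into a `std::time::Duration`. The struct below is a trimmed stand-in rather than the real `PhantomDiskConfig`, and it assumes the `serde_with` and `toml` crates are available.

```rust
use serde::Deserialize;
use serde_with::{serde_as, DurationSeconds};
use std::time::Duration;

// Trimmed-down stand-in for the config struct above.
#[serde_as]
#[derive(Debug, Deserialize)]
struct PhantomDiskConfig {
    // An integer number of seconds in TOML becomes a Duration here.
    #[serde_as(as = "DurationSeconds<u64>")]
    period_secs: Duration,
}

fn main() {
    let cfg: PhantomDiskConfig =
        toml::from_str("period_secs = 30").expect("valid config");
    assert_eq!(cfg.period_secs, Duration::from_secs(30));
}
```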

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
@@ -508,8 +518,9 @@ mod test {
BackgroundTaskConfig, Config, ConfigDropshotWithTls, ConsoleConfig,
Database, DeploymentConfig, DnsTasksConfig, DpdConfig,
ExternalEndpointsConfig, InternalDns, InventoryConfig, LoadError,
LoadErrorKind, MgdConfig, NatCleanupConfig, PackageConfig, SchemeName,
TimeseriesDbConfig, Tunables, UpdatesConfig,
LoadErrorKind, MgdConfig, NatCleanupConfig, PackageConfig,
PhantomDiskConfig, SchemeName, TimeseriesDbConfig, Tunables,
UpdatesConfig,
};
use crate::address::{Ipv6Subnet, RACK_PREFIX};
use crate::api::internal::shared::SwitchLocation;
@@ -663,6 +674,7 @@ mod test {
inventory.period_secs = 10
inventory.nkeep = 11
inventory.disable = false
phantom_disks.period_secs = 30
[default_region_allocation_strategy]
type = "random"
seed = 0
@@ -764,7 +776,10 @@ mod test {
period_secs: Duration::from_secs(10),
nkeep: 11,
disable: false,
}
},
phantom_disks: PhantomDiskConfig {
period_secs: Duration::from_secs(30),
},
},
default_region_allocation_strategy:
crate::nexus_config::RegionAllocationStrategy::Random {
@@ -822,6 +837,7 @@ mod test {
inventory.period_secs = 10
inventory.nkeep = 3
inventory.disable = false
phantom_disks.period_secs = 30
[default_region_allocation_strategy]
type = "random"
"##,
16 changes: 16 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -515,6 +515,22 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
);
}
};
} else if name == "phantom_disks" {
#[derive(Deserialize)]
struct TaskSuccess {
/// how many phantom disks were found
ok: usize,
}

match serde_json::from_value::<TaskSuccess>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(success) => {
println!(" phantom disks found: {}", success.ok);
}
};
} else {
println!(
"warning: unknown background task: {:?} \
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
@@ -66,6 +66,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: using Nexus URL http://127.0.0.1:REDACTED_PORT
@@ -131,6 +135,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: Nexus URL not specified. Will pick one from DNS.
@@ -183,6 +191,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: Nexus URL not specified. Will pick one from DNS.
11 changes: 11 additions & 0 deletions dev-tools/omdb/tests/successes.out
@@ -260,6 +260,10 @@ task: "nat_v4_garbage_collector"
predetermined retention policy


task: "phantom_disks"
detects and un-deletes phantom disks


---------------------------------------------
stderr:
note: using Nexus URL http://127.0.0.1:REDACTED_PORT/
@@ -357,6 +361,13 @@ task: "inventory_collection"
last collection started: <REDACTED_TIMESTAMP>
last collection done: <REDACTED_TIMESTAMP>

task: "phantom_disks"
configured period: every 30s
currently executing: no
last completed activation: iter 2, triggered by an explicit signal
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
phantom disks found: 0

---------------------------------------------
stderr:
note: using Nexus URL http://127.0.0.1:REDACTED_PORT/
6 changes: 5 additions & 1 deletion nexus/db-model/src/schema.rs
@@ -1299,7 +1299,7 @@ table! {
///
/// This should be updated whenever the schema is changed. For more details,
/// refer to: schema/crdb/README.adoc
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(14, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(15, 0, 0);

allow_tables_to_appear_in_same_query!(
system_update,
@@ -1368,3 +1368,7 @@ allow_tables_to_appear_in_same_query!(
switch_port,
switch_port_settings_bgp_peer_config
);

allow_tables_to_appear_in_same_query!(disk, virtual_provisioning_resource);

allow_tables_to_appear_in_same_query!(volume, virtual_provisioning_resource);
142 changes: 142 additions & 0 deletions nexus/db-queries/src/db/datastore/disk.rs
@@ -25,11 +25,14 @@ use crate::db::model::DiskUpdate;
use crate::db::model::Instance;
use crate::db::model::Name;
use crate::db::model::Project;
use crate::db::model::VirtualProvisioningResource;
use crate::db::model::Volume;
use crate::db::pagination::paginated;
use crate::db::queries::disk::DiskSetClauseForAttach;
use crate::db::update_and_check::UpdateAndCheck;
use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::DateTime;
use chrono::Utc;
use diesel::prelude::*;
use omicron_common::api;
@@ -652,4 +655,143 @@ impl DataStore {
}
}
}

/// Set a disk to faulted and un-delete it
///
/// If the disk delete saga unwinds, then the disk should _not_ remain
/// deleted: the disk delete saga should be triggered again in order to
/// fully complete, and the only way to do that is to un-delete the disk.
/// Set it to faulted to ensure that it won't be used.
pub async fn project_undelete_disk_set_faulted_no_auth(
&self,
disk_id: &Uuid,
) -> Result<(), Error> {
use db::schema::disk::dsl;
let conn = self.pool_connection_unauthorized().await?;

let faulted = api::external::DiskState::Faulted.label();

let result = diesel::update(dsl::disk)
.filter(dsl::time_deleted.is_not_null())
.filter(dsl::id.eq(*disk_id))
.set((
dsl::time_deleted.eq(None::<DateTime<Utc>>),
dsl::disk_state.eq(faulted),
))
.check_if_exists::<Disk>(*disk_id)
.execute_and_check(&conn)
.await
.map_err(|e| {
public_error_from_diesel(
e,
ErrorHandler::NotFoundByLookup(
ResourceType::Disk,
LookupType::ById(*disk_id),
),
)
})?;

match result.status {
UpdateStatus::Updated => Ok(()),
UpdateStatus::NotUpdatedButExists => {
let disk = result.found;
let disk_state = disk.state();

if disk.time_deleted().is_none()
&& disk_state.state() == &api::external::DiskState::Faulted
{
// To maintain idempotency, if the disk has already been
// faulted, don't throw an error.
return Ok(());
} else {
// NOTE: This is a "catch-all" error case, more specific
// errors should be preferred as they're more actionable.
return Err(Error::InternalError {
internal_message: String::from(
"disk exists, but cannot be faulted",
),
});
}
}
}
}
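
To illustrate the idempotency guarantee described above, here is a hedged sketch of an undo step that could safely retry this call. The function name and saga wiring are hypothetical, not the actual saga code in this PR; only the datastore method and the import paths come from this diff.

```rust
use nexus_db_queries::db::DataStore;
use omicron_common::api::external::Error;
use uuid::Uuid;

// Hypothetical undo action; the name and signature are illustrative only.
async fn sdd_delete_disk_record_undo(
    datastore: &DataStore,
    disk_id: Uuid,
) -> Result<(), Error> {
    // Un-delete the disk and mark it faulted. If a previous attempt already
    // did this, the datastore method returns Ok(()) instead of an error, so
    // this undo step can be retried safely.
    datastore
        .project_undelete_disk_set_faulted_no_auth(&disk_id)
        .await
}
```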

/// Find disks that have been deleted but still have a
/// `virtual_provisioning_resource` record: this indicates that a disk
/// delete saga partially succeeded, then unwound, which (before the fixes
/// in customer-support#58) would mean the disk was deleted but the project
/// it was in could not be deleted (due to an erroneous number of bytes
/// "still provisioned").
pub async fn find_phantom_disks(&self) -> ListResultVec<Disk> {
use db::schema::disk::dsl;
use db::schema::virtual_provisioning_resource::dsl as resource_dsl;
use db::schema::volume::dsl as volume_dsl;

let conn = self.pool_connection_unauthorized().await?;

let potential_phantom_disks: Vec<(
Disk,
Option<VirtualProvisioningResource>,
Option<Volume>,
)> = dsl::disk
.filter(dsl::time_deleted.is_not_null())
.left_join(
resource_dsl::virtual_provisioning_resource
.on(resource_dsl::id.eq(dsl::id)),
)
.left_join(volume_dsl::volume.on(dsl::volume_id.eq(volume_dsl::id)))
.select((
Disk::as_select(),
Option::<VirtualProvisioningResource>::as_select(),
Option::<Volume>::as_select(),
))
.load_async(&*conn)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;

// The first forward steps of the disk delete saga (plus the volume
// delete sub saga) are as follows:
//
// 1. soft-delete the disk
// 2. call virtual_provisioning_collection_delete_disk
// 3. soft-delete the disk's volume
//
// Before the fixes as part of customer-support#58, steps 1 and 3 did
// not have undo steps, whereas step 2 did. In order to detect when the
// disk delete saga unwound, find entries where
//
// 1. the disk and volume are soft-deleted
// 2. the `virtual_provisioning_resource` exists
//
// It's important not to conflict with any currently running disk delete
// saga.

Ok(potential_phantom_disks
.into_iter()
.filter(|(disk, resource, volume)| {
if let Some(volume) = volume {
// In this branch, the volume record was soft-deleted. The
// saga unwound before the volume record could be hard
// deleted. This won't conflict with a running disk delete
// saga, because the resource record should be None if the
// disk and volume were already soft deleted (if there is
// one, the saga will be at or past step 3).
disk.time_deleted().is_some()
&& volume.time_deleted.is_some()
&& resource.is_some()
} else {
// In this branch, the volume record was hard-deleted. The
// saga could still have unwound after hard deleting the
// volume record, so proceed with filtering. This won't
// conflict with a running disk delete saga because the
// resource record should be None if the disk was soft
// deleted and the volume was hard deleted (if there is one,
// the saga should be almost finished as the volume hard
// delete is the last thing it does).
disk.time_deleted().is_some() && resource.is_some()
}
})
.map(|(disk, _, _)| disk)
.collect())
}
}
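
Putting the two datastore methods together, here is a hedged sketch of what one activation of the phantom disk task might look like. The real implementation lives in `nexus/src/app/background/phantom_disks.rs` (added by this PR but not shown in this excerpt); the function name is hypothetical, it assumes `Disk` exposes its UUID via `id()`, and the counter mirrors the `ok` field that omdb prints as "phantom disks found".

```rust
use nexus_db_queries::db::DataStore;
use omicron_common::api::external::Error;

// Hedged sketch of one activation; error handling is simplified and the
// real task's structure differs.
async fn activate_phantom_disk_task(
    datastore: &DataStore,
) -> Result<usize, Error> {
    // Find disks that were soft-deleted but still have a
    // virtual_provisioning_resource record, i.e. the delete saga unwound.
    let phantom_disks = datastore.find_phantom_disks().await?;

    // Un-delete each one and mark it faulted so the disk delete saga can be
    // run again to completion.
    let mut ok = 0;
    for disk in phantom_disks {
        datastore
            .project_undelete_disk_set_faulted_no_auth(&disk.id())
            .await?;
        ok += 1;
    }

    // This count is what omdb renders as "phantom disks found".
    Ok(ok)
}
```
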
1 change: 1 addition & 0 deletions nexus/examples/config.toml
@@ -100,6 +100,7 @@ inventory.period_secs = 600
inventory.nkeep = 5
# Disable inventory collection altogether (for emergencies)
inventory.disable = false
phantom_disks.period_secs = 30

[default_region_allocation_strategy]
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.
23 changes: 22 additions & 1 deletion nexus/src/app/background/init.rs
@@ -11,6 +11,7 @@ use super::dns_servers;
use super::external_endpoints;
use super::inventory_collection;
use super::nat_cleanup;
use super::phantom_disks;
use nexus_db_model::DnsGroup;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::DataStore;
@@ -52,6 +53,9 @@ pub struct BackgroundTasks {

/// task handle for the task that collects inventory
pub task_inventory_collection: common::TaskHandle,

/// task handle for the task that detects phantom disks
pub task_phantom_disks: common::TaskHandle,
}

impl BackgroundTasks {
@@ -122,7 +126,7 @@ impl BackgroundTasks {
// Background task: inventory collector
let task_inventory_collection = {
let collector = inventory_collection::InventoryCollector::new(
datastore,
datastore.clone(),
resolver,
&nexus_id.to_string(),
config.inventory.nkeep,
@@ -143,6 +147,22 @@ impl BackgroundTasks {
task
};

// Background task: phantom disk detection
let task_phantom_disks = {
let detector = phantom_disks::PhantomDiskDetector::new(datastore);

let task = driver.register(
String::from("phantom_disks"),
String::from("detects and un-deletes phantom disks"),
config.phantom_disks.period_secs,
Box::new(detector),
opctx.child(BTreeMap::new()),
vec![],
);

task
};

BackgroundTasks {
driver,
task_internal_dns_config,
@@ -153,6 +173,7 @@ impl BackgroundTasks {
external_endpoints,
nat_cleanup,
task_inventory_collection,
task_phantom_disks,
}
}
