Do not double count region snapshots records! (#4095)
`decrease_crucible_resource_count_and_soft_delete_volume` does not
disambiguate cases where the `snapshot_addr` of one `region_snapshot` is
shared with another record, which can occur because the Crucible Agent
reclaims ports from destroyed daemons (see also #4049, which makes the
simulated Crucible agent do this).

Several invocations of the snapshot create and snapshot delete sagas
could race in such a way that one of these ports would be reclaimed and
then used in a different snapshot, and the lifetimes of both
`region_snapshot` records would overlap! This would confuse our
reference counting, which was written with the naive assumption that
this port reuse **wouldn't** occur with overlapping lifetimes. Spoiler
alert, it can:

root@[fd00:1122:3344:101::3]:32221/omicron> select * from region_snapshot where snapshot_addr = '[fd00:1122:3344:102::7]:19016';
              dataset_id              |              region_id               |             snapshot_id              |         snapshot_addr         | volume_references
---------------------------------------+---------------------------------------+---------------------------------------+-------------------------------+--------------------
  80790bfd-4b81-4381-9262-20912e3826cc | 0387bbb7-1d54-4683-943c-6c17d6804de9  | 1a800928-8f93-4cd3-9df1-4129582ffc20  | [fd00:1122:3344:102::7]:19016 |                 0
  80790bfd-4b81-4381-9262-20912e3826cc | ff20e066-8815-4eb6-ac84-fab9b9103462  | bdd9614e-f089-4a94-ae46-e10b96b79ba3  | [fd00:1122:3344:102::7]:19016 |                 0
(2 rows)

One way to solve this would be to create a UNIQUE INDEX on
`snapshot_addr` here, but then in these cases the snapshot creation
would return a 500 error to the user.
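
For reference, a minimal sketch of that rejected alternative (the index name is illustrative, not part of this change):

```
-- Rejected approach: forbid duplicate snapshot_addr values outright.
-- Port reuse would then surface to the user as a constraint violation
-- (a 500) instead of being handled by the reference counting.
CREATE UNIQUE INDEX region_snapshot_by_snapshot_addr
    ON region_snapshot (snapshot_addr);
```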

This commit adds a sixth column: `deleting`, a boolean that is true when
the region snapshot is part of a volume's `resources_to_clean_up`, and
false otherwise. As part of the transaction for
`decrease_crucible_resource_count_and_soft_delete_volume`, this is used
to select only the `region_snapshot` records that were decremented by
that transaction, and to skip records that an earlier deletion already
marked for cleanup, so they are not deleted twice.
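
At the SQL level the change amounts to roughly the following (a sketch only: the real change is a CRDB schema migration plus Diesel queries, so the exact DDL here is an assumption, and the address literal is just the example above):

```
-- New column, false by default:
ALTER TABLE region_snapshot
    ADD COLUMN deleting BOOL NOT NULL DEFAULT false;

-- volume_create now only bumps the reference count on records that are
-- not already slated for cleanup:
UPDATE region_snapshot
    SET volume_references = volume_references + 1
    WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016'
      AND deleting = false;
```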

This works because, while the lifetimes of the two records can overlap
in the DB, the lifetimes of the actual read-only downstairs daemons do
**not**: for the port to be reclaimed, the original daemon has to be
DELETEd, and that only happens after the decrement transaction has
already computed which resources to clean up:

1) a `region_snapshot` record is created:

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

2) it is incremented as part of `volume_create`:

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 1 | false
```

3) when the volume is deleted, the decrement transaction will:

  a) decrease `volume_references` by 1

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

  b) note any `region_snapshot` records whose `volume_references` went to
     0 and have `deleting` = false, and return those in the list of
     resources to clean up:

     [ 1a800928-8f93-4cd3-9df1-4129582ffc20 ]

  c) set `deleting` = true for any `region_snapshot` records whose
     `volume_references` went to 0 and had `deleting` = false:

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
```

4) The read-only downstairs daemon for that snapshot is DELETEd, freeing
   up the port. Another snapshot creation occurs, reusing that reclaimed
   port:

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
 bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

5) That new snapshot is incremented as part of `volume_create`:

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
 bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 1 | false
```

6) It is later deleted, and the decrement transaction will:

  a) decrease `volume_references` by 1:

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
 bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

  b) note any `region_snapshot` records whose `volume_references` went to
     0 and have `deleting` = false, and return those in the list of
     resources to clean up:

     [ bdd9614e-f089-4a94-ae46-e10b96b79ba3 ]

  c) set `deleting` = true for any `region_snapshot` records whose
     `volume_references` went to 0 and had `deleting` = false:

```
             snapshot_id              |         snapshot_addr         | volume_references | deleting
--------------------------------------+-------------------------------+-------------------+----------
 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
 bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 0 | true
```
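
Taken together, steps (a) through (c) of the decrement transaction correspond roughly to the following statements (a SQL-level sketch of the Diesel queries in `volume.rs`; the address literal is just the example used above):

```
-- (a) decrement references for records at the volume's read-only
--     targets that are not already being cleaned up
UPDATE region_snapshot
    SET volume_references = volume_references - 1
    WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016'
      AND volume_references > 0
      AND deleting = false;

-- (b) collect the records that this transaction drove to zero
SELECT * FROM region_snapshot
    WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016'
      AND volume_references = 0
      AND deleting = false;

-- (c) mark those same records so a later delete does not double count them
UPDATE region_snapshot
    SET deleting = true
    WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016'
      AND volume_references = 0
      AND deleting = false;
```
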
jmpesp authored Oct 10, 2023
1 parent 47a6b42 commit d9d3953
Showing 14 changed files with 563 additions and 129 deletions.
6 changes: 3 additions & 3 deletions dev-tools/omdb/tests/env.out
@@ -7,7 +7,7 @@ sim-b6d65341 [::1]:REDACTED_PORT - REDACTED_UUID_REDACTED_UUID_REDACTED
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["db", "--db-url", "junk", "sleds"]
termination: Exited(2)
@@ -172,7 +172,7 @@ stderr:
note: database URL not specified. Will search DNS.
note: (override with --db-url or OMDB_DB_URL)
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["--dns-server", "[::1]:REDACTED_PORT", "db", "sleds"]
termination: Exited(0)
@@ -185,5 +185,5 @@ stderr:
note: database URL not specified. Will search DNS.
note: (override with --db-url or OMDB_DB_URL)
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
12 changes: 6 additions & 6 deletions dev-tools/omdb/tests/successes.out
@@ -8,7 +8,7 @@ external oxide-dev.test 2 <REDACTED_TIMESTAMP> create silo: "tes
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["db", "dns", "diff", "external", "2"]
termination: Exited(0)
@@ -24,7 +24,7 @@ changes: names added: 1, names removed: 0
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["db", "dns", "names", "external", "2"]
termination: Exited(0)
@@ -36,7 +36,7 @@ External zone: oxide-dev.test
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["db", "services", "list-instances"]
termination: Exited(0)
Expand All @@ -52,7 +52,7 @@ Nexus REDACTED_UUID_REDACTED_UUID_REDACTED [::ffff:127.0.0.1]:REDACTED_
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["db", "services", "list-by-sled"]
termination: Exited(0)
@@ -71,7 +71,7 @@ sled: sim-b6d65341 (id REDACTED_UUID_REDACTED_UUID_REDACTED)
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["db", "sleds"]
termination: Exited(0)
@@ -82,7 +82,7 @@ sim-b6d65341 [::1]:REDACTED_PORT - REDACTED_UUID_REDACTED_UUID_REDACTED
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (5.0.0)
note: database schema version matches expected (6.0.0)
=============================================
EXECUTING COMMAND: omdb ["mgs", "inventory"]
termination: Exited(0)
3 changes: 3 additions & 0 deletions nexus/db-model/src/region_snapshot.rs
@@ -32,4 +32,7 @@ pub struct RegionSnapshot {

// how many volumes reference this?
pub volume_references: i64,

// true if part of a volume's `resources_to_clean_up` already
pub deleting: bool,
}
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema.rs
@@ -856,6 +856,7 @@ table! {
snapshot_id -> Uuid,
snapshot_addr -> Text,
volume_references -> Int8,
deleting -> Bool,
}
}

@@ -1130,7 +1131,7 @@ table! {
///
/// This should be updated whenever the schema is changed. For more details,
/// refer to: schema/crdb/README.adoc
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(5, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(6, 0, 0);

allow_tables_to_appear_in_same_query!(
system_update,
16 changes: 16 additions & 0 deletions nexus/db-queries/src/db/datastore/dataset.rs
@@ -13,15 +13,31 @@ use crate::db::error::ErrorHandler;
use crate::db::identity::Asset;
use crate::db::model::Dataset;
use crate::db::model::Zpool;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use diesel::upsert::excluded;
use omicron_common::api::external::CreateResult;
use omicron_common::api::external::Error;
use omicron_common::api::external::LookupResult;
use omicron_common::api::external::LookupType;
use omicron_common::api::external::ResourceType;
use uuid::Uuid;

impl DataStore {
pub async fn dataset_get(&self, dataset_id: Uuid) -> LookupResult<Dataset> {
use db::schema::dataset::dsl;

dsl::dataset
.filter(dsl::id.eq(dataset_id))
.select(Dataset::as_select())
.first_async::<Dataset>(
&*self.pool_connection_unauthorized().await?,
)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Stores a new dataset in the database.
pub async fn dataset_upsert(
&self,
23 changes: 23 additions & 0 deletions nexus/db-queries/src/db/datastore/region_snapshot.rs
@@ -10,9 +10,11 @@ use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::model::RegionSnapshot;
use async_bb8_diesel::AsyncRunQueryDsl;
use async_bb8_diesel::OptionalExtension;
use diesel::prelude::*;
use omicron_common::api::external::CreateResult;
use omicron_common::api::external::DeleteResult;
use omicron_common::api::external::LookupResult;
use uuid::Uuid;

impl DataStore {
@@ -31,6 +33,27 @@ impl DataStore {
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

pub async fn region_snapshot_get(
&self,
dataset_id: Uuid,
region_id: Uuid,
snapshot_id: Uuid,
) -> LookupResult<Option<RegionSnapshot>> {
use db::schema::region_snapshot::dsl;

dsl::region_snapshot
.filter(dsl::dataset_id.eq(dataset_id))
.filter(dsl::region_id.eq(region_id))
.filter(dsl::snapshot_id.eq(snapshot_id))
.select(RegionSnapshot::as_select())
.first_async::<RegionSnapshot>(
&*self.pool_connection_unauthorized().await?,
)
.await
.optional()
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

pub async fn region_snapshot_remove(
&self,
dataset_id: Uuid,
100 changes: 54 additions & 46 deletions nexus/db-queries/src/db/datastore/volume.rs
@@ -119,6 +119,7 @@ impl DataStore {
.filter(
rs_dsl::snapshot_addr.eq(read_only_target.clone()),
)
.filter(rs_dsl::deleting.eq(false))
.set(
rs_dsl::volume_references
.eq(rs_dsl::volume_references + 1),
@@ -573,9 +574,7 @@ impl DataStore {
// multiple times, and that is done by soft-deleting the volume during
// the transaction, and returning the previously serialized list of
// resources to clean up if a soft-delete has already occurred.
//
// TODO it would be nice to make this transaction_async, but I couldn't
// get the async optional extension to work.

self.pool_connection_unauthorized()
.await?
.transaction_async(|conn| async move {
@@ -639,20 +638,50 @@
}
};

// Decrease the number of uses for each referenced region snapshot.
// Decrease the number of uses for each non-deleted referenced
// region snapshot.

use db::schema::region_snapshot::dsl;

diesel::update(dsl::region_snapshot)
.filter(
dsl::snapshot_addr
.eq_any(crucible_targets.read_only_targets.clone()),
)
.filter(dsl::volume_references.gt(0))
.filter(dsl::deleting.eq(false))
.set(dsl::volume_references.eq(dsl::volume_references - 1))
.execute_async(&conn)
.await?;

// Then, note anything that was set to zero from the above
// UPDATE, and then mark all those as deleted.
let snapshots_to_delete: Vec<RegionSnapshot> =
dsl::region_snapshot
.filter(
dsl::snapshot_addr.eq_any(
crucible_targets.read_only_targets.clone(),
),
)
.filter(dsl::volume_references.eq(0))
.filter(dsl::deleting.eq(false))
.select(RegionSnapshot::as_select())
.load_async(&conn)
.await?;

diesel::update(dsl::region_snapshot)
.filter(
dsl::snapshot_addr
.eq_any(crucible_targets.read_only_targets.clone()),
)
.filter(dsl::volume_references.eq(0))
.filter(dsl::deleting.eq(false))
.set(dsl::deleting.eq(true))
.execute_async(&conn)
.await?;

// Return what results can be cleaned up
let result = CrucibleResources::V1(CrucibleResourcesV1 {
let result = CrucibleResources::V2(CrucibleResourcesV2 {
// The only use of a read-write region will be at the top level of a
// Volume. These are not shared, but if any snapshots are taken this
// will prevent deletion of the region. Filter out any regions that
@@ -681,53 +710,25 @@
.eq(0)
// Despite the SQL specifying that this column is NOT NULL,
// this null check is required for this function to work!
// The left join of region_snapshot might cause a null here.
.or(dsl::volume_references.is_null()),
)
.select((Dataset::as_select(), Region::as_select()))
.get_results_async::<(Dataset, Region)>(&conn)
.await?
},

// A volume (for a disk or snapshot) may reference another nested
// volume as a read-only parent, and this may be arbitrarily deep.
// After decrementing volume_references above, get the region
// snapshot records for these read_only_targets where the
// volume_references has gone to 0. Consumers of this struct will
// be responsible for deleting the read-only downstairs running
// for the snapshot and the snapshot itself.
datasets_and_snapshots: {
use db::schema::dataset::dsl as dataset_dsl;

dsl::region_snapshot
// Only return region_snapshot records related to
// this volume that have zero references. This will
// only happen one time, on the last decrease of a
// volume containing these read-only targets.
//
// It's important to not return *every* region
// snapshot with zero references: multiple volume
                            // delete sub-sagas will then issue duplicate
// DELETE calls to Crucible agents, and a request to
// delete a read-only downstairs running for a
// snapshot that doesn't exist will return a 404,
// causing the saga to error and unwind.
.filter(dsl::snapshot_addr.eq_any(
crucible_targets.read_only_targets.clone(),
))
.filter(dsl::volume_references.eq(0))
.inner_join(
dataset_dsl::dataset
.on(dsl::dataset_id.eq(dataset_dsl::id)),
)
.select((
Dataset::as_select(),
RegionSnapshot::as_select(),
))
.get_results_async::<(Dataset, RegionSnapshot)>(
&conn,
)
.await?
},
// Consumers of this struct will be responsible for deleting
// the read-only downstairs running for the snapshot and the
// snapshot itself.
//
// It's important to not return *every* region snapshot with
// zero references: multiple volume delete sub-sagas will
                    // then issue duplicate DELETE calls to Crucible agents,
// and a request to delete a read-only downstairs running
// for a snapshot that doesn't exist will return a 404,
// causing the saga to error and unwind.
snapshots_to_delete,
});

// Soft delete this volume, and serialize the resources that are to
@@ -967,14 +968,15 @@ impl DataStore {

#[derive(Default, Debug, Serialize, Deserialize)]
pub struct CrucibleTargets {
read_only_targets: Vec<String>,
pub read_only_targets: Vec<String>,
}

// Serialize this enum into the `resources_to_clean_up` column to handle
// different versions over time.
#[derive(Debug, Serialize, Deserialize)]
pub enum CrucibleResources {
V1(CrucibleResourcesV1),
V2(CrucibleResourcesV2),
}

#[derive(Debug, Default, Serialize, Deserialize)]
@@ -983,6 +985,12 @@ pub struct CrucibleResourcesV1 {
pub datasets_and_snapshots: Vec<(Dataset, RegionSnapshot)>,
}

#[derive(Debug, Default, Serialize, Deserialize)]
pub struct CrucibleResourcesV2 {
pub datasets_and_regions: Vec<(Dataset, Region)>,
pub snapshots_to_delete: Vec<RegionSnapshot>,
}

/// Return the targets from a VolumeConstructionRequest.
///
/// The targets of a volume construction request map to resources.
1 change: 1 addition & 0 deletions nexus/src/app/sagas/snapshot_create.rs
@@ -1280,6 +1280,7 @@ async fn ssc_start_running_snapshot(
snapshot_id,
snapshot_addr,
volume_references: 0, // to be filled later
deleting: false,
})
.await
.map_err(ActionError::action_failed)?;