From d9d39531991cc8843ef38c4d0afc03afe1a58722 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Tue, 10 Oct 2023 12:19:23 -0400
Subject: [PATCH] Do not double count region snapshot records! (#4095)

`decrease_crucible_resource_count_and_soft_delete_volume` does not
disambiguate cases where the `snapshot_addr` of a `region_snapshot` is
duplicated by another record, which can occur when the Crucible Agent
reclaims ports from destroyed daemons (see also #4049, which makes the
simulated Crucible agent do this).

Several invocations of the snapshot create and snapshot delete sagas could
race in such a way that one of these ports would be reclaimed and then used
by a different snapshot, with the lifetimes of the two records overlapping.
This would confuse our reference counting, which was written with the naive
assumption that this port reuse **wouldn't** occur with overlapping
lifetimes. Spoiler alert, it can:

```
root@[fd00:1122:3344:101::3]:32221/omicron> select * from region_snapshot where snapshot_addr = '[fd00:1122:3344:102::7]:19016';
              dataset_id             |              region_id               |             snapshot_id              |         snapshot_addr         | volume_references
-------------------------------------+--------------------------------------+--------------------------------------+-------------------------------+--------------------
 80790bfd-4b81-4381-9262-20912e3826cc | 0387bbb7-1d54-4683-943c-6c17d6804de9 | 1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0
 80790bfd-4b81-4381-9262-20912e3826cc | ff20e066-8815-4eb6-ac84-fab9b9103462 | bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 0
(2 rows)
```

One way to solve this would be to create a UNIQUE INDEX on `snapshot_addr`,
but then in these cases snapshot creation would return a 500 error to the
user.

This commit adds a sixth column: `deleting`, a boolean that is true when
the region snapshot is part of a volume's `resources_to_clean_up`, and
false otherwise. This is used to select (as part of the transaction for
`decrease_crucible_resource_count_and_soft_delete_volume`) only the
region_snapshot records that were decremented as part of that transaction,
and to skip re-deleting them otherwise.
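
In plain SQL, the new bookkeeping inside the decrement transaction looks
roughly like the sketch below. This is illustrative only — the actual
implementation is the Diesel transaction in
`nexus/db-queries/src/db/datastore/volume.rs` changed by this patch — and
`$read_only_targets` is a placeholder for the volume's list of read-only
targets:

```
-- Sketch only: the real logic is the Diesel transaction in volume.rs.
-- $read_only_targets stands in for the volume's read-only target list.

-- Only decrement records that are not already being cleaned up.
UPDATE region_snapshot
   SET volume_references = volume_references - 1
 WHERE snapshot_addr = ANY ($read_only_targets)
   AND volume_references > 0
   AND deleting = false;

-- Note which records this transaction drove to zero ...
SELECT dataset_id, region_id, snapshot_id
  FROM region_snapshot
 WHERE snapshot_addr = ANY ($read_only_targets)
   AND volume_references = 0
   AND deleting = false;

-- ... and claim them, so a record that later reuses the same
-- snapshot_addr is never counted or deleted twice.
UPDATE region_snapshot
   SET deleting = true
 WHERE snapshot_addr = ANY ($read_only_targets)
   AND volume_references = 0
   AND deleting = false;
```
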
This works because the overlapping lifetime of the records in the DB is
**not** the overlapping lifetime of the actual read-only downstairs
daemons: for a port to be reclaimed, the original daemon has to be DELETEd,
and that only happens after the decrement transaction has already computed
which resources to clean up:

1) A snapshot record is created:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

2) It is incremented as part of `volume_create`:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 1 | false
```

3) When the volume is deleted, the decrement transaction will:

   a) decrease `volume_references` by 1:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

   b) note any `region_snapshot` records whose `volume_references` went to
      0 and have `deleting` = false, and return those in the list of
      resources to clean up:

      [ 1a800928-8f93-4cd3-9df1-4129582ffc20 ]

   c) set `deleting` = true for any `region_snapshot` records whose
      `volume_references` went to 0 and have `deleting` = false:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
```

4) That read-only snapshot daemon is DELETEd, freeing up the port.

   Another snapshot creation then occurs, using that reclaimed port:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
  bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

5) That new snapshot is incremented as part of `volume_create`:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
  bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 1 | false
```

6) It is later deleted, and the decrement transaction will:

   a) decrease `volume_references` by 1:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
  bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 0 | false
```

   b) note any `region_snapshot` records whose `volume_references` went to
      0 and have `deleting` = false, and return those in the list of
      resources to clean up:

      [ bdd9614e-f089-4a94-ae46-e10b96b79ba3 ]

   c) set `deleting` = true for any `region_snapshot` records whose
      `volume_references` went to 0 and have `deleting` = false:

```
              snapshot_id              |         snapshot_addr         | volume_references | deleting
---------------------------------------+-------------------------------+-------------------+----------
  1a800928-8f93-4cd3-9df1-4129582ffc20 | [fd00:1122:3344:102::7]:19016 |                 0 | true
  bdd9614e-f089-4a94-ae46-e10b96b79ba3 | [fd00:1122:3344:102::7]:19016 |                 0 | true
```
---
 dev-tools/omdb/tests/env.out                  |   6 +-
 dev-tools/omdb/tests/successes.out            |  12 +-
 nexus/db-model/src/region_snapshot.rs         |   3 +
 nexus/db-model/src/schema.rs                  |   3 +-
 nexus/db-queries/src/db/datastore/dataset.rs  |  16 +
 .../src/db/datastore/region_snapshot.rs       |  23 ++
 nexus/db-queries/src/db/datastore/volume.rs   | 100 +++---
 nexus/src/app/sagas/snapshot_create.rs        |   1 +
 nexus/src/app/sagas/volume_delete.rs          | 177 ++++----
 nexus/tests/integration_tests/snapshots.rs    |  36 +-
 .../integration_tests/volume_management.rs    | 308 ++++++++++++++++++
 schema/crdb/6.0.0/up1.sql                     |   1 +
 schema/crdb/6.0.0/up2.sql                     |   1 +
 schema/crdb/dbinit.sql                        |   5 +-
 14 files changed, 563 insertions(+), 129 deletions(-)
 create mode 100644 schema/crdb/6.0.0/up1.sql
 create mode 100644 schema/crdb/6.0.0/up2.sql

diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out
index eb4cd0d32d..07a6d3fae5 100644
--- a/dev-tools/omdb/tests/env.out
+++ b/dev-tools/omdb/tests/env.out
@@ -7,7 +7,7 @@ sim-b6d65341 [::1]:REDACTED_PORT - REDACTED_UUID_REDACTED_UUID_REDACTED
 ---------------------------------------------
 stderr:
 note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
-note: database schema version matches expected (5.0.0)
+note: database schema version matches expected (6.0.0)
 =============================================
 EXECUTING COMMAND: omdb ["db", "--db-url", "junk", "sleds"]
 termination: Exited(2)
@@ -172,7 +172,7 @@ stderr:
 note: database URL not specified. Will search DNS.
note: (override with --db-url or OMDB_DB_URL) note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["--dns-server", "[::1]:REDACTED_PORT", "db", "sleds"] termination: Exited(0) @@ -185,5 +185,5 @@ stderr: note: database URL not specified. Will search DNS. note: (override with --db-url or OMDB_DB_URL) note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index eb075a84ea..038f365e8e 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -8,7 +8,7 @@ external oxide-dev.test 2 create silo: "tes --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "dns", "diff", "external", "2"] termination: Exited(0) @@ -24,7 +24,7 @@ changes: names added: 1, names removed: 0 --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "dns", "names", "external", "2"] termination: Exited(0) @@ -36,7 +36,7 @@ External zone: oxide-dev.test --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "services", "list-instances"] termination: Exited(0) @@ -52,7 +52,7 @@ Nexus REDACTED_UUID_REDACTED_UUID_REDACTED [::ffff:127.0.0.1]:REDACTED_ --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "services", "list-by-sled"] termination: Exited(0) @@ -71,7 +71,7 @@ sled: sim-b6d65341 (id REDACTED_UUID_REDACTED_UUID_REDACTED) --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) ============================================= EXECUTING COMMAND: omdb ["db", "sleds"] termination: Exited(0) @@ -82,7 +82,7 @@ sim-b6d65341 [::1]:REDACTED_PORT - REDACTED_UUID_REDACTED_UUID_REDACTED --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable -note: database schema version matches expected (5.0.0) +note: database schema version matches expected (6.0.0) 
============================================= EXECUTING COMMAND: omdb ["mgs", "inventory"] termination: Exited(0) diff --git a/nexus/db-model/src/region_snapshot.rs b/nexus/db-model/src/region_snapshot.rs index 9addeb83e3..af1cf8b2b3 100644 --- a/nexus/db-model/src/region_snapshot.rs +++ b/nexus/db-model/src/region_snapshot.rs @@ -32,4 +32,7 @@ pub struct RegionSnapshot { // how many volumes reference this? pub volume_references: i64, + + // true if part of a volume's `resources_to_clean_up` already + pub deleting: bool, } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 94a770e2ca..0165ab1568 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -856,6 +856,7 @@ table! { snapshot_id -> Uuid, snapshot_addr -> Text, volume_references -> Int8, + deleting -> Bool, } } @@ -1130,7 +1131,7 @@ table! { /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(5, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(6, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/nexus/db-queries/src/db/datastore/dataset.rs b/nexus/db-queries/src/db/datastore/dataset.rs index 99972459c8..0b26789e8f 100644 --- a/nexus/db-queries/src/db/datastore/dataset.rs +++ b/nexus/db-queries/src/db/datastore/dataset.rs @@ -13,15 +13,31 @@ use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; use crate::db::model::Zpool; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; use diesel::upsert::excluded; use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; +use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use uuid::Uuid; impl DataStore { + pub async fn dataset_get(&self, dataset_id: Uuid) -> LookupResult<Dataset> { + use db::schema::dataset::dsl; + + dsl::dataset + .filter(dsl::id.eq(dataset_id)) + .select(Dataset::as_select()) + .first_async::<Dataset>( + &*self.pool_connection_unauthorized().await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// Stores a new dataset in the database. 
pub async fn dataset_upsert( &self, diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 0a707e4504..148cfe4812 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -10,9 +10,11 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::RegionSnapshot; use async_bb8_diesel::AsyncRunQueryDsl; +use async_bb8_diesel::OptionalExtension; use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; +use omicron_common::api::external::LookupResult; use uuid::Uuid; impl DataStore { @@ -31,6 +33,27 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + pub async fn region_snapshot_get( + &self, + dataset_id: Uuid, + region_id: Uuid, + snapshot_id: Uuid, + ) -> LookupResult<Option<RegionSnapshot>> { + use db::schema::region_snapshot::dsl; + + dsl::region_snapshot + .filter(dsl::dataset_id.eq(dataset_id)) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::snapshot_id.eq(snapshot_id)) + .select(RegionSnapshot::as_select()) + .first_async::<RegionSnapshot>( + &*self.pool_connection_unauthorized().await?, + ) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + pub async fn region_snapshot_remove( &self, dataset_id: Uuid, region_id: Uuid, snapshot_id: Uuid, diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index b3e82886de..b97b8451cf 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -119,6 +119,7 @@ impl DataStore { .filter( rs_dsl::snapshot_addr.eq(read_only_target.clone()), ) + .filter(rs_dsl::deleting.eq(false)) .set( rs_dsl::volume_references .eq(rs_dsl::volume_references + 1), @@ -573,9 +574,7 @@ impl DataStore { // multiple times, and that is done by soft-deleting the volume during // the transaction, and returning the previously serialized list of // resources to clean up if a soft-delete has already occurred. - // - // TODO it would be nice to make this transaction_async, but I couldn't - // get the async optional extension to work. + self.pool_connection_unauthorized() .await? .transaction_async(|conn| async move { @@ -639,7 +638,9 @@ impl DataStore { } }; - // Decrease the number of uses for each referenced region snapshot. + // Decrease the number of uses for each non-deleted referenced + // region snapshot. + use db::schema::region_snapshot::dsl; diesel::update(dsl::region_snapshot) @@ -647,12 +648,40 @@ impl DataStore { dsl::snapshot_addr .eq_any(crucible_targets.read_only_targets.clone()), ) + .filter(dsl::volume_references.gt(0)) + .filter(dsl::deleting.eq(false)) .set(dsl::volume_references.eq(dsl::volume_references - 1)) .execute_async(&conn) .await?; + // Then, note anything that was set to zero from the above + // UPDATE, and then mark all those as deleted. 
+ let snapshots_to_delete: Vec = + dsl::region_snapshot + .filter( + dsl::snapshot_addr.eq_any( + crucible_targets.read_only_targets.clone(), + ), + ) + .filter(dsl::volume_references.eq(0)) + .filter(dsl::deleting.eq(false)) + .select(RegionSnapshot::as_select()) + .load_async(&conn) + .await?; + + diesel::update(dsl::region_snapshot) + .filter( + dsl::snapshot_addr + .eq_any(crucible_targets.read_only_targets.clone()), + ) + .filter(dsl::volume_references.eq(0)) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(&conn) + .await?; + // Return what results can be cleaned up - let result = CrucibleResources::V1(CrucibleResourcesV1 { + let result = CrucibleResources::V2(CrucibleResourcesV2 { // The only use of a read-write region will be at the top level of a // Volume. These are not shared, but if any snapshots are taken this // will prevent deletion of the region. Filter out any regions that @@ -681,6 +710,7 @@ impl DataStore { .eq(0) // Despite the SQL specifying that this column is NOT NULL, // this null check is required for this function to work! + // The left join of region_snapshot might cause a null here. .or(dsl::volume_references.is_null()), ) .select((Dataset::as_select(), Region::as_select())) @@ -688,46 +718,17 @@ impl DataStore { .await? }, - // A volume (for a disk or snapshot) may reference another nested - // volume as a read-only parent, and this may be arbitrarily deep. - // After decrementing volume_references above, get the region - // snapshot records for these read_only_targets where the - // volume_references has gone to 0. Consumers of this struct will - // be responsible for deleting the read-only downstairs running - // for the snapshot and the snapshot itself. - datasets_and_snapshots: { - use db::schema::dataset::dsl as dataset_dsl; - - dsl::region_snapshot - // Only return region_snapshot records related to - // this volume that have zero references. This will - // only happen one time, on the last decrease of a - // volume containing these read-only targets. - // - // It's important to not return *every* region - // snapshot with zero references: multiple volume - // delete sub-sagas will then be issues duplicate - // DELETE calls to Crucible agents, and a request to - // delete a read-only downstairs running for a - // snapshot that doesn't exist will return a 404, - // causing the saga to error and unwind. - .filter(dsl::snapshot_addr.eq_any( - crucible_targets.read_only_targets.clone(), - )) - .filter(dsl::volume_references.eq(0)) - .inner_join( - dataset_dsl::dataset - .on(dsl::dataset_id.eq(dataset_dsl::id)), - ) - .select(( - Dataset::as_select(), - RegionSnapshot::as_select(), - )) - .get_results_async::<(Dataset, RegionSnapshot)>( - &conn, - ) - .await? - }, + // Consumers of this struct will be responsible for deleting + // the read-only downstairs running for the snapshot and the + // snapshot itself. + // + // It's important to not return *every* region snapshot with + // zero references: multiple volume delete sub-sagas will + // then be issues duplicate DELETE calls to Crucible agents, + // and a request to delete a read-only downstairs running + // for a snapshot that doesn't exist will return a 404, + // causing the saga to error and unwind. 
+ snapshots_to_delete, }); // Soft delete this volume, and serialize the resources that are to @@ -967,7 +968,7 @@ impl DataStore { #[derive(Default, Debug, Serialize, Deserialize)] pub struct CrucibleTargets { - read_only_targets: Vec, + pub read_only_targets: Vec, } // Serialize this enum into the `resources_to_clean_up` column to handle @@ -975,6 +976,7 @@ pub struct CrucibleTargets { #[derive(Debug, Serialize, Deserialize)] pub enum CrucibleResources { V1(CrucibleResourcesV1), + V2(CrucibleResourcesV2), } #[derive(Debug, Default, Serialize, Deserialize)] @@ -983,6 +985,12 @@ pub struct CrucibleResourcesV1 { pub datasets_and_snapshots: Vec<(Dataset, RegionSnapshot)>, } +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct CrucibleResourcesV2 { + pub datasets_and_regions: Vec<(Dataset, Region)>, + pub snapshots_to_delete: Vec, +} + /// Return the targets from a VolumeConstructionRequest. /// /// The targets of a volume construction request map to resources. diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index eeabf64894..9c8a33fb17 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -1280,6 +1280,7 @@ async fn ssc_start_running_snapshot( snapshot_id, snapshot_addr, volume_references: 0, // to be filled later + deleting: false, }) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index 4cd633f575..d6358d5435 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -155,39 +155,39 @@ async fn svd_delete_crucible_regions( sagactx.lookup::("crucible_resources_to_delete")?; // Send DELETE calls to the corresponding Crucible agents - match crucible_resources_to_delete { + let datasets_and_regions = match crucible_resources_to_delete { CrucibleResources::V1(crucible_resources_to_delete) => { - delete_crucible_regions( - log, - crucible_resources_to_delete.datasets_and_regions.clone(), - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to delete_crucible_regions: {:?}", - e, - )) - })?; + crucible_resources_to_delete.datasets_and_regions + } - // Remove DB records - let region_ids_to_delete = crucible_resources_to_delete - .datasets_and_regions - .iter() - .map(|(_, r)| r.id()) - .collect(); - - osagactx - .datastore() - .regions_hard_delete(log, region_ids_to_delete) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to regions_hard_delete: {:?}", - e, - )) - })?; + CrucibleResources::V2(crucible_resources_to_delete) => { + crucible_resources_to_delete.datasets_and_regions } - } + }; + + delete_crucible_regions(log, datasets_and_regions.clone()).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to delete_crucible_regions: {:?}", + e, + )) + }, + )?; + + // Remove DB records + let region_ids_to_delete = + datasets_and_regions.iter().map(|(_, r)| r.id()).collect(); + + osagactx + .datastore() + .regions_hard_delete(log, region_ids_to_delete) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to regions_hard_delete: {:?}", + e, + )) + })?; Ok(()) } @@ -202,26 +202,46 @@ async fn svd_delete_crucible_running_snapshots( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let log = sagactx.user_data().log(); + let osagactx = sagactx.user_data(); let crucible_resources_to_delete = sagactx.lookup::("crucible_resources_to_delete")?; // Send DELETE calls to the corresponding 
Crucible agents - match crucible_resources_to_delete { + let datasets_and_snapshots = match crucible_resources_to_delete { CrucibleResources::V1(crucible_resources_to_delete) => { - delete_crucible_running_snapshots( - log, - crucible_resources_to_delete.datasets_and_snapshots.clone(), - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to delete_crucible_running_snapshots: {:?}", - e, - )) - })?; + crucible_resources_to_delete.datasets_and_snapshots } - } + + CrucibleResources::V2(crucible_resources_to_delete) => { + let mut datasets_and_snapshots: Vec<_> = Vec::with_capacity( + crucible_resources_to_delete.snapshots_to_delete.len(), + ); + + for region_snapshot in + crucible_resources_to_delete.snapshots_to_delete + { + let dataset = osagactx + .datastore() + .dataset_get(region_snapshot.dataset_id) + .await + .map_err(ActionError::action_failed)?; + + datasets_and_snapshots.push((dataset, region_snapshot)); + } + + datasets_and_snapshots + } + }; + + delete_crucible_running_snapshots(log, datasets_and_snapshots.clone()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to delete_crucible_running_snapshots: {:?}", + e, + )) + })?; Ok(()) } @@ -235,26 +255,46 @@ async fn svd_delete_crucible_snapshots( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let log = sagactx.user_data().log(); + let osagactx = sagactx.user_data(); let crucible_resources_to_delete = sagactx.lookup::("crucible_resources_to_delete")?; // Send DELETE calls to the corresponding Crucible agents - match crucible_resources_to_delete { + let datasets_and_snapshots = match crucible_resources_to_delete { CrucibleResources::V1(crucible_resources_to_delete) => { - delete_crucible_snapshots( - log, - crucible_resources_to_delete.datasets_and_snapshots.clone(), - ) - .await - .map_err(|e| { - ActionError::action_failed(format!( - "failed to delete_crucible_snapshots: {:?}", - e, - )) - })?; + crucible_resources_to_delete.datasets_and_snapshots } - } + + CrucibleResources::V2(crucible_resources_to_delete) => { + let mut datasets_and_snapshots: Vec<_> = Vec::with_capacity( + crucible_resources_to_delete.snapshots_to_delete.len(), + ); + + for region_snapshot in + crucible_resources_to_delete.snapshots_to_delete + { + let dataset = osagactx + .datastore() + .dataset_get(region_snapshot.dataset_id) + .await + .map_err(ActionError::action_failed)?; + + datasets_and_snapshots.push((dataset, region_snapshot)); + } + + datasets_and_snapshots + } + }; + + delete_crucible_snapshots(log, datasets_and_snapshots.clone()) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to delete_crucible_snapshots: {:?}", + e, + )) + })?; Ok(()) } @@ -293,6 +333,31 @@ async fn svd_delete_crucible_snapshot_records( })?; } } + + CrucibleResources::V2(crucible_resources_to_delete) => { + // Remove DB records + for region_snapshot in + &crucible_resources_to_delete.snapshots_to_delete + { + osagactx + .datastore() + .region_snapshot_remove( + region_snapshot.dataset_id, + region_snapshot.region_id, + region_snapshot.snapshot_id, + ) + .await + .map_err(|e| { + ActionError::action_failed(format!( + "failed to region_snapshot_remove {} {} {}: {:?}", + region_snapshot.dataset_id, + region_snapshot.region_id, + region_snapshot.snapshot_id, + e, + )) + })?; + } + } } Ok(()) diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index d212175415..68f4cdadd2 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ 
b/nexus/tests/integration_tests/snapshots.rs @@ -1094,6 +1094,7 @@ async fn test_region_snapshot_create_idempotent( snapshot_addr: "[::]:12345".to_string(), volume_references: 1, + deleting: false, }; datastore.region_snapshot_create(region_snapshot.clone()).await.unwrap(); @@ -1287,13 +1288,16 @@ async fn test_multiple_deletes_not_sent(cptestctx: &ControlPlaneTestContext) { .unwrap(); let resources_1 = match resources_1 { - db::datastore::CrucibleResources::V1(resources_1) => resources_1, + db::datastore::CrucibleResources::V1(_) => panic!("using old style!"), + db::datastore::CrucibleResources::V2(resources_1) => resources_1, }; let resources_2 = match resources_2 { - db::datastore::CrucibleResources::V1(resources_2) => resources_2, + db::datastore::CrucibleResources::V1(_) => panic!("using old style!"), + db::datastore::CrucibleResources::V2(resources_2) => resources_2, }; let resources_3 = match resources_3 { - db::datastore::CrucibleResources::V1(resources_3) => resources_3, + db::datastore::CrucibleResources::V1(_) => panic!("using old style!"), + db::datastore::CrucibleResources::V2(resources_3) => resources_3, }; // No region deletions yet, these are just snapshot deletes @@ -1304,24 +1308,24 @@ async fn test_multiple_deletes_not_sent(cptestctx: &ControlPlaneTestContext) { // But there are snapshots to delete - assert!(!resources_1.datasets_and_snapshots.is_empty()); - assert!(!resources_2.datasets_and_snapshots.is_empty()); - assert!(!resources_3.datasets_and_snapshots.is_empty()); + assert!(!resources_1.snapshots_to_delete.is_empty()); + assert!(!resources_2.snapshots_to_delete.is_empty()); + assert!(!resources_3.snapshots_to_delete.is_empty()); - // Assert there are no overlaps in the datasets_and_snapshots to delete. + // Assert there are no overlaps in the snapshots_to_delete to delete. 
- for tuple in &resources_1.datasets_and_snapshots { - assert!(!resources_2.datasets_and_snapshots.contains(tuple)); - assert!(!resources_3.datasets_and_snapshots.contains(tuple)); + for tuple in &resources_1.snapshots_to_delete { + assert!(!resources_2.snapshots_to_delete.contains(tuple)); + assert!(!resources_3.snapshots_to_delete.contains(tuple)); } - for tuple in &resources_2.datasets_and_snapshots { - assert!(!resources_1.datasets_and_snapshots.contains(tuple)); - assert!(!resources_3.datasets_and_snapshots.contains(tuple)); + for tuple in &resources_2.snapshots_to_delete { + assert!(!resources_1.snapshots_to_delete.contains(tuple)); + assert!(!resources_3.snapshots_to_delete.contains(tuple)); } - for tuple in &resources_3.datasets_and_snapshots { - assert!(!resources_1.datasets_and_snapshots.contains(tuple)); - assert!(!resources_2.datasets_and_snapshots.contains(tuple)); + for tuple in &resources_3.snapshots_to_delete { + assert!(!resources_1.snapshots_to_delete.contains(tuple)); + assert!(!resources_2.snapshots_to_delete.contains(tuple)); } } diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 70d34fb778..e263593def 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -19,6 +19,7 @@ use nexus_test_utils::resource_helpers::DiskTest; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::views; +use nexus_types::identity::Asset; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; @@ -1813,6 +1814,313 @@ async fn test_volume_checkout_updates_sparse_mid_multiple_gen( volume_match_gen(new_vol, vec![Some(8), None, Some(10)]); } +/// Test that the Crucible agent's port reuse does not confuse +/// `decrease_crucible_resource_count_and_soft_delete_volume`, due to the +/// `[ipv6]:port` targets being reused. +#[nexus_test] +async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.apictx().nexus; + let datastore = nexus.datastore(); + + // Four zpools, one dataset each + let mut disk_test = DiskTest::new(&cptestctx).await; + disk_test + .add_zpool_with_dataset(&cptestctx, DiskTest::DEFAULT_ZPOOL_SIZE_GIB) + .await; + + // This bug occurs when region_snapshot records share a snapshot_addr, so + // insert those here manually. + + // (dataset_id, region_id, snapshot_id, snapshot_addr) + let region_snapshots = vec![ + // first snapshot-create + ( + disk_test.zpools[0].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101:7]:19016"), + ), + ( + disk_test.zpools[1].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:102:7]:19016"), + ), + ( + disk_test.zpools[2].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103:7]:19016"), + ), + // second snapshot-create + ( + disk_test.zpools[0].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101:7]:19016"), // duplicate! 
+ ), + ( + disk_test.zpools[3].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:104:7]:19016"), + ), + ( + disk_test.zpools[2].datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103:7]:19017"), + ), + ]; + + // First, three `region_snapshot` records created in the snapshot-create + // saga, which are then used to make snapshot's volume construction request + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + let volume_id = Uuid::new_v4(); + let volume = datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[0].3.clone(), + region_snapshots[1].3.clone(), + region_snapshots[2].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Sanity check + + assert_eq!(volume.id(), volume_id); + + // Make sure the volume has only three read-only targets: + + let crucible_targets = datastore + .read_only_resources_associated_with_volume(volume_id) + .await + .unwrap(); + assert_eq!(crucible_targets.read_only_targets.len(), 3); + + // Also validate the volume's region_snapshots got incremented by + // volume_create + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 1); + assert_eq!(region_snapshot.deleting, false); + } + + // Soft delete the volume, and validate that only three region_snapshot + // records are returned. + + let cr = datastore + .decrease_crucible_resource_count_and_soft_delete_volume(volume_id) + .await + .unwrap(); + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 0); + assert_eq!(region_snapshot.deleting, true); + } + + match cr { + nexus_db_queries::db::datastore::CrucibleResources::V1(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.datasets_and_snapshots.len(), 3); + } + + nexus_db_queries::db::datastore::CrucibleResources::V2(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.snapshots_to_delete.len(), 3); + } + } + + // Now, let's say we're at a spot where the running snapshots have been + // deleted, but before volume_hard_delete or region_snapshot_remove are + // called. Pretend another snapshot-create and snapshot-delete snuck in + // here, and the second snapshot hits a agent that reuses the first target. 
+ + for i in 3..6 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + let volume_id = Uuid::new_v4(); + let volume = datastore + .volume_create(nexus_db_model::Volume::new( + volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[3].3.clone(), + region_snapshots[4].3.clone(), + region_snapshots[5].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Sanity check + + assert_eq!(volume.id(), volume_id); + + // Make sure the volume has only three read-only targets: + + let crucible_targets = datastore + .read_only_resources_associated_with_volume(volume_id) + .await + .unwrap(); + assert_eq!(crucible_targets.read_only_targets.len(), 3); + + // Also validate only the volume's region_snapshots got incremented by + // volume_create. + + for i in 0..3 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 0); + assert_eq!(region_snapshot.deleting, true); + } + for i in 3..6 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 1); + assert_eq!(region_snapshot.deleting, false); + } + + // Soft delete the volume, and validate that only three region_snapshot + // records are returned. 
+ + let cr = datastore + .decrease_crucible_resource_count_and_soft_delete_volume(volume_id) + .await + .unwrap(); + + // Make sure every region_snapshot is now 0, and deleting + + for i in 0..6 { + let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; + let region_snapshot = datastore + .region_snapshot_get(dataset_id, region_id, snapshot_id) + .await + .unwrap() + .unwrap(); + + assert_eq!(region_snapshot.volume_references, 0); + assert_eq!(region_snapshot.deleting, true); + } + + match cr { + nexus_db_queries::db::datastore::CrucibleResources::V1(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.datasets_and_snapshots.len(), 3); + } + + nexus_db_queries::db::datastore::CrucibleResources::V2(cr) => { + assert!(cr.datasets_and_regions.is_empty()); + assert_eq!(cr.snapshots_to_delete.len(), 3); + } + } +} + #[nexus_test] async fn test_disk_create_saga_unwinds_correctly( cptestctx: &ControlPlaneTestContext, diff --git a/schema/crdb/6.0.0/up1.sql b/schema/crdb/6.0.0/up1.sql new file mode 100644 index 0000000000..4a3cdc302e --- /dev/null +++ b/schema/crdb/6.0.0/up1.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot ADD COLUMN IF NOT EXISTS deleting BOOL NOT NULL DEFAULT false; diff --git a/schema/crdb/6.0.0/up2.sql b/schema/crdb/6.0.0/up2.sql new file mode 100644 index 0000000000..77c136a3bf --- /dev/null +++ b/schema/crdb/6.0.0/up2.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region_snapshot ALTER COLUMN deleting DROP DEFAULT; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index ad09092f8f..a62cbae5ea 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -505,6 +505,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region_snapshot ( /* How many volumes reference this? */ volume_references INT8 NOT NULL, + /* Is this currently part of some resources_to_delete? */ + deleting BOOL NOT NULL, + PRIMARY KEY (dataset_id, region_id, snapshot_id) ); @@ -2574,7 +2577,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '5.0.0', NULL) + ( TRUE, NOW(), NOW(), '6.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;
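
The two `6.0.0` migration files above follow a two-step pattern: `up1.sql` adds
`deleting` with a `DEFAULT` so existing rows are backfilled to `false`, and
`up2.sql` drops the default so new code must always set the column explicitly.
A hedged sketch of checking the result by hand (the queries are illustrative
and not part of the patch; the address is the example value from the commit
message):

```
-- Illustrative only: verify the new column after the 6.0.0 migration.
SHOW COLUMNS FROM omicron.public.region_snapshot;

-- Duplicated snapshot_addr rows can now be told apart by `deleting`:
SELECT snapshot_id, snapshot_addr, volume_references, deleting
  FROM omicron.public.region_snapshot
 WHERE snapshot_addr = '[fd00:1122:3344:102::7]:19016';
```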