From ae4282a3aa5adaa5c831af38af82ca84cf9076b6 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Fri, 4 Oct 2024 16:29:13 +0000
Subject: [PATCH 01/24] Implement record-based Crucible reference counting

Crucible volumes are created by layering read-write regions over a
hierarchy of read-only resources. Originally only a region snapshot
could be used as a read-only resource for a volume. With the
introduction of read-only regions (created during the region snapshot
replacement process) this is no longer true!

Read-only resources can be used by many volumes, and because of this
they need to have a reference count so they can be deleted when they're
no longer referenced. The region_snapshot table uses a
`volume_references` column, which counts how many uses there are. The
region table does not have this column; moreover, a simple integer
works for reference counting but does not tell you _which_ volume each
use comes from. This can be determined (see omdb's "validate volume
references" command), but it's information that is tossed out, as Nexus
knows which volumes use which resources! Instead, record which
read-only resources a volume uses in a new table.

As part of the schema change to add the new `volume_resource_usage`
table, a migration is included that will create the appropriate records
for all existing region snapshots.

In testing, a few bugs were found, the worst being that read-only
regions did not have their `read_only` column set to true. This would
be a problem if any read-only regions had been created, but they are
currently only created during region snapshot replacement. To detect
whether any such regions were created, find all regions that were
allocated for a snapshot volume:

    SELECT id FROM region WHERE volume_id IN (SELECT volume_id FROM snapshot);

A similar bug was found in the simulated Crucible agent.

This commit also reverts #6728, which had disabled region snapshot
replacement due to the lack of read-only region reference counting.
With that reference counting in place, it can be enabled once again.
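For reference, here is a rough sketch of the new enum and table,
inferred from the Diesel `table!` definition added to
nexus/db-model/src/schema.rs in this patch. It is only an illustration:
the authoritative DDL lives in the
schema/crdb/crucible-ref-count-records/up01.sql through up08.sql
migrations and in schema/crdb/dbinit.sql (not reproduced here), and the
`omicron.public` naming, NOT NULL constraints, and IF NOT EXISTS guards
below are assumptions.

    CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type
        AS ENUM (
            'read_only_region',
            'region_snapshot'
        );

    CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage (
        usage_id UUID NOT NULL PRIMARY KEY,

        -- the volume holding a reference to a read-only resource
        volume_id UUID NOT NULL,

        usage_type omicron.public.volume_resource_usage_type NOT NULL,

        -- populated when usage_type = 'read_only_region'
        region_id UUID,

        -- populated when usage_type = 'region_snapshot'
        region_snapshot_dataset_id UUID,
        region_snapshot_region_id UUID,
        region_snapshot_snapshot_id UUID
    );

One of the migration steps then backfills a usage record for each
existing region snapshot reference, as described above, so that the new
table's contents agree with the old `volume_references` counts.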
--- nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 16 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/volume_resource_usage.rs | 132 ++ nexus/db-queries/src/db/datastore/mod.rs | 4 +- nexus/db-queries/src/db/datastore/region.rs | 2 +- nexus/db-queries/src/db/datastore/volume.rs | 1576 ++++++++++--- nexus/db-queries/src/db/queries/mod.rs | 1 - nexus/db-queries/src/db/queries/volume.rs | 114 - nexus/src/app/background/init.rs | 32 +- .../region_snapshot_replacement_finish.rs | 12 +- ...on_snapshot_replacement_garbage_collect.rs | 23 +- .../region_snapshot_replacement_start.rs | 15 +- .../tasks/region_snapshot_replacement_step.rs | 36 +- nexus/src/app/sagas/disk_create.rs | 17 + .../region_snapshot_replacement_start.rs | 1 + .../integration_tests/volume_management.rs | 2091 ++++++++++++++++- .../crdb/crucible-ref-count-records/up01.sql | 1 + .../crdb/crucible-ref-count-records/up02.sql | 3 + .../crdb/crucible-ref-count-records/up03.sql | 4 + .../crdb/crucible-ref-count-records/up04.sql | 15 + .../crdb/crucible-ref-count-records/up05.sql | 3 + .../crdb/crucible-ref-count-records/up06.sql | 3 + .../crdb/crucible-ref-count-records/up07.sql | 8 + .../crdb/crucible-ref-count-records/up08.sql | 29 + schema/crdb/dbinit.sql | 45 +- sled-agent/src/sim/storage.rs | 2 +- 27 files changed, 3678 insertions(+), 512 deletions(-) create mode 100644 nexus/db-model/src/volume_resource_usage.rs delete mode 100644 nexus/db-queries/src/db/queries/volume.rs create mode 100644 schema/crdb/crucible-ref-count-records/up01.sql create mode 100644 schema/crdb/crucible-ref-count-records/up02.sql create mode 100644 schema/crdb/crucible-ref-count-records/up03.sql create mode 100644 schema/crdb/crucible-ref-count-records/up04.sql create mode 100644 schema/crdb/crucible-ref-count-records/up05.sql create mode 100644 schema/crdb/crucible-ref-count-records/up06.sql create mode 100644 schema/crdb/crucible-ref-count-records/up07.sql create mode 100644 schema/crdb/crucible-ref-count-records/up08.sql diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 250fbcb369..7473d5f6b1 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -107,6 +107,7 @@ mod vmm; mod vni; mod volume; mod volume_repair; +mod volume_resource_usage; mod vpc; mod vpc_firewall_rule; mod vpc_route; @@ -215,6 +216,7 @@ pub use vmm_state::*; pub use vni::*; pub use volume::*; pub use volume_repair::*; +pub use volume_resource_usage::*; pub use vpc::*; pub use vpc_firewall_rule::*; pub use vpc_route::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index d9e2c43e75..3eeee099eb 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1967,3 +1967,19 @@ joinable!(instance_ssh_key -> instance (instance_id)); allow_tables_to_appear_in_same_query!(sled, sled_instance); joinable!(network_interface -> probe (parent_id)); + +table! { + volume_resource_usage (usage_id) { + usage_id -> Uuid, + + volume_id -> Uuid, + + usage_type -> crate::VolumeResourceUsageTypeEnum, + + region_id -> Nullable, + + region_snapshot_dataset_id -> Nullable, + region_snapshot_region_id -> Nullable, + region_snapshot_snapshot_id -> Nullable, + } +} diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 12e03e6d4e..c9398114c1 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. 
Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(107, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(108, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
         // |  leaving the first copy as an example for the next person.
         // v
         // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+        KnownVersion::new(108, "crucible-ref-count-records"),
         KnownVersion::new(107, "add-instance-boot-disk"),
         KnownVersion::new(106, "dataset-kinds-update"),
         KnownVersion::new(105, "inventory-nvme-firmware"),
diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs
new file mode 100644
index 0000000000..0095baf471
--- /dev/null
+++ b/nexus/db-model/src/volume_resource_usage.rs
@@ -0,0 +1,132 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use super::impl_enum_type;
+use crate::schema::volume_resource_usage;
+use uuid::Uuid;
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(
+        postgres_type(name = "volume_resource_usage_type", schema = "public")
+    )]
+    pub struct VolumeResourceUsageTypeEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq, Eq, Hash)]
+    #[diesel(sql_type = VolumeResourceUsageTypeEnum)]
+    pub enum VolumeResourceUsageType;
+
+    ReadOnlyRegion => b"read_only_region"
+    RegionSnapshot => b"region_snapshot"
+);
+
+/// Crucible volumes are created by layering read-write regions over a
+/// hierarchy of read-only resources. Originally only a region snapshot could
+/// be used as a read-only resource for a volume. With the introduction of
+/// read-only regions (created during the region snapshot replacement process)
+/// this is no longer true.
+///
+/// Read-only resources can be used by many volumes, and because of this they
+/// need to have a reference count so they can be deleted when they're no
+/// longer referenced. The region_snapshot table uses a `volume_references`
+/// column, which counts how many uses there are. The region table does not
+/// have this column; moreover, a simple integer works for reference counting
+/// but does not tell you _which_ volume each use comes from. This can be
+/// determined (see omdb's "validate volume references" command), but it's
+/// information that is tossed out, as Nexus knows which volumes use which
+/// resources! Instead, record which read-only resources a volume uses here.
+///
+/// Note: users should not use this object directly, and instead use the
+/// [`VolumeResourceUsage`] enum, which is type-safe and will convert to and
+/// from a [`VolumeResourceUsageRecord`] when interacting with the DB.
+#[derive( + Queryable, Insertable, Debug, Clone, Selectable, PartialEq, Eq, Hash, +)] +#[diesel(table_name = volume_resource_usage)] +pub struct VolumeResourceUsageRecord { + pub usage_id: Uuid, + + pub volume_id: Uuid, + + pub usage_type: VolumeResourceUsageType, + + pub region_id: Option, + + pub region_snapshot_dataset_id: Option, + pub region_snapshot_region_id: Option, + pub region_snapshot_snapshot_id: Option, +} + +#[derive(Debug, Clone)] +pub enum VolumeResourceUsage { + ReadOnlyRegion { region_id: Uuid }, + + RegionSnapshot { dataset_id: Uuid, region_id: Uuid, snapshot_id: Uuid }, +} + +impl VolumeResourceUsageRecord { + pub fn new(volume_id: Uuid, usage: VolumeResourceUsage) -> Self { + match usage { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + VolumeResourceUsageRecord { + usage_id: Uuid::new_v4(), + volume_id, + usage_type: VolumeResourceUsageType::ReadOnlyRegion, + + region_id: Some(region_id), + + region_snapshot_dataset_id: None, + region_snapshot_region_id: None, + region_snapshot_snapshot_id: None, + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => VolumeResourceUsageRecord { + usage_id: Uuid::new_v4(), + volume_id, + usage_type: VolumeResourceUsageType::RegionSnapshot, + + region_id: None, + + region_snapshot_dataset_id: Some(dataset_id), + region_snapshot_region_id: Some(region_id), + region_snapshot_snapshot_id: Some(snapshot_id), + }, + } + } +} + +impl From for VolumeResourceUsage { + fn from(record: VolumeResourceUsageRecord) -> VolumeResourceUsage { + match record.usage_type { + VolumeResourceUsageType::ReadOnlyRegion => { + VolumeResourceUsage::ReadOnlyRegion { + region_id: record + .region_id + .expect("valid read-only region usage record"), + } + } + + VolumeResourceUsageType::RegionSnapshot => { + VolumeResourceUsage::RegionSnapshot { + dataset_id: record + .region_snapshot_dataset_id + .expect("valid region snapshot usage record"), + + region_id: record + .region_snapshot_region_id + .expect("valid region snapshot usage record"), + + snapshot_id: record + .region_snapshot_snapshot_id + .expect("valid region snapshot usage record"), + } + } + } + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index ec317c184f..7aaff8ccb4 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -793,7 +793,7 @@ mod test { } #[derive(Debug)] - struct TestDatasets { + pub(crate) struct TestDatasets { // eligible and ineligible aren't currently used, but are probably handy // for the future. 
#[allow(dead_code)] @@ -810,7 +810,7 @@ mod test { type SledToDatasetMap = HashMap>; impl TestDatasets { - async fn create( + pub(crate) async fn create( opctx: &OpContext, datastore: Arc, num_eligible_sleds: usize, diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index df733af5b5..f555e0af6b 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -264,7 +264,7 @@ impl DataStore { block_size, blocks_per_extent, extent_count, - read_only: false, + read_only: maybe_snapshot_id.is_some(), }, allocation_strategy, num_regions_required, diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 3bd0ef41ed..d7d6b10081 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -7,6 +7,7 @@ use super::DataStore; use crate::db; use crate::db::datastore::OpContext; +use crate::db::datastore::RunnableQuery; use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; @@ -22,9 +23,11 @@ use crate::db::model::UpstairsRepairNotification; use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::UpstairsRepairProgress; use crate::db::model::Volume; +use crate::db::model::VolumeResourceUsage; +use crate::db::model::VolumeResourceUsageRecord; +use crate::db::model::VolumeResourceUsageType; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; -use crate::db::queries::volume::*; use crate::db::DbConnection; use crate::transaction_retry::OptionalError; use anyhow::bail; @@ -53,6 +56,7 @@ use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; use std::collections::VecDeque; +use std::net::AddrParseError; use std::net::SocketAddr; use std::net::SocketAddrV6; use uuid::Uuid; @@ -91,19 +95,187 @@ enum VolumeGetError { InvalidVolume(String), } +#[derive(Debug, thiserror::Error)] +enum VolumeCreationError { + #[error("Error from Volume creation: {0}")] + Public(Error), + + #[error("Serde error during Volume creation: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Address parsing error during Volume creation: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), +} + impl DataStore { - pub async fn volume_create(&self, volume: Volume) -> CreateResult { + async fn volume_create_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume: Volume, + crucible_targets: CrucibleTargets, + ) -> Result { use db::schema::volume::dsl; - #[derive(Debug, thiserror::Error)] - enum VolumeCreationError { - #[error("Error from Volume creation: {0}")] - Public(Error), + let maybe_volume: Option = dsl::volume + .filter(dsl::id.eq(volume.id())) + .select(Volume::as_select()) + .first_async(conn) + .await + .optional()?; - #[error("Serde error during Volume creation: {0}")] - SerdeError(#[from] serde_json::Error), + // If the volume existed already, return it and do not increase usage + // counts. 
+ if let Some(volume) = maybe_volume { + return Ok(volume); + } + + let volume: Volume = diesel::insert_into(dsl::volume) + .values(volume.clone()) + .returning(Volume::as_returning()) + .get_result_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeCreationError::Public(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Volume, + volume.id().to_string().as_str(), + ), + )) + }) + })?; + + // Increase the usage count for the read-only Crucible resources + use db::schema::volume_resource_usage::dsl as ru_dsl; + + for read_only_target in crucible_targets.read_only_targets { + let sub_err = OptionalError::new(); + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &sub_err, + &read_only_target, + ) + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(VolumeCreationError::AddressParseError(sub_err)) + } else { + e + } + })?; + + match maybe_usage { + Some(usage) => { + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(VolumeResourceUsageRecord::new( + volume.id(), + usage, + )) + .execute_async(conn) + .await?; + } + + None => { + // Something went wrong, bail out - we can't create this + // Volume if we can't record its resource usage correctly + return Err(err.bail( + VolumeCreationError::CouldNotFindResource( + read_only_target, + ), + )); + } + } + } + + Ok(volume) + } + + async fn target_to_region( + conn: &async_bb8_diesel::Connection, + err: &OptionalError, + target: &str, + read_only: bool, + ) -> Result, diesel::result::Error> { + let address: SocketAddrV6 = target.parse().map_err(|e| err.bail(e))?; + let ip: db::model::Ipv6Addr = address.ip().into(); + + // Match region by dataset id and target port + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl as region_dsl; + + dataset_dsl::dataset + .inner_join( + region_dsl::region + .on(region_dsl::dataset_id.eq(dataset_dsl::id)), + ) + .filter(dataset_dsl::ip.eq(ip)) + .filter( + region_dsl::port + .eq(Some::(address.port().into())), + ) + .filter(region_dsl::read_only.eq(read_only)) + .select(Region::as_select()) + .get_result_async::(conn) + .await + .optional() + } + + async fn read_only_target_to_volume_resource_usage( + conn: &async_bb8_diesel::Connection, + err: &OptionalError, + read_only_target: &str, + ) -> Result, diesel::result::Error> { + // Easy case: it's a region snapshot, and we can match by the snapshot + // address directly + + let maybe_region_snapshot = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::snapshot_addr.eq(read_only_target.to_string())) + .filter(dsl::deleting.eq(false)) + .select(RegionSnapshot::as_select()) + .get_result_async::(conn) + .await + .optional()? + }; + + if let Some(region_snapshot) = maybe_region_snapshot { + return Ok(Some(VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + })); } + // Less easy case: it's a read-only region, and we have to match by + // dataset ip and region port + + let maybe_region = Self::target_to_region( + conn, + err, + read_only_target, + true, // read-only + ) + .await?; + + if let Some(region) = maybe_region { + return Ok(Some(VolumeResourceUsage::ReadOnlyRegion { + region_id: region.id(), + })); + } + + // If the resource was hard-deleted, or in the off chance that the + // region didn't have an assigned port, return None here. 
+ Ok(None) + } + + pub async fn volume_create(&self, volume: Volume) -> CreateResult { // Grab all the targets that the volume construction request references. // Do this outside the transaction, as the data inside volume doesn't // change and this would simply add to the transaction time. @@ -132,67 +304,13 @@ impl DataStore { let crucible_targets = crucible_targets.clone(); let volume = volume.clone(); async move { - let maybe_volume: Option = dsl::volume - .filter(dsl::id.eq(volume.id())) - .select(Volume::as_select()) - .first_async(&conn) - .await - .optional()?; - - // If the volume existed already, return it and do not increase - // usage counts. - if let Some(volume) = maybe_volume { - return Ok(volume); - } - - // TODO do we need on_conflict do_nothing here? if the transaction - // model is read-committed, the SELECT above could return nothing, - // and the INSERT here could still result in a conflict. - // - // See also https://github.com/oxidecomputer/omicron/issues/1168 - let volume: Volume = diesel::insert_into(dsl::volume) - .values(volume.clone()) - .on_conflict(dsl::id) - .do_nothing() - .returning(Volume::as_returning()) - .get_result_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Volume, - volume.id().to_string().as_str(), - ), - ), - ) - }) - })?; - - // Increase the usage count for Crucible resources according to the - // contents of the volume. - - // Increase the number of uses for each referenced region snapshot. - use db::schema::region_snapshot::dsl as rs_dsl; - for read_only_target in &crucible_targets.read_only_targets - { - diesel::update(rs_dsl::region_snapshot) - .filter( - rs_dsl::snapshot_addr - .eq(read_only_target.clone()), - ) - .filter(rs_dsl::deleting.eq(false)) - .set( - rs_dsl::volume_references - .eq(rs_dsl::volume_references + 1), - ) - .execute_async(&conn) - .await?; - } - - Ok(volume) + Self::volume_create_txn( + &conn, + err, + volume, + crucible_targets, + ) + .await } }) .await @@ -200,10 +318,22 @@ impl DataStore { if let Some(err) = err.take() { match err { VolumeCreationError::Public(err) => err, + VolumeCreationError::SerdeError(err) => { Error::internal_error(&format!( - "Transaction error: {}", - err + "SerdeError error: {err}" + )) + } + + VolumeCreationError::CouldNotFindResource(s) => { + Error::internal_error(&format!( + "CouldNotFindResource error: {s}" + )) + } + + VolumeCreationError::AddressParseError(err) => { + Error::internal_error(&format!( + "AddressParseError error: {err}" )) } } @@ -213,18 +343,27 @@ impl DataStore { }) } - /// Return a `Option` based on id, even if it's soft deleted. - pub async fn volume_get( - &self, + async fn volume_get_impl( + conn: &async_bb8_diesel::Connection, volume_id: Uuid, - ) -> LookupResult> { + ) -> Result, diesel::result::Error> { use db::schema::volume::dsl; dsl::volume .filter(dsl::id.eq(volume_id)) .select(Volume::as_select()) - .first_async::(&*self.pool_connection_unauthorized().await?) + .first_async::(conn) .await .optional() + } + + /// Return a `Option` based on id, even if it's soft deleted. 
+ pub async fn volume_get( + &self, + volume_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_unauthorized().await?; + Self::volume_get_impl(&conn, volume_id) + .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -241,6 +380,105 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + fn volume_usage_records_for_resource_query( + resource: VolumeResourceUsage, + ) -> impl RunnableQuery { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + dsl::volume_resource_usage + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .into_boxed() + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => dsl::volume_resource_usage + .filter( + dsl::usage_type.eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .into_boxed(), + } + } + + /// For a given VolumeResourceUsage, return all found usage records for it. + pub async fn volume_usage_records_for_resource( + &self, + resource: VolumeResourceUsage, + ) -> ListResultVec { + let conn = self.pool_connection_unauthorized().await?; + + Self::volume_usage_records_for_resource_query(resource) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// When moving a resource from one volume to another, call this to update + /// the corresponding volume resource usage record + pub async fn swap_volume_usage_records_for_resources( + conn: &async_bb8_diesel::Connection, + resource: VolumeResourceUsage, + from_volume_id: Uuid, + to_volume_id: Uuid, + ) -> Result<(), diesel::result::Error> { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + } + + Ok(()) + } + async fn volume_checkout_allowed( reason: &VolumeCheckoutReason, vcr: &VolumeConstructionRequest, @@ -781,17 +1019,33 @@ impl DataStore { } /// Find regions for deleted volumes that do not have associated region - /// snapshots. 
+ /// snapshots and are not being used by any other non-deleted volumes pub async fn find_deleted_volume_regions( &self, ) -> ListResultVec<(Dataset, Region, Option, Volume)> { + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("find_deleted_volume_regions") + .transaction(&conn, |conn| async move { + Self::find_deleted_volume_regions_txn(&conn).await + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + async fn find_deleted_volume_regions_txn( + conn: &async_bb8_diesel::Connection, + ) -> Result< + Vec<(Dataset, Region, Option, Volume)>, + diesel::result::Error, + > { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; use db::schema::volume::dsl as volume_dsl; + use db::schema::volume_resource_usage::dsl as ru_dsl; // Find all regions and datasets - region_dsl::region + let tuples_superset = region_dsl::region .inner_join( volume_dsl::volume.on(region_dsl::volume_id.eq(volume_dsl::id)), ) @@ -824,9 +1078,29 @@ impl DataStore { Option::::as_select(), Volume::as_select(), )) - .load_async(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .load_async(conn) + .await?; + + // Filter out read-only regions that are still being used by volumes + let mut tuples = Vec::with_capacity(tuples_superset.len()); + + for (dataset, region, region_snapshot, volume) in tuples_superset { + let region_usage_left = ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(ru_dsl::region_id.eq(region.id())) + .count() + .get_result_async::(conn) + .await?; + + if region_usage_left == 0 { + tuples.push((dataset, region, region_snapshot, volume)); + } + } + + Ok(tuples) } pub async fn read_only_resources_associated_with_volume( @@ -857,8 +1131,15 @@ enum SoftDeleteTransactionError { #[error("Serde error decreasing Crucible resources: {0}")] SerdeError(#[from] serde_json::Error), - #[error("Updated {0} database rows, expected 1")] - UnexpectedDatabaseUpdate(usize), + #[error("Updated {0} database rows in {1}, expected 1")] + UnexpectedDatabaseUpdate(usize, String), + + // XXX is this an error? delete volume anyway, else we're stuck? + #[error("Could not match resource to {0}")] + CouldNotFindResource(String), + + #[error("Address parsing error during Volume soft-delete: {0}")] + AddressParseError(#[from] AddrParseError), } impl DataStore { @@ -923,8 +1204,8 @@ impl DataStore { .map_err(|e| err.bail(e.into()))?; // Grab all the targets that the volume construction request references. - // Do this _inside_ the transaction, as the read-only parent data inside - // volume can change as a result of region snapshot replacement. + // Do this _inside_ the transaction, as the data inside volumes can + // change as a result of region / region snapshot replacement. let crucible_targets = { let mut crucible_targets = CrucibleTargets::default(); read_only_resources_associated_with_volume( @@ -934,52 +1215,237 @@ impl DataStore { crucible_targets }; - // Decrease the number of references for each region snapshot that a - // volume references, returning the updated rows. 
- let updated_region_snapshots = ConditionallyDecreaseReferences::new( - crucible_targets.read_only_targets, - ) - .get_results_async::(conn) - .await?; + // Decrease the number of references for each resource that a volume + // references, collecting the regions and region snapshots that were + // freed up for deletion. - // Return all regions for the volume to be cleaned up. - let regions: Vec = { - use db::schema::region::dsl as region_dsl; - use db::schema::region_snapshot::dsl as region_snapshot_dsl; + let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); - region_dsl::region - .left_join( - region_snapshot_dsl::region_snapshot - .on(region_snapshot_dsl::region_id.eq(region_dsl::id)), - ) - .filter( - region_snapshot_dsl::volume_references - .eq(0) - .or(region_snapshot_dsl::volume_references.is_null()), + let mut regions: Vec = + Vec::with_capacity(3 * num_read_write_subvolumes); + + let mut region_snapshots: Vec = + Vec::with_capacity(crucible_targets.read_only_targets.len()); + + // First, grab read-write regions - they're not shared, but they are + // not candidates for deletion if there are region snapshots + { + let mut read_write_targets = + Vec::with_capacity(3 * num_read_write_subvolumes); + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let sub_err = OptionalError::new(); + + let maybe_region = Self::target_to_region( + conn, &sub_err, &target, false, // read-write ) - .filter(region_dsl::volume_id.eq(volume_id)) - .select(Region::as_select()) - .get_results_async(conn) - .await? - .into_iter() - .map(|region| region.id()) - .collect() - }; + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(SoftDeleteTransactionError::AddressParseError( + sub_err, + )) + } else { + e + } + })?; - // Return the region snapshots that had their volume_references updated - // to 0 (and had the deleting flag set) to be cleaned up. - let region_snapshots = updated_region_snapshots - .into_iter() - .filter(|region_snapshot| { - region_snapshot.volume_references == 0 - && region_snapshot.deleting - }) - .map(|region_snapshot| RegionSnapshotV3 { - dataset: region_snapshot.dataset_id, - region: region_snapshot.region_id, - snapshot: region_snapshot.snapshot_id, - }) - .collect(); + let Some(region) = maybe_region else { + return Err(err.bail( + SoftDeleteTransactionError::CouldNotFindResource( + format!("could not find resource for {target}"), + ), + )); + }; + + // Filter out regions that have any region-snapshots + let region_snapshot_count: i64 = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::region_id.eq(region.id())) + .count() + .get_result_async::(conn) + .await? 
+ }; + + if region_snapshot_count == 0 { + regions.push(region.id()); + } + } + } + + for read_only_target in &crucible_targets.read_only_targets { + use db::schema::volume_resource_usage::dsl as ru_dsl; + + let sub_err = OptionalError::new(); + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &sub_err, + read_only_target, + ) + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(SoftDeleteTransactionError::AddressParseError( + sub_err, + )) + } else { + e + } + })?; + + let Some(usage) = maybe_usage else { + return Err(err.bail( + SoftDeleteTransactionError::CouldNotFindResource(format!( + "could not find resource for {read_only_target}" + )), + )); + }; + + // For each read-only resource, remove the associated volume + // resource usage record for this volume. Only return a resource for + // deletion if no more associated volume usage records are found. + match usage { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::ReadOnlyRegion, + ), + ) + .filter(ru_dsl::region_id.eq(Some(region_id))) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage (region)".into(), + ) + )); + } + + let region_usage_left = ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(ru_dsl::region_id.eq(region_id)) + .count() + .get_result_async::(conn) + .await?; + + if region_usage_left == 0 { + regions.push(region_id); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage \ + (region_snapshot)".into(), + ) + )); + } + + let region_snapshot_usage_left = + ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .count() + .get_result_async::(conn) + .await?; + + if region_snapshot_usage_left == 0 { + // Don't forget to set `deleting`! 
see: omicron#4095 + { + use db::schema::region_snapshot::dsl; + let updated_rows = + diesel::update(dsl::region_snapshot) + .filter(dsl::dataset_id.eq(dataset_id)) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::snapshot_id.eq(snapshot_id)) + .filter( + dsl::snapshot_addr + .eq(read_only_target.clone()), + ) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting".into(), + ) + )); + } + } + + region_snapshots.push(RegionSnapshotV3 { + dataset: dataset_id, + region: region_id, + snapshot: snapshot_id, + }); + } + } + } + } let resources_to_delete = CrucibleResources::V3(CrucibleResourcesV3 { regions, @@ -1005,6 +1471,7 @@ impl DataStore { return Err(err.bail( SoftDeleteTransactionError::UnexpectedDatabaseUpdate( updated_rows, + "volume".into(), ), )); } @@ -1053,11 +1520,20 @@ impl DataStore { ) -> Result { #[derive(Debug, thiserror::Error)] enum RemoveReadOnlyParentError { - #[error("Serde error removing read only parent: {0}")] + #[error("Error removing read-only parent: {0}")] + Public(Error), + + #[error("Serde error removing read-only parent: {0}")] SerdeError(#[from] serde_json::Error), #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), + + #[error("Address parsing error during ROP removal: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), } // In this single transaction: @@ -1080,8 +1556,8 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - // Grab the volume in question. If the volume record was already - // deleted then we can just return. + // Grab the volume in question. If the volume record was + // already deleted then we can just return. let volume = { use db::schema::volume::dsl; @@ -1100,8 +1576,8 @@ impl DataStore { }; if volume.time_deleted.is_some() { - // this volume is deleted, so let whatever is deleting - // it clean it up. + // this volume is deleted, so let whatever is + // deleting it clean it up. return Ok(false); } else { // A volume record exists, and was not deleted, we @@ -1110,8 +1586,8 @@ impl DataStore { } }; - // If a read_only_parent exists, remove it from volume_id, and - // attach it to temp_volume_id. + // If a read_only_parent exists, remove it from volume_id, + // and attach it to temp_volume_id. let vcr: VolumeConstructionRequest = serde_json::from_str( volume.data() @@ -1179,6 +1655,7 @@ impl DataStore { sub_volumes: vec![], read_only_parent, }; + let rop_volume_data = serde_json::to_string( &rop_vcr @@ -1188,6 +1665,7 @@ impl DataStore { e, )) })?; + // Update the temp_volume_id with the volume // data that contains the read_only_parent. 
let num_updated = @@ -1197,20 +1675,73 @@ impl DataStore { .set(volume_dsl::data.eq(rop_volume_data)) .execute_async(&conn) .await?; + if num_updated != 1 { return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); } + + // Update the volume resource usage record for + // every read-only resource in the ROP + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &rop_vcr, + &mut crucible_targets, + ); + crucible_targets + }; + + for read_only_target in crucible_targets.read_only_targets { + let sub_err = OptionalError::new(); + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + &conn, + &sub_err, + &read_only_target, + ) + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(RemoveReadOnlyParentError::AddressParseError(sub_err)) + } else { + e + } + })?; + + let Some(usage) = maybe_usage else { + return Err(err.bail( + RemoveReadOnlyParentError::CouldNotFindResource(format!( + "could not find resource for {read_only_target}" + )), + )); + }; + + Self::swap_volume_usage_records_for_resources( + &conn, + usage, + volume_id, + temp_volume_id, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + RemoveReadOnlyParentError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + } + Ok(true) } } - VolumeConstructionRequest::File { id: _, block_size: _, path: _ } - | VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _ } - | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { + + VolumeConstructionRequest::File { .. } + | VolumeConstructionRequest::Region { .. } + | VolumeConstructionRequest::Url { .. 
} => { // Volume has a format that does not contain ROPs Ok(false) } @@ -1608,7 +2139,7 @@ pub struct CrucibleResourcesV2 { pub snapshots_to_delete: Vec, } -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] pub struct RegionSnapshotV3 { dataset: Uuid, region: Uuid, @@ -1895,6 +2426,8 @@ pub struct VolumeReplacementParams { #[derive(Debug, Clone, Copy)] pub struct VolumeWithTarget(pub Uuid); +// Note: it would be easier to pass around strings, but comparison could fail +// due to formatting issues, so pass around SocketAddrV6 here #[derive(Debug, Clone, Copy)] pub struct ExistingTarget(pub SocketAddrV6); @@ -2242,6 +2775,18 @@ impl DataStore { #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), + + #[error( + "Address parsing error during Volume snapshot \ + replacement: {0}" + )] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), + + #[error("Multiple volume resource usage records for {0}")] + MultipleResourceUsageRecords(String), } let err = OptionalError::new(); @@ -2249,8 +2794,10 @@ impl DataStore { self.transaction_retry_wrapper("volume_replace_snapshot") .transaction(&conn, |conn| { let err = err.clone(); + async move { use db::schema::volume::dsl as volume_dsl; + use db::schema::volume_resource_usage::dsl as ru_dsl; // Grab the old volume first let maybe_old_volume = { @@ -2409,24 +2956,188 @@ impl DataStore { read_only_parent: None, }; - let volume_data = serde_json::to_string(&vcr) + let volume_data = serde_json::to_string(&vcr) + .map_err(|e| { + err.bail(VolumeReplaceSnapshotError::SerdeError(e)) + })?; + + // Update the volume to delete data + let num_updated = + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_to_delete_id.0)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(volume_data)) + .execute_async(&conn) + .await?; + + if num_updated != 1 { + return Err(err.bail( + VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( + num_updated, 1, + ) + )); + } + + // Update the appropriate volume resource usage records - it + // could either be a read-only region or a region snapshot, + // so determine what it is first + + let sub_err = OptionalError::new(); + let maybe_existing_usage = Self::read_only_target_to_volume_resource_usage( + &conn, + &sub_err, + &existing.0.to_string(), + ) + .await + .map_err(|e| if let Some(sub_err) = sub_err.take() { + err.bail(VolumeReplaceSnapshotError::AddressParseError( + sub_err + )) + } else { + e + } + )?; + + let Some(existing_usage) = maybe_existing_usage else { + return Err(err.bail( + VolumeReplaceSnapshotError::CouldNotFindResource( + format!( + "could not find resource for {}", + existing.0, + ) + )) + ); + }; + + // The "existing" target moved into the volume to delete + + Self::swap_volume_usage_records_for_resources( + &conn, + existing_usage, + volume_id.0, + volume_to_delete_id.0, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + + let sub_err = OptionalError::new(); + let maybe_replacement_usage = + Self::read_only_target_to_volume_resource_usage( + &conn, + &sub_err, + &replacement.0.to_string(), + ) + .await + .map_err(|e| if let Some(sub_err) = sub_err.take() { + err.bail(VolumeReplaceSnapshotError::AddressParseError( + sub_err + )) + } else { + e + } + )?; 
+ + let Some(replacement_usage) = maybe_replacement_usage else { + return Err(err.bail( + VolumeReplaceSnapshotError::CouldNotFindResource( + format!( + "could not find resource for {}", + existing.0, + ) + )) + ); + }; + + // This function may be called with a replacement volume + // that is completely blank, to be filled in later by this + // function. `volume_create` will have been called but will + // not have added any volume resource usage records, because + // it was blank! + // + // The indention leaving this transaction is that the + // correct volume resource usage records exist, so if this + // is the case, create a new record. + // + // If the replacement volume usage records exist, then + // perform a swap instead. + + let existing_replacement_volume_usage_records = + Self::volume_usage_records_for_resource_query( + replacement_usage.clone(), + ) + .load_async(&conn) + .await .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })? + // XXX be smart enough to .filter the above query + .into_iter() + .filter(|record| record.volume_id == volume_to_delete_id.0) + .count(); + + // The "replacement" target moved into the volume + + if existing_replacement_volume_usage_records == 0 { + // No matching record + let new_record = VolumeResourceUsageRecord::new( + volume_id.0, + replacement_usage, + ); - // Update the volume to delete data - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_to_delete_id.0)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(volume_data)) + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(new_record) .execute_async(&conn) - .await?; - - if num_updated != 1 { + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + } else if existing_replacement_volume_usage_records == 1 { + // One matching record: perform swap + Self::swap_volume_usage_records_for_resources( + &conn, + replacement_usage, + volume_to_delete_id.0, + volume_id.0, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + } else { + // More than one matching record! 
return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( - num_updated, 1, + VolumeReplaceSnapshotError::MultipleResourceUsageRecords( + format!("{replacement_usage:?}") ) )); } @@ -2440,19 +3151,13 @@ impl DataStore { match err { VolumeReplaceSnapshotError::Public(e) => e, - VolumeReplaceSnapshotError::SerdeError(_) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::SnapshotReplacementError(_) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) => { + VolumeReplaceSnapshotError::SerdeError(_) | + VolumeReplaceSnapshotError::SnapshotReplacementError(_) | + VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) | + VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) | + VolumeReplaceSnapshotError::AddressParseError(_) | + VolumeReplaceSnapshotError::CouldNotFindResource(_) | + VolumeReplaceSnapshotError::MultipleResourceUsageRecords(_) => { Error::internal_error(&err.to_string()) } } @@ -2463,7 +3168,7 @@ impl DataStore { } } -/// Return the targets from a VolumeConstructionRequest. +/// Return the read-only targets from a VolumeConstructionRequest. /// /// The targets of a volume construction request map to resources. pub fn read_only_resources_associated_with_volume( @@ -2508,6 +3213,65 @@ pub fn read_only_resources_associated_with_volume( } } +/// Return the read-write targets from a VolumeConstructionRequest. +/// +/// The targets of a volume construction request map to resources. +pub fn read_write_resources_associated_with_volume( + vcr: &VolumeConstructionRequest, + targets: &mut Vec, +) { + let mut parts: VecDeque<&VolumeConstructionRequest> = VecDeque::new(); + parts.push_back(&vcr); + + while let Some(vcr_part) = parts.pop_front() { + match vcr_part { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + for sub_volume in sub_volumes { + parts.push_back(sub_volume); + } + + // No need to look under read-only parent + } + + VolumeConstructionRequest::Url { .. } => { + // no action required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if !opts.read_only { + for target in &opts.target { + targets.push(target.clone()); + } + } + } + + VolumeConstructionRequest::File { .. } => { + // no action required + } + } + } +} + +/// Return the number of read-write subvolumes in a VolumeConstructionRequest. +pub fn count_read_write_sub_volumes(vcr: &VolumeConstructionRequest) -> usize { + match vcr { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + sub_volumes.len() + } + + VolumeConstructionRequest::Url { .. } => 0, + + VolumeConstructionRequest::Region { .. } => { + // We don't support a pure Region VCR at the volume + // level in the database, so this choice should + // never be encountered. + panic!("Region not supported as a top level volume"); + } + + VolumeConstructionRequest::File { .. 
} => 0, + } +} + /// Returns true if the sub-volumes of a Volume are all read-only pub fn volume_is_read_only( vcr: &VolumeConstructionRequest, @@ -2780,8 +3544,14 @@ impl DataStore { mod tests { use super::*; + use crate::db::datastore::test::TestDatasets; use crate::db::datastore::test_utils::datastore_test; + use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; + use nexus_config::RegionAllocationStrategy; + use nexus_db_model::SqlU16; use nexus_test_utils::db::test_setup_database; + use nexus_types::external_api::params::DiskSource; + use omicron_common::api::external::ByteCount; use omicron_test_utils::dev; use sled_agent_client::types::CrucibleOpts; @@ -2896,47 +3666,81 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_region"); let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; - let (_opctx, db_datastore) = datastore_test(&logctx, &db).await; + let (opctx, db_datastore) = datastore_test(&logctx, &db).await; + let conn = db_datastore.pool_connection_unauthorized().await.unwrap(); - // Insert four Region records (three, plus one additionally allocated) + let _test_datasets = TestDatasets::create( + &opctx, + db_datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); - let new_volume_id = Uuid::new_v4(); - - let mut region_and_volume_ids = [ - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), new_volume_id), - ]; + let volume_to_delete_id = Uuid::new_v4(); - { - let conn = db_datastore.pool_connection_for_tests().await.unwrap(); + let datasets_and_regions = db_datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); - for i in 0..4 { - let (_, volume_id) = region_and_volume_ids[i]; + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); - let region = Region::new( - Uuid::new_v4(), // dataset id - volume_id, - 512_i64.try_into().unwrap(), - 10, - 10, - 10001, - false, - ); + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); - region_and_volume_ids[i].0 = region.id(); + let address: SocketAddrV6 = + db_datastore.region_addr(region.id()).await.unwrap().unwrap(); - use nexus_db_model::schema::region::dsl; - diesel::insert_into(dsl::region) - .values(region.clone()) - .execute_async(&*conn) - .await - .unwrap(); - } + region_addresses.push(address.to_string()); } + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + region.extent_count(), + 111, + false, // read-write + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = db_datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + let _volume = 
db_datastore .volume_create(nexus_db_model::Volume::new( volume_id, @@ -2951,9 +3755,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // target to replace - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), // target to replace + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -2974,26 +3778,19 @@ mod tests { // Replace one - let target = region_and_volume_ids[0]; - let replacement = region_and_volume_ids[3]; - let volume_replace_region_result = db_datastore .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, ) .await @@ -3020,9 +3817,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd55:1122:3344:101::1]:11111"), // replaced - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + replacement_region_addr.to_string(), // replaced + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3043,19 +3840,15 @@ mod tests { .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, ) .await @@ -3082,9 +3875,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // back to what it was - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), // back to what it was + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3109,13 +3902,126 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_snapshot"); let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; - let (_opctx, db_datastore) = datastore_test(&logctx, &db).await; + let (opctx, db_datastore) = datastore_test(&logctx, &db).await; + let conn = db_datastore.pool_connection_for_tests().await.unwrap(); - // Insert two volumes: one with the target to replace, and one temporary - // "volume to delete" that's blank. 
+ let _test_datasets = TestDatasets::create( + &opctx, + db_datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); let volume_to_delete_id = Uuid::new_v4(); + + let datasets_and_regions = db_datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); + + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); + + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); + + let address: SocketAddrV6 = + db_datastore.region_addr(region.id()).await.unwrap().unwrap(); + + region_addresses.push(address.to_string()); + } + + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + region.extent_count(), + 111, + true, // read-only + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = db_datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + let region_snapshots = [ + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + ), + ]; + + db_datastore + .region_snapshot_create(region_snapshots[0].clone()) + .await + .unwrap(); + db_datastore + .region_snapshot_create(region_snapshots[1].clone()) + .await + .unwrap(); + db_datastore + .region_snapshot_create(region_snapshots[2].clone()) + .await + .unwrap(); + + // Insert two volumes: one with the target to replace, and one temporary + // "volume to delete" that's blank. Validate the pre-replacement volume + // resource usage records. 
+ let rop_id = Uuid::new_v4(); db_datastore @@ -3132,9 +4038,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3156,9 +4062,9 @@ mod tests { id: rop_id, target: vec![ // target to replace - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3177,6 +4083,22 @@ mod tests { .await .unwrap(); + for region_snapshot in ®ion_snapshots { + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + db_datastore .volume_create(nexus_db_model::Volume::new( volume_to_delete_id, @@ -3191,21 +4113,33 @@ mod tests { .await .unwrap(); + // `volume_create` above was called with a blank volume, so no usage + // record will have been created for the read-only region + + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + // Do the replacement let volume_replace_snapshot_result = db_datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd00:1122:3344:104::1]:400".parse().unwrap()), - ReplacementTarget( - "[fd55:1122:3344:101::1]:111".parse().unwrap(), - ), + ExistingTarget(address_1.parse().unwrap()), + ReplacementTarget(replacement_region_addr), VolumeToDelete(volume_to_delete_id), ) .await .unwrap(); - assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done,); + assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done); // Ensure the shape of the resulting VCRs @@ -3227,9 +4161,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3251,9 +4185,9 @@ mod tests { id: rop_id, target: vec![ // target replaced - String::from("[fd55:1122:3344:101::1]:111"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + replacement_region_addr.to_string(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3293,7 +4227,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replaced target stashed here - String::from("[fd00:1122:3344:104::1]:400"), + address_1.clone(), ], lossy: false, flush_timeout: None, @@ -3309,15 +4243,54 @@ mod tests { }, ); + // Validate the post-replacement volume resource usage records + + for (i, region_snapshot) in region_snapshots.iter().enumerate() { + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: 
region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + + match i { + 0 => { + assert_eq!(usage[0].volume_id, volume_to_delete_id); + } + + 1 | 2 => { + assert_eq!(usage[0].volume_id, volume_id); + } + + _ => panic!("out of range"), + } + } + + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + // Now undo the replacement. Note volume ID is not swapped. let volume_replace_snapshot_result = db_datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd55:1122:3344:101::1]:111".parse().unwrap()), - ReplacementTarget( - "[fd00:1122:3344:104::1]:400".parse().unwrap(), - ), + ExistingTarget(replacement_region_addr), + ReplacementTarget(address_1.parse().unwrap()), VolumeToDelete(volume_to_delete_id), ) .await @@ -3344,9 +4317,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3368,9 +4341,7 @@ mod tests { id: rop_id, target: vec![ // back to what it was - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1, address_2, address_3, ], lossy: false, flush_timeout: None, @@ -3410,7 +4381,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replacement stashed here - String::from("[fd55:1122:3344:101::1]:111"), + replacement_region_addr.to_string(), ], lossy: false, flush_timeout: None, @@ -3426,6 +4397,36 @@ mod tests { }, ); + // Validate the post-post-replacement volume resource usage records + + for region_snapshot in ®ion_snapshots { + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); + db.cleanup().await.unwrap(); logctx.cleanup_successful(); } @@ -3440,6 +4441,41 @@ mod tests { let volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + db_datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + )) + .await + .unwrap(); + db_datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + )) + .await + .unwrap(); + db_datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + )) + .await + .unwrap(); + // case 
where the needle is found db_datastore @@ -3458,9 +4494,9 @@ mod tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2, + address_3, ], lossy: false, flush_timeout: None, @@ -3482,7 +4518,7 @@ mod tests { let volumes = db_datastore .find_volumes_referencing_socket_addr( &opctx, - "[fd00:1122:3344:104::1]:400".parse().unwrap(), + address_1.parse().unwrap(), ) .await .unwrap(); diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index 02800e3a3c..5f34c7cfb3 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -14,7 +14,6 @@ pub mod network_interface; pub mod oximeter; pub mod region_allocation; pub mod virtual_provisioning_collection_update; -pub mod volume; pub mod vpc; pub mod vpc_subnet; diff --git a/nexus/db-queries/src/db/queries/volume.rs b/nexus/db-queries/src/db/queries/volume.rs deleted file mode 100644 index e7fe832a82..0000000000 --- a/nexus/db-queries/src/db/queries/volume.rs +++ /dev/null @@ -1,114 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Helper queries for working with volumes. - -use crate::db; -use crate::db::pool::DbConnection; -use diesel::expression::is_aggregate; -use diesel::expression::ValidGrouping; -use diesel::pg::Pg; -use diesel::query_builder::AstPass; -use diesel::query_builder::Query; -use diesel::query_builder::QueryFragment; -use diesel::query_builder::QueryId; -use diesel::sql_types; -use diesel::Column; -use diesel::Expression; -use diesel::QueryResult; -use diesel::RunQueryDsl; - -/// Produces a query fragment that will conditionally reduce the volume -/// references for region_snapshot rows whose snapshot_addr column is part of a -/// list. 
-/// -/// The output should look like: -/// -/// ```sql -/// update region_snapshot set -/// volume_references = volume_references - 1, -/// deleting = case when volume_references = 1 -/// then true -/// else false -/// end -/// where -/// snapshot_addr in ('a1', 'a2', 'a3') and -/// volume_references >= 1 and -/// deleting = false -/// returning * -/// ``` -#[must_use = "Queries must be executed"] -pub struct ConditionallyDecreaseReferences { - snapshot_addrs: Vec, -} - -impl ConditionallyDecreaseReferences { - pub fn new(snapshot_addrs: Vec) -> Self { - Self { snapshot_addrs } - } -} - -impl QueryId for ConditionallyDecreaseReferences { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} - -impl QueryFragment for ConditionallyDecreaseReferences { - fn walk_ast<'a>(&'a self, mut out: AstPass<'_, 'a, Pg>) -> QueryResult<()> { - use db::schema::region_snapshot::dsl; - - out.push_sql("UPDATE "); - dsl::region_snapshot.walk_ast(out.reborrow())?; - out.push_sql(" SET "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" - 1, "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = CASE WHEN "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = 1 THEN TRUE ELSE FALSE END WHERE "); - out.push_identifier(dsl::snapshot_addr::NAME)?; - out.push_sql(" IN ("); - - // If self.snapshot_addrs is empty, this query fragment will - // intentionally not update any region_snapshot rows. - for (i, snapshot_addr) in self.snapshot_addrs.iter().enumerate() { - out.push_bind_param::(snapshot_addr)?; - if i == self.snapshot_addrs.len() - 1 { - out.push_sql(" "); - } else { - out.push_sql(", "); - } - } - - out.push_sql(") AND "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" >= 1 AND "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = false RETURNING *"); - - Ok(()) - } -} - -impl Expression for ConditionallyDecreaseReferences { - type SqlType = sql_types::Array; -} - -impl ValidGrouping - for ConditionallyDecreaseReferences -{ - type IsAggregate = is_aggregate::Never; -} - -impl RunQueryDsl for ConditionallyDecreaseReferences {} - -type SelectableSql = < - >::SelectExpression as diesel::Expression ->::SqlType; - -impl Query for ConditionallyDecreaseReferences { - type SqlType = SelectableSql; -} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 69221779ee..8ae3be84af 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -778,8 +778,7 @@ impl BackgroundTasksInitializer { "detect if region snapshots need replacement and begin the \ process", period: config.region_snapshot_replacement_start.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new(RegionSnapshotReplacementDetector::disabled( + task_impl: Box::new(RegionSnapshotReplacementDetector::new( datastore.clone(), sagas.clone(), )), @@ -795,13 +794,10 @@ impl BackgroundTasksInitializer { period: config .region_snapshot_replacement_garbage_collection .period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementGarbageCollect::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementGarbageCollect::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: 
task_region_snapshot_replacement_garbage_collection, @@ -813,13 +809,10 @@ impl BackgroundTasksInitializer { "detect what volumes were affected by a region snapshot \ replacement, and run the step saga for them", period: config.region_snapshot_replacement_step.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFindAffected::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementFindAffected::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_step, @@ -831,10 +824,9 @@ impl BackgroundTasksInitializer { "complete a region snapshot replacement if all the steps are \ done", period: config.region_snapshot_replacement_finish.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFinishDetector::disabled(datastore), - ), + task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( + datastore, + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_finish, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 83078cb978..caa2fa7bed 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -19,17 +19,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, - disabled: bool, } impl RegionSnapshotReplacementFinishDetector { - #[allow(dead_code)] pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: false } - } - - pub fn disabled(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: true } + RegionSnapshotReplacementFinishDetector { datastore } } async fn transition_requests_to_done( @@ -159,10 +153,6 @@ impl BackgroundTask for RegionSnapshotReplacementFinishDetector { async move { let mut status = RegionSnapshotReplacementFinishStatus::default(); - if self.disabled { - return json!(status); - } - self.transition_requests_to_done(opctx, &mut status).await; json!(status) diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index eb171fda12..f3b1b68198 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -22,28 +22,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementGarbageCollect { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementGarbageCollect { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementGarbageCollect { datastore, sagas } } async fn send_garbage_collect_request( @@ -152,10 +135,6 @@ impl BackgroundTask for RegionSnapshotReplacementGarbageCollect { let mut status = RegionSnapshotReplacementGarbageCollectStatus::default(); - if self.disabled { - return 
json!(status); - } - self.clean_up_region_snapshot_replacement_volumes( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 8fd1e55975..bc739ecf27 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -29,20 +29,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementDetector { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementDetector { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: false } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: true } + RegionSnapshotReplacementDetector { datastore, sagas } } async fn send_start_request( @@ -246,10 +237,6 @@ impl BackgroundTask for RegionSnapshotReplacementDetector { async { let mut status = RegionSnapshotReplacementStartStatus::default(); - if self.disabled { - return json!(status); - } - self.create_requests_for_region_snapshots_on_expunged_disks( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index da05500a58..c402c8fc92 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -42,28 +42,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFindAffected { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementFindAffected { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementFindAffected { datastore, sagas } } async fn send_start_request( @@ -452,10 +435,6 @@ impl BackgroundTask for RegionSnapshotReplacementFindAffected { async move { let mut status = RegionSnapshotReplacementStepStatus::default(); - if self.disabled { - return json!(status); - } - // Importantly, clean old steps up before finding affected volumes! // Otherwise, will continue to find the snapshot in volumes to // delete, and will continue to see conflicts in next function. 
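The ordering called out in the comment above matters. A schematic of that constraint, with illustrative function names only (these are not the task's real methods):

```rust
// Sketch of the activation order: completed/stale steps are cleaned up first,
// otherwise the scan keeps rediscovering the snapshot in volumes that are
// only waiting to be deleted and hits conflicts.
fn clean_up_old_steps() { /* drop finished step records */ }
fn find_affected_volumes() { /* then look for volumes still needing a step saga */ }

fn main() {
    clean_up_old_steps();
    find_affected_volumes();
}
```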
@@ -500,6 +479,19 @@ mod test { ) -> Uuid { let new_volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + snapshot_addr.clone(), + )) + .await + .unwrap(); + let volume_construction_request = VolumeConstructionRequest::Volume { id: new_volume_id, block_size: 0, diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index d2e3053668..8261785ada 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -934,6 +934,22 @@ pub(crate) mod test { .is_none() } + async fn no_volume_resource_usage_records_exist( + datastore: &DataStore, + ) -> bool { + use nexus_db_queries::db::schema::volume_resource_usage::dsl; + + let rows = dsl::volume_resource_usage + .count() + .get_result_async::( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap(); + + rows == 0 + } + async fn no_virtual_provisioning_resource_records_exist( datastore: &DataStore, ) -> bool { @@ -1030,6 +1046,7 @@ pub(crate) mod test { .await; assert!(no_disk_records_exist(datastore).await); assert!(no_volume_records_exist(datastore).await); + assert!(no_volume_resource_usage_records_exist(datastore).await); assert!( no_virtual_provisioning_resource_records_exist(datastore).await ); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 6d56732388..d615b55612 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -801,6 +801,7 @@ pub(crate) mod test { const SNAPSHOT_NAME: &str = "my-snap"; const PROJECT_NAME: &str = "springfield-squidport"; + /// Create four zpools, a disk, and a snapshot of that disk async fn prepare_for_test( cptestctx: &ControlPlaneTestContext, ) -> PrepareResult { diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 8765218d33..5a03a6b95a 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -5,24 +5,46 @@ //! Tests that Nexus properly manages and cleans up Crucible resources //! 
associated with Volumes +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; +use diesel::ExpressionMethods; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; +use nexus_config::RegionAllocationStrategy; +use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_model::VolumeResourceUsage; +use nexus_db_model::VolumeResourceUsageRecord; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::CrucibleResources; +use nexus_db_queries::db::datastore::ExistingTarget; +use nexus_db_queries::db::datastore::RegionAllocationFor; +use nexus_db_queries::db::datastore::RegionAllocationParameters; +use nexus_db_queries::db::datastore::ReplacementTarget; +use nexus_db_queries::db::datastore::VolumeReplaceResult; +use nexus_db_queries::db::datastore::VolumeToDelete; +use nexus_db_queries::db::datastore::VolumeWithTarget; +use nexus_db_queries::db::datastore::SQL_BATCH_SIZE; use nexus_db_queries::db::lookup::LookupPath; +use nexus_db_queries::db::pagination::paginated; +use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; +use nexus_test_utils::background::run_replacement_tasks_to_completion; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::http_testing::RequestBuilder; use nexus_test_utils::resource_helpers::create_default_ip_pool; +use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::views; use nexus_types::identity::Asset; +use nexus_types::identity::Resource; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; @@ -30,6 +52,7 @@ use omicron_common::api::external::Name; use omicron_common::api::internal; use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; @@ -37,6 +60,8 @@ use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; +use std::collections::HashSet; +use std::net::SocketAddrV6; use std::sync::Arc; use uuid::Uuid; @@ -2286,36 +2311,47 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate the volume's region_snapshots got incremented by - // volume_create + // Also validate the volume's region_snapshots had volume resource usage + // records created by volume_create for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + 
assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } - // Soft delete the volume, and validate that only three region_snapshot - // records are returned. + // Soft delete the volume, and validate that the volume's region_snapshots + // had their volume resource usage records deleted let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -2396,30 +2432,42 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate only the volume's region_snapshots got incremented by - // volume_create. + // Also validate only the new volume's region_snapshots had usage records + // created. for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } + for i in 3..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } // Soft delete the volume, and validate that only three region_snapshot @@ -2427,18 +2475,23 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); - // Make sure every region_snapshot is now 0, and deleting + // Make sure every region_snapshot has no usage records now for i in 0..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -3503,3 +3556,1977 @@ async fn test_cte_returns_regions(cptestctx: &ControlPlaneTestContext) { .collect::>(), ); } + +struct TestReadOnlyRegionReferenceUsage { + datastore: Arc, + + region: db::model::Region, + region_address: SocketAddrV6, + + first_volume_id: Uuid, + second_volume_id: Uuid, + + 
last_resources_to_delete: Option, +} + +impl TestReadOnlyRegionReferenceUsage { + pub async fn new(cptestctx: &ControlPlaneTestContext) -> Self { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let first_volume_id = Uuid::new_v4(); + let second_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: first_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 1, + ) + .await + .unwrap(); + + assert_eq!(datasets_and_regions.len(), 1); + + let (_, region) = &datasets_and_regions[0]; + + assert_eq!(region.volume_id(), first_volume_id); + assert!(region.read_only()); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. + datastore.region_set_port(region.id(), 12345).await.unwrap(); + + let region_address = + datastore.region_addr(region.id()).await.unwrap().unwrap(); + + let region = datastore.get_region(region.id()).await.unwrap(); + + TestReadOnlyRegionReferenceUsage { + datastore: datastore.clone(), + + region, + region_address, + + first_volume_id, + second_volume_id, + + last_resources_to_delete: None, + } + } + + pub async fn create_first_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_only_first_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.first_volume_id); + } + + pub async fn validate_only_second_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.second_volume_id); + } + + pub async fn delete_first_volume(&mut self) { + let resources_to_delete = self + .datastore + .soft_delete_volume(self.first_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn delete_second_volume(&mut self) { + let 
resources_to_delete = self + .datastore + .soft_delete_volume(self.second_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn validate_no_usage_records(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + } + + pub async fn validate_region_returned_for_cleanup(&self) { + assert!(self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + pub async fn validate_region_not_returned_for_cleanup(&self) { + assert!(!self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + pub async fn region_returned_by_find_deleted_volume_regions(&self) { + let deleted_volume_regions = + self.datastore.find_deleted_volume_regions().await.unwrap(); + + assert!(deleted_volume_regions + .into_iter() + .any(|(_, r, _, _)| r.id() == self.region.id())); + } + + pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { + let deleted_volume_regions = + self.datastore.find_deleted_volume_regions().await.unwrap(); + + assert!(!deleted_volume_regions + .into_iter() + .any(|(_, r, _, _)| r.id() == self.region.id())); + } + + pub async fn create_first_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: 
Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_both_volumes_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == self.first_volume_id)); + assert!(usage.iter().any(|r| r.volume_id == self.second_volume_id)); + } +} + +/// Assert that creating a volume with a read-only region in a subvolume creates +/// an appropriate usage record, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_usage_sanity( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume().await; + + // Validate only one volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // Now, soft-delete the volume, and make sure that the associated volume + // resource usage record is gone too. + + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should be returned by find_deleted_volume_regions. + + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating a volume with a read-only region in the ROP creates an +/// appropriate reference, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // It should be _not_ returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Now, soft-delete the volume, and make sure that read-only volume + // reference is gone too. + + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should be returned by find_deleted_volume_regions. 
+ + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region creates the +/// appropriate references, and that deleting only one of those volumes does not +/// mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume().await; + harness.create_second_volume().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the first volume, and make sure that only one read-only + // volume reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. + + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region in the ROP +/// creates the appropriate references, and that deleting only one of those +/// volumes does not mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume_region_in_rop().await; + harness.create_second_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the volume, and make sure that only one read-only volume + // reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. 
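The invariant these harness tests keep asserting is record-based reference counting: a read-only region is only a cleanup candidate once no volume holds a usage record for it. A toy, self-contained model of that bookkeeping (not Nexus code), assuming only the `uuid` crate:

```rust
use std::collections::{HashMap, HashSet};
use uuid::Uuid;

// One record per (resource, volume) pair instead of a bare integer count, so
// it is always known which volumes still reference a resource.
#[derive(Default)]
struct UsageRecords {
    by_resource: HashMap<Uuid, HashSet<Uuid>>,
}

impl UsageRecords {
    fn add(&mut self, resource_id: Uuid, volume_id: Uuid) {
        self.by_resource.entry(resource_id).or_default().insert(volume_id);
    }

    fn remove_volume(&mut self, volume_id: Uuid) {
        for volumes in self.by_resource.values_mut() {
            volumes.remove(&volume_id);
        }
    }

    fn deletable(&self, resource_id: Uuid) -> bool {
        self.by_resource.get(&resource_id).map_or(true, |v| v.is_empty())
    }
}

fn main() {
    let region = Uuid::new_v4();
    let (first_volume, second_volume) = (Uuid::new_v4(), Uuid::new_v4());

    let mut usage = UsageRecords::default();
    usage.add(region, first_volume);
    usage.add(region, second_volume);

    // Deleting one volume is not enough to free the region...
    usage.remove_volume(first_volume);
    assert!(!usage.deletable(region));

    // ...only removing the last reference makes it a cleanup candidate.
    usage.remove_volume(second_volume);
    assert!(usage.deletable(region));
}
```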
+ + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that a read-only region is properly reference counted and not +/// prematurely deleted: +/// +/// 1) create a disk, then a snapshot of that disk, then a disk from that +/// snapshot +/// +/// 2) issue a region snapshot replacement request for one of the region +/// snapshots in that snapshot, then run that process to completion +/// +/// 3) delete the snapshot +/// +/// 4) expect that the reference to the read-only region in the disk created +/// from the snapshot means that read-only region will not be cleaned up by +/// Nexus +/// +/// 5) clean up all the objects, and expect the crucible resources are properly +/// cleaned up +#[nexus_test] +async fn test_read_only_region_reference_counting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // The snapshot's allocated regions should have the one read-only region + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + let db_read_only_dataset = + datastore.dataset_get(read_only_region.dataset_id()).await.unwrap(); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Deleting the snapshot should _not_ cause the region to get deleted from + // CRDB + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + let post_delete_region = + datastore.get_region(read_only_region.id()).await.unwrap(); + assert_eq!(post_delete_region, *read_only_region); + + // or cause Nexus to send delete commands to the appropriate Crucible + // agent. 
+ + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Created + ); + + // Expect that there is one read-only region reference now, and that's from + // disk-from-snap + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk, and expect that does not alter the volume usage records + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk from snapshot, verify everything is cleaned up + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Destroyed + ); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +/// Assert that a snapshot of a volume with a read-only region is properly +/// reference counted. +#[nexus_test] +async fn test_read_only_region_reference_counting_layers( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // Grab the read-only region in the snapshot volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. 
+ + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Take a snapshot of the disk-from-snapshot disk + + let double_snapshot = create_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + "double-snapshot", + ) + .await; + + // Assert correct volume usage records + + let (.., db_double_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(double_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "double_snapshot {:?} should exist", + double_snapshot.identity.id + ) + }); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + // Delete resources, assert volume resource usage records along the way + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("double-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_volume_replace_snapshot_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + 
DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + // There won't be any regions for the snapshot volume, only region snapshots + + assert!(allocated_regions.is_empty()); + + // Get another region to use with volume_replace_snapshot + + datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: db_snapshot.volume_id, + snapshot_id: db_snapshot.id(), + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + allocated_regions.len() + 1, + ) + .await + .unwrap(); + + // Get the newly allocated region + + let mut new_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(new_allocated_regions.len(), 1); + + let (_, new_region) = + new_allocated_regions.pop().expect("we just checked the length!"); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. 
+ datastore.region_set_port(new_region.id(), 12345).await.unwrap(); + + // Create a blank region to use as the "volume to delete" + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert the correct volume resource usage record before the replacement: + // nothing should reference the newly allocated region yet + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + // Perform replacement of the first region in the snapshot + + let existing = datastore + .region_snapshot_get( + disk_allocated_regions[0].1.dataset_id(), + disk_allocated_regions[0].1.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let replacement = datastore + .region_addr(new_region.id()) + .await + .expect("new region has address!") + .unwrap(); + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(existing.snapshot_addr.parse().unwrap()), + ReplacementTarget(replacement), + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! + } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the volume resource usage record after volume_replace_snapshot: + // the new region should have a usage for the snapshot's volume + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_snapshot.volume_id); + + // Now, reverse the replacement + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(replacement), // swapped! + ReplacementTarget(existing.snapshot_addr.parse().unwrap()), // swapped! + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! 
+ } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the new region's volume resource usage record now references the + // volume to delete + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); +} + +/// Test that the `volume_remove_rop` function correctly updates volume resource +/// usage records +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: + // both the snapshot volume and disk_from_snapshot volume should have usage + // records for the three region snapshots. 
+ + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: + // the snapshot volume should still have usage records for the three region + // snapshots, and now so should the volume to delete + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id)); + } +} + +/// Test that the `volume_remove_rop` function only updates volume resource +/// usage records for the volume being operated on +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting_no_modify_others( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. 
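+    // A second disk is created from the same snapshot further below so the
+    // test can verify that its usage records are not modified by
+    // `volume_remove_rop`.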
+ + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let another_disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "another-disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_another_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(another_disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "another_disk_from_snapshot {:?} should exist", + another_disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: the + // snapshot volume, disk_from_snapshot volume, and + // another_disk_from_snapshot volume should have usage records for the three + // region snapshots. + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: the + // snapshot volume and another_disk_from_snapshot volume should still have + // usage records for the three region snapshots, and now so should the + // volume to delete. 
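+    // `volume_remove_rop` does not free the read-only parent: it moves it
+    // into the supplied "volume to delete", so each region snapshot's usage
+    // record shifts from disk-from-snapshot's volume to volume_to_delete_id,
+    // leaving the records for the other two volumes untouched.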
+ + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id)); + } +} + +async fn delete_all_volume_resource_usage_records(datastore: &DataStore) { + use db::schema::volume_resource_usage::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + diesel::delete(dsl::volume_resource_usage) + .filter(dsl::usage_id.ne(Uuid::new_v4())) + .execute_async(&*conn) + .await + .unwrap(); +} + +async fn perform_migration(datastore: &DataStore) { + const MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL: &str = include_str!( + "../../../schema/crdb/crucible-ref-count-records/up08.sql" + ); + + assert!(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL + .contains("INSERT INTO volume_resource_usage")); + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + // To make sure that the migration is idempotent, perform it twice + diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL) + .execute_async(&*conn) + .await + .unwrap(); + + diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL) + .execute_async(&*conn) + .await + .unwrap(); +} + +async fn get_volume_resource_usage_records( + datastore: &DataStore, +) -> HashSet { + use db::schema::volume_resource_usage::dsl; + + let mut records: Vec = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + while let Some(p) = paginator.next() { + let page = paginated( + dsl::volume_resource_usage, + dsl::usage_id, + &p.current_pagparams(), + ) + .get_results_async::(&*conn) + .await + .unwrap(); + + paginator = p.found_batch(&page, &|r| r.usage_id); + + for record in page { + records.push(record); + } + } + + records + .into_iter() + .map(|mut record: VolumeResourceUsageRecord| { + // Zero out usage_id for comparison + record.usage_id = Uuid::nil(); + record + }) + .collect() +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk + + create_disk(&client, PROJECT_NAME, "disk").await; + + // Test migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Create a snapshot + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Test migration + 
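+    // The pattern for each migration check below: capture the usage records
+    // that Nexus created, wipe the table, re-run the up08.sql backfill, and
+    // assert that the regenerated records match. usage_id is zeroed out for
+    // the comparison because the migration synthesizes its own IDs.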
+ let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Create a disk from that snapshot + + create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Test migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the snapshot + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the disk from snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_soft_delete_volume( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot from that disk, then an image based on + // that snapshot + + create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let params = params::ImageCreate { + identity: IdentityMetadataCreateParams { + name: "windows99".parse().unwrap(), + description: String::from("as soon as we get CSM support!"), + }, + source: params::ImageSource::Snapshot { id: snapshot.identity.id }, + os: "windows98".to_string(), + version: "se".to_string(), + }; + + let images_url = format!("/v1/images?project={}", PROJECT_NAME); + NexusRequest::objects_post(client, &images_url, ¶ms) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap(); + + // Soft-delete the snapshot's volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let resources = + datastore.soft_delete_volume(db_snapshot.volume_id).await.unwrap(); + + // Assert that the 
region snapshots did not have deleted set to true + + assert!(datastore + .snapshots_to_delete(&resources) + .await + .unwrap() + .is_empty()); + + // This means that the snapshot volume is soft-deleted, make sure the + // migration does not make usage records for it! + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( + cptestctx: &ControlPlaneTestContext, +) { + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let mut iter = disk_test.zpools(); + let zpool0 = iter.next().expect("Expected four zpools"); + let zpool1 = iter.next().expect("Expected four zpools"); + let zpool2 = iter.next().expect("Expected four zpools"); + let zpool3 = iter.next().expect("Expected four zpools"); + + // (dataset_id, region_id, snapshot_id, snapshot_addr) + let region_snapshots = vec![ + ( + zpool0.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101:7]:19016"), + ), + ( + zpool1.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:102:7]:19016"), + ), + ( + zpool2.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103:7]:19016"), + ), + ( + zpool3.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:104:7]:19016"), + ), + ]; + + for i in 0..4 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + // Create two volumes, one with the first three region snapshots, one with + // the last three region snapshots + + let first_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[0].3.clone(), + region_snapshots[1].3.clone(), + region_snapshots[2].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + let second_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[1].3.clone(), + 
region_snapshots[2].3.clone(), + region_snapshots[3].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Deleting the first volume should only return region_snapshot[0] for + // deletion. + + let resources = + datastore.soft_delete_volume(first_volume_id).await.unwrap(); + + let snapshots_to_delete = + datastore.snapshots_to_delete(&resources).await.unwrap(); + + assert_eq!(snapshots_to_delete.len(), 1); + + let region_snapshot_to_delete = &snapshots_to_delete[0].1; + + assert_eq!(region_snapshot_to_delete.dataset_id, region_snapshots[0].0); + assert_eq!(region_snapshot_to_delete.region_id, region_snapshots[0].1); + assert_eq!(region_snapshot_to_delete.snapshot_id, region_snapshots[0].2); + assert_eq!(region_snapshot_to_delete.snapshot_addr, region_snapshots[0].3); + assert_eq!(region_snapshot_to_delete.volume_references, 0); + assert_eq!(region_snapshot_to_delete.deleting, true); + + // Test the migration does not incorrectly think a region snapshot with + // deleting = true is used by any volume + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} diff --git a/schema/crdb/crucible-ref-count-records/up01.sql b/schema/crdb/crucible-ref-count-records/up01.sql new file mode 100644 index 0000000000..b28e61135b --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up01.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip); diff --git a/schema/crdb/crucible-ref-count-records/up02.sql b/schema/crdb/crucible-ref-count-records/up02.sql new file mode 100644 index 0000000000..31fa35c3b8 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up02.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot ( + deleting +); diff --git a/schema/crdb/crucible-ref-count-records/up03.sql b/schema/crdb/crucible-ref-count-records/up03.sql new file mode 100644 index 0000000000..03940aabb0 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up03.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM ( + 'read_only_region', + 'region_snapshot' +); diff --git a/schema/crdb/crucible-ref-count-records/up04.sql b/schema/crdb/crucible-ref-count-records/up04.sql new file mode 100644 index 0000000000..7d3bba3119 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up04.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( + usage_id UUID NOT NULL, + + volume_id UUID NOT NULL, + + usage_type omicron.public.volume_resource_usage_type NOT NULL, + + region_id UUID, + + region_snapshot_dataset_id UUID, + region_snapshot_region_id UUID, + region_snapshot_snapshot_id UUID, + + PRIMARY KEY (usage_id) +); diff --git a/schema/crdb/crucible-ref-count-records/up05.sql b/schema/crdb/crucible-ref-count-records/up05.sql new file mode 100644 index 0000000000..a88a3fc6fd --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up05.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage ( + region_id +); diff --git 
a/schema/crdb/crucible-ref-count-records/up06.sql b/schema/crdb/crucible-ref-count-records/up06.sql new file mode 100644 index 0000000000..3262b194be --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up06.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage ( + region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id +); diff --git a/schema/crdb/crucible-ref-count-records/up07.sql b/schema/crdb/crucible-ref-count-records/up07.sql new file mode 100644 index 0000000000..91c788be1b --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up07.sql @@ -0,0 +1,8 @@ +CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage ( + volume_id, + usage_type, + region_id, + region_snapshot_dataset_id, + region_snapshot_region_id, + region_snapshot_snapshot_id +); diff --git a/schema/crdb/crucible-ref-count-records/up08.sql b/schema/crdb/crucible-ref-count-records/up08.sql new file mode 100644 index 0000000000..9bcfebab76 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up08.sql @@ -0,0 +1,29 @@ +INSERT INTO volume_resource_usage ( + SELECT + ( + OVERLAY( + OVERLAY( + MD5(volume.id::TEXT || dataset_id::TEXT || region_id::TEXT || snapshot_id::TEXT || snapshot_addr || volume_references::TEXT) + PLACING '4' from 13 + ) + PLACING TO_HEX(volume_references) from 17 + )::uuid + ) as usage_id, + volume.id AS volume_id, + 'region_snapshot' AS usage_type, + NULL AS region_id, + region_snapshot.dataset_id AS region_snapshot_dataset_id, + region_snapshot.region_id AS region_snapshot_region_id, + region_snapshot.snapshot_id AS region_snapshot_snapshot_id + FROM + volume + JOIN + region_snapshot + ON + volume.data like '%' || region_snapshot.snapshot_addr || '%' + WHERE + volume.time_deleted is NULL AND + region_snapshot.deleting = false +) +ON CONFLICT (usage_id) DO NOTHING +; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index a6cd9b38fe..bb2580a444 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -575,6 +575,8 @@ CREATE INDEX IF NOT EXISTS lookup_dataset_by_zpool on omicron.public.dataset ( id ) WHERE pool_id IS NOT NULL AND time_deleted IS NULL; +CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip); + /* * A region of space allocated to Crucible Downstairs, within a dataset. 
*/ @@ -654,6 +656,10 @@ CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_region_id on omicron.public region_id ); +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot ( + deleting +); + /* * Index on volume_references and snapshot_addr for crucible * resource accounting lookup @@ -4409,6 +4415,43 @@ CREATE INDEX IF NOT EXISTS lookup_bgp_config_by_bgp_announce_set_id ON omicron.p ) WHERE time_deleted IS NULL; +CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM ( + 'read_only_region', + 'region_snapshot' +); + +CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( + usage_id UUID NOT NULL, + + volume_id UUID NOT NULL, + + usage_type omicron.public.volume_resource_usage_type NOT NULL, + + region_id UUID, + + region_snapshot_dataset_id UUID, + region_snapshot_region_id UUID, + region_snapshot_snapshot_id UUID, + + PRIMARY KEY (usage_id) +); + +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage ( + region_id +); + +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage ( + region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id +); + +CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage ( + volume_id, + usage_type, + region_id, + region_snapshot_dataset_id, + region_snapshot_region_id, + region_snapshot_snapshot_id +); /* * Keep this at the end of file so that the database does not contain a version @@ -4421,7 +4464,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '107.0.0', NULL) + (TRUE, NOW(), NOW(), '108.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 589ba87700..8369499658 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -124,7 +124,7 @@ impl CrucibleDataInner { key_pem: None, root_pem: None, source: None, - read_only: false, + read_only: params.source.is_some(), }; let old = self.regions.insert(id, region.clone()); From 410c79dda551ea5fdeb479df5cb227cf08897dbf Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 9 Oct 2024 01:16:19 +0000 Subject: [PATCH 02/24] full table scan required during clean slate check --- nexus/src/app/sagas/disk_create.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index 8261785ada..7efe70da77 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -939,11 +939,23 @@ pub(crate) mod test { ) -> bool { use nexus_db_queries::db::schema::volume_resource_usage::dsl; - let rows = dsl::volume_resource_usage - .count() - .get_result_async::( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + let rows = datastore + .transaction_retry_wrapper("no_volume_resource_usage_records_exist") + .transaction(&conn, |conn| async move { + conn.batch_execute_async( + nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, + ) + .await + .unwrap(); + + Ok(dsl::volume_resource_usage + .count() + .get_result_async::(&conn) + .await + .unwrap()) + }) .await .unwrap(); From 2ff34d5e40a551f37ed7f32b7ed36597b49cb3f7 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 16 Oct 2024 21:12:00 +0000 Subject: [PATCH 03/24] Explicitly 
separate region deletion code The garbage collection of read/write regions must be separate from read-only regions: - read/write regions are garbage collected by either being deleted during a volume soft delete, or by appearing later during the "find deleted volume regions" section of the volume delete saga - read-only regions are garbage collected only in the volume soft delete code, when there are no more references to them `find_deleted_volume_regions` was changed to only operate on read/write regions, and no longer returns the optional RegionSnapshot object: that check was moved from the volume delete saga into the function, as it didn't make sense that it was separated. This commit also adds checks to validate that invariants related to volumes are not violated during tests. One invalid test was deleted (regions will never be deleted when they're in use!) In order to properly test the separate region deletion routines, the first part of the fixes for dealing with deleted volumes during region snapshot replacement were brought in from that branch: these are the changes to region_snapshot_replacement_step.rs and region_snapshot_replacement_start.rs. --- dev-tools/omdb/src/bin/omdb/db.rs | 12 +- nexus/db-model/src/volume_resource_usage.rs | 2 +- nexus/db-queries/src/db/datastore/region.rs | 6 + .../src/db/datastore/region_snapshot.rs | 17 +- nexus/db-queries/src/db/datastore/snapshot.rs | 19 + nexus/db-queries/src/db/datastore/volume.rs | 231 ++++++++++-- .../tasks/region_snapshot_replacement_step.rs | 8 +- .../region_snapshot_replacement_start.rs | 56 ++- nexus/src/app/sagas/volume_delete.rs | 45 +-- nexus/tests/integration_tests/disks.rs | 77 ---- .../integration_tests/volume_management.rs | 330 +++++++++++++++++- .../crdb/crucible-ref-count-records/up09.sql | 2 + schema/crdb/dbinit.sql | 3 + 13 files changed, 617 insertions(+), 191 deletions(-) create mode 100644 schema/crdb/crucible-ref-count-records/up09.sql diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index be068f0912..82488def6c 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2451,26 +2451,22 @@ async fn cmd_db_region_find_deleted( struct Row { dataset_id: Uuid, region_id: Uuid, - region_snapshot_id: String, - volume_id: Uuid, + volume_id: String, } let rows: Vec = datasets_regions_volumes .into_iter() .map(|row| { - let (dataset, region, region_snapshot, volume) = row; + let (dataset, region, volume) = row; Row { dataset_id: dataset.id(), region_id: region.id(), - region_snapshot_id: if let Some(region_snapshot) = - region_snapshot - { - region_snapshot.snapshot_id.to_string() + volume_id: if let Some(volume) = volume { + volume.id().to_string() } else { String::from("") }, - volume_id: volume.id(), } }) .collect(); diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs index 0095baf471..0a76b3718d 100644 --- a/nexus/db-model/src/volume_resource_usage.rs +++ b/nexus/db-model/src/volume_resource_usage.rs @@ -29,7 +29,7 @@ impl_enum_type!( /// /// Read-only resources can be used by many volumes, and because of this they /// need to have a reference count so they can be deleted when they're not -/// referenced anymore. The region_snapshot table uses a `volume_references` +/// referenced anymore. The region_snapshot table used a `volume_references` /// column, which counts how many uses there are. 
The region table does not have /// this column, and more over a simple integer works for reference counting but /// does not tell you _what_ volume that use is from. This can be determined diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index f555e0af6b..da32494d6f 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -362,6 +362,12 @@ impl DataStore { ) .execute_async(&conn).await?; } + + // Whenever a region is hard-deleted, validate invariants + // for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(()) } }) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 242560a415..4a5db14bd6 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -64,14 +64,25 @@ impl DataStore { ) -> DeleteResult { use db::schema::region_snapshot::dsl; - diesel::delete(dsl::region_snapshot) + let conn = self.pool_connection_unauthorized().await?; + + let result = diesel::delete(dsl::region_snapshot) .filter(dsl::dataset_id.eq(dataset_id)) .filter(dsl::region_id.eq(region_id)) .filter(dsl::snapshot_id.eq(snapshot_id)) - .execute_async(&*self.pool_connection_unauthorized().await?) + .execute_async(&*conn) .await .map(|_rows_deleted| ()) - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)); + + // Whenever a region snapshot is hard-deleted, validate invariants for + // all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + result } /// Find region snapshots on expunged disks diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 9d4900e2a4..b9f6b589d7 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -322,4 +322,23 @@ impl DataStore { .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// Get a snapshot, returning None if it does not exist (instead of a + /// NotFound error). 
+ pub async fn snapshot_get( + &self, + opctx: &OpContext, + snapshot_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_authorized(opctx).await?; + + use db::schema::snapshot::dsl; + dsl::snapshot + .filter(dsl::id.eq(snapshot_id)) + .select(Snapshot::as_select()) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index d7d6b10081..c5dd3327d9 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -192,6 +192,10 @@ impl DataStore { } } + // After volume creation, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(volume) } @@ -1019,10 +1023,11 @@ impl DataStore { } /// Find regions for deleted volumes that do not have associated region - /// snapshots and are not being used by any other non-deleted volumes + /// snapshots and are not being used by any other non-deleted volumes, and + /// return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option, Volume)> { + ) -> ListResultVec<(Dataset, Region, Option)> { let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("find_deleted_volume_regions") .transaction(&conn, |conn| async move { @@ -1034,19 +1039,19 @@ impl DataStore { async fn find_deleted_volume_regions_txn( conn: &async_bb8_diesel::Connection, - ) -> Result< - Vec<(Dataset, Region, Option, Volume)>, - diesel::result::Error, - > { + ) -> Result)>, diesel::result::Error> + { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; use db::schema::volume::dsl as volume_dsl; - use db::schema::volume_resource_usage::dsl as ru_dsl; - // Find all regions and datasets - let tuples_superset = region_dsl::region - .inner_join( + // Find all read-write regions (read-only region cleanup is taken care + // of in soft_delete_volume_txn!) and their associated datasets + let unfiltered_deleted_regions = region_dsl::region + .filter(region_dsl::read_only.eq(false)) + // the volume may be hard deleted, so use a left join here + .left_join( volume_dsl::volume.on(region_dsl::volume_id.eq(volume_dsl::id)), ) .inner_join( @@ -1054,53 +1059,70 @@ impl DataStore { .on(region_dsl::dataset_id.eq(dataset_dsl::id)), ) // where there either are no region snapshots, or the region - // snapshot volume references have gone to zero + // snapshot volume has deleted = true .left_join( dsl::region_snapshot.on(dsl::region_id .eq(region_dsl::id) .and(dsl::dataset_id.eq(dataset_dsl::id))), ) - .filter( - dsl::volume_references - .eq(0) - // Despite the SQL specifying that this column is NOT NULL, - // this null check is required for this function to work! - // It's possible that the left join of region_snapshot above - // could join zero rows, making this null. 
- .or(dsl::volume_references.is_null()), - ) - // where the volume has already been soft-deleted - .filter(volume_dsl::time_deleted.is_not_null()) + .filter(dsl::deleting.eq(true).or(dsl::deleting.is_null())) // and return them (along with the volume so it can be hard deleted) .select(( Dataset::as_select(), Region::as_select(), Option::::as_select(), - Volume::as_select(), + // Diesel can't express a difference between + // + // a) the volume record existing and the nullable + // volume.time_deleted column being set to null + // b) the volume record does not exist (null due to left join) + // + // so return an Option and check below + Option::::as_select(), )) .load_async(conn) .await?; - // Filter out read-only regions that are still being used by volumes - let mut tuples = Vec::with_capacity(tuples_superset.len()); + let mut deleted_regions = + Vec::with_capacity(unfiltered_deleted_regions.len()); - for (dataset, region, region_snapshot, volume) in tuples_superset { - let region_usage_left = ru_dsl::volume_resource_usage - .filter( - ru_dsl::usage_type - .eq(VolumeResourceUsageType::ReadOnlyRegion), - ) - .filter(ru_dsl::region_id.eq(region.id())) - .count() - .get_result_async::(conn) - .await?; + for (dataset, region, region_snapshot, volume) in + unfiltered_deleted_regions + { + // only operate on soft or hard deleted volumes + let deleted = match &volume { + Some(volume) => volume.time_deleted.is_some(), + None => true, + }; - if region_usage_left == 0 { - tuples.push((dataset, region, region_snapshot, volume)); + if !deleted { + continue; } + + if region_snapshot.is_some() { + // We cannot delete this region: the presence of the region + // snapshot record means that the Crucible agent's snapshot has + // not been deleted yet (as the lifetime of the region snapshot + // record is equal to or longer than the lifetime of the + // Crucible agent's snapshot). + // + // This condition can occur when multiple volume delete sagas + // run concurrently: one will decrement the crucible resources + // (but hasn't made the appropriate DELETE calls to the + // appropriate Agents to tombstone the running snapshots and + // snapshots yet), and the other will be in the "delete freed + // regions" saga node trying to delete the region. Without this + // check, This race results in the Crucible Agent returning + // "must delete snapshots first" and causing saga unwinds. + // + // Another volume delete will pick up this region and remove it. 
+ continue; + } + + deleted_regions.push((dataset, region, volume)); } - Ok(tuples) + Ok(deleted_regions) } pub async fn read_only_resources_associated_with_volume( @@ -1477,6 +1499,10 @@ impl DataStore { } } + // After volume deletion, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(resources_to_delete) } @@ -1735,6 +1761,11 @@ impl DataStore { })?; } + // After read-only parent removal, validate + // invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(true) } } @@ -2713,6 +2744,11 @@ impl DataStore { }) })?; + // After region replacement, validate invariants for all + // volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(VolumeReplaceResult::Done) } }) @@ -3142,6 +3178,11 @@ impl DataStore { )); } + // After region snapshot replacement, validate invariants + // for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(VolumeReplaceResult::Done) } }) @@ -3540,6 +3581,120 @@ impl DataStore { } } +// Add some validation that runs only for tests +#[cfg(any(test, feature = "testing"))] +impl DataStore { + fn volume_invariant_violated(msg: String) -> diesel::result::Error { + diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::CheckViolation, + Box::new(msg), + ) + } + + /// Tests each Volume to see if invariants hold + /// + /// If an invariant is violated, this function returns a `CheckViolation` + /// with the text of what invariant was violated. + pub(crate) async fn validate_volume_invariants( + conn: &async_bb8_diesel::Connection, + ) -> Result<(), diesel::result::Error> { + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + + while let Some(p) = paginator.next() { + use db::schema::volume::dsl; + let haystack = + paginated(dsl::volume, dsl::id, &p.current_pagparams()) + .select(Volume::as_select()) + .get_results_async::(conn) + .await + .unwrap(); + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for volume in haystack { + Self::validate_volume_has_all_resources(&conn, volume).await?; + } + } + + Ok(()) + } + + /// Assert that the resources that comprise non-deleted volumes have not + /// been prematurely deleted. 
+ async fn validate_volume_has_all_resources( + conn: &async_bb8_diesel::Connection, + volume: Volume, + ) -> Result<(), diesel::result::Error> { + if volume.time_deleted.is_some() { + // Do not need to validate resources for soft-deleted volumes + return Ok(()); + } + + let vcr: VolumeConstructionRequest = + serde_json::from_str(&volume.data()).unwrap(); + + // validate all read/write resources still exist + + let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); + + let mut read_write_targets = + Vec::with_capacity(3 * num_read_write_subvolumes); + + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let sub_err = OptionalError::new(); + + let maybe_region = DataStore::target_to_region( + conn, &sub_err, &target, false, // read-write + ) + .await + .unwrap(); + + let Some(_region) = maybe_region else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {target}" + ))); + }; + } + + // validate all read-only resources still exist + + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &vcr, + &mut crucible_targets, + ); + crucible_targets + }; + + for read_only_target in &crucible_targets.read_only_targets { + let sub_err = OptionalError::new(); + + let maybe_usage = + DataStore::read_only_target_to_volume_resource_usage( + conn, + &sub_err, + read_only_target, + ) + .await + .unwrap(); + + let Some(_usage) = maybe_usage else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {read_only_target}" + ))); + }; + } + + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index c402c8fc92..29878364e6 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -216,14 +216,16 @@ impl RegionSnapshotReplacementFindAffected { Ok(Some(region_snapshot)) => region_snapshot, Ok(None) => { + // If the associated region snapshot was deleted, then there + // are no more volumes that reference it. This is not an + // error! Continue processing the other requests. 
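+                    // Accordingly this is logged at the info level and is not
+                    // recorded in the task's error list.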
let s = format!( - "region snapshot {} {} {} not found!", + "region snapshot {} {} {} not found", request.old_dataset_id, request.old_region_id, request.old_snapshot_id, ); - error!(&log, "{s}"); - status.errors.push(s); + info!(&log, "{s}"); continue; } diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index d615b55612..4e6c3e1e16 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -58,7 +58,6 @@ use crate::app::db::datastore::ReplacementTarget; use crate::app::db::datastore::VolumeReplaceResult; use crate::app::db::datastore::VolumeToDelete; use crate::app::db::datastore::VolumeWithTarget; -use crate::app::db::lookup::LookupPath; use crate::app::sagas::common_storage::find_only_new_region; use crate::app::sagas::declare_saga_actions; use crate::app::RegionAllocationStrategy; @@ -242,12 +241,21 @@ async fn rsrss_get_alloc_region_params( ); // Look up the existing snapshot - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + // Find the region to replace let db_region = osagactx .datastore() @@ -480,12 +488,22 @@ async fn rsrss_get_old_snapshot_volume_id( ¶ms.serialized_authn, ); - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + // Look up the existing snapshot + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + Ok(db_snapshot.volume_id) } @@ -510,6 +528,13 @@ async fn rsrss_create_fake_volume( gen: 0, opts: CrucibleOpts { id: new_volume_id, + // Do not put the new region ID here: it will be deleted during + // the associated garbage collection saga if + // `volume_replace_snapshot` does not perform the swap (which + // happens when the snapshot volume is deleted) and we still + // want it to exist when performing other replacement steps. If + // the replacement does occur, then the old address will swapped + // in here. target: vec![], lossy: false, flush_timeout: None, @@ -673,14 +698,15 @@ async fn rsrss_replace_snapshot_in_volume( } VolumeReplaceResult::ExistingVolumeDeleted => { - // Unwind the saga here to clean up the resources allocated during - // this saga. The associated background task will transition this - // request's state to Completed. - - Err(ActionError::action_failed(format!( - "existing volume {} deleted", - replacement_params.old_volume_id - ))) + // If the snapshot volume was deleted, we still want to proceed with + // replacing the rest of the uses of the region snapshot. 
Note this + // also covers the case where this saga node runs (performing the + // replacement), the executor crashes before it can record that + // success, and then before this node is rerun the snapshot is + // deleted. If this saga unwound here, that would violate the + // property of idempotency. + + Ok(()) } } } diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index c5e1b49f61..12d48cb367 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -29,7 +29,7 @@ use super::NexusSaga; use crate::app::sagas::declare_saga_actions; use nexus_db_model::Dataset; use nexus_db_model::Region; -use nexus_db_model::RegionSnapshot; +use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; use nexus_types::identity::Asset; @@ -330,8 +330,7 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = - Vec<(Dataset, Region, Option, Uuid)>; +type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// Deleting region snapshots in a previous saga node may have freed up regions /// that were deleted in the DB but couldn't be deleted by the Crucible Agent @@ -394,7 +393,7 @@ type FreedCrucibleRegions = /// The disk's volume has no read only resources, while the snapshot's volume /// does. The disk volume's targets are all regions (backed by downstairs that /// are read/write) while the snapshot volume's targets are all snapshots -/// (backed by volumes that are read-only). The two volumes are linked in the +/// (backed by downstairs that are read-only). The two volumes are linked in the /// sense that the snapshots from the second are contained *within* the regions /// of the first, reflecting the resource nesting from ZFS. This is also /// reflected in the REST endpoint that the Crucible agent uses: @@ -436,7 +435,7 @@ async fn svd_find_freed_crucible_regions( // Don't serialize the whole Volume, as the data field contains key material! Ok(freed_datasets_regions_and_volumes .into_iter() - .map(|x| (x.0, x.1, x.2, x.3.id())) + .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) .collect()) } @@ -451,23 +450,7 @@ async fn svd_delete_freed_crucible_regions( let freed_datasets_regions_and_volumes = sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, region_snapshot, volume_id) in - freed_datasets_regions_and_volumes - { - if region_snapshot.is_some() { - // We cannot delete this region yet, the snapshot has not been - // deleted. This can occur when multiple volume delete sagas run - // concurrently: one will decrement the crucible resources (but - // hasn't made the appropriate DELETE calls to remove the running - // snapshots and snapshots yet), and the other will be here trying - // to delete the region. This race results in the crucible agent - // returning "must delete snapshots first" and causing saga unwinds. - // - // Another volume delete (probably the one racing with this one!) - // will pick up this region and remove it. 
- continue; - } - + for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -496,14 +479,16 @@ async fn svd_delete_freed_crucible_regions( })?; // Remove volume DB record - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; + if let Some(volume_id) = volume_id { + osagactx.datastore().volume_hard_delete(volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; + } } Ok(()) diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index d6d7cb58ea..4b081d3e7f 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -2438,83 +2438,6 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( assert_eq!(datasets_and_regions.len(), REGION_REDUNDANCY_THRESHOLD); } -// Confirm that a region set can start at N, a region can be deleted, and the -// allocation CTE can bring the redundancy back to N. -#[nexus_test] -async fn test_region_allocation_after_delete( - cptestctx: &ControlPlaneTestContext, -) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - - // Create three 10 GiB zpools, each with one dataset. - let _disk_test = DiskTest::new(&cptestctx).await; - - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - - // Create a disk - let client = &cptestctx.external_client; - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, DISK_NAME).await; - - // Assert disk has three allocated regions - let disk_id = disk.identity.id; - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk_id) - .fetch() - .await - .unwrap_or_else(|_| panic!("test disk {:?} should exist", disk_id)); - - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); - - // Delete one of the regions - let region_to_delete: &nexus_db_model::Region = &allocated_regions[0].1; - datastore - .regions_hard_delete(&opctx.log, vec![region_to_delete.id()]) - .await - .unwrap(); - - // Assert disk's volume has one less allocated region - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD - 1); - - let region_total_size: ByteCount = ByteCount::try_from( - region_to_delete.block_size().to_bytes() - * region_to_delete.blocks_per_extent() - * region_to_delete.extent_count(), - ) - .unwrap(); - - // Rerun disk region allocation - datastore - .disk_region_allocate( - &opctx, - db_disk.volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from( - region_to_delete.block_size().to_bytes() as u32, - ) - .unwrap(), - }, - region_total_size, - &RegionAllocationStrategy::Random { seed: None }, - ) - .await - .unwrap(); - - // Assert redundancy was restored - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); -} - #[nexus_test] async fn 
test_no_halt_disk_delete_one_region_on_expunged_agent( cptestctx: &ControlPlaneTestContext, diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 5a03a6b95a..ff88de52e0 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -13,6 +13,7 @@ use http::method::Method; use http::StatusCode; use nexus_config::RegionAllocationStrategy; use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_model::RegionSnapshotReplacementState; use nexus_db_model::VolumeResourceUsage; use nexus_db_model::VolumeResourceUsageRecord; use nexus_db_queries::context::OpContext; @@ -50,6 +51,8 @@ use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; use omicron_common::api::internal; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; use omicron_uuid_kinds::GenericUuid; @@ -3757,22 +3760,14 @@ impl TestReadOnlyRegionReferenceUsage { .any(|(_, r)| r.id() == self.region.id())); } - pub async fn region_returned_by_find_deleted_volume_regions(&self) { - let deleted_volume_regions = - self.datastore.find_deleted_volume_regions().await.unwrap(); - - assert!(deleted_volume_regions - .into_iter() - .any(|(_, r, _, _)| r.id() == self.region.id())); - } - + // read-only regions should never be returned by find_deleted_volume_regions pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { let deleted_volume_regions = self.datastore.find_deleted_volume_regions().await.unwrap(); assert!(!deleted_volume_regions .into_iter() - .any(|(_, r, _, _)| r.id() == self.region.id())); + .any(|(_, r, _)| r.id() == self.region.id())); } pub async fn create_first_volume_region_in_rop(&self) { @@ -3923,9 +3918,9 @@ async fn test_read_only_region_reference_usage_sanity( harness.validate_region_returned_for_cleanup().await; - // It should be returned by find_deleted_volume_regions. + // It should not be returned by find_deleted_volume_regions. - harness.region_returned_by_find_deleted_volume_regions().await; + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that creating a volume with a read-only region in the ROP creates an @@ -3960,9 +3955,9 @@ async fn test_read_only_region_reference_sanity_rop( harness.validate_region_returned_for_cleanup().await; - // It should be returned by find_deleted_volume_regions. + // It should not be returned by find_deleted_volume_regions. - harness.region_returned_by_find_deleted_volume_regions().await; + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that creating multiple volumes with a read-only region creates the @@ -4007,7 +4002,9 @@ async fn test_read_only_region_reference_sanity_multi( harness.validate_region_returned_for_cleanup().await; - harness.region_returned_by_find_deleted_volume_regions().await; + // It should not be returned by find_deleted_volume_regions. 
+ + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that creating multiple volumes with a read-only region in the ROP @@ -4052,7 +4049,9 @@ async fn test_read_only_region_reference_sanity_rop_multi( harness.validate_region_returned_for_cleanup().await; - harness.region_returned_by_find_deleted_volume_regions().await; + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; } /// Assert that a read-only region is properly reference counted and not @@ -5530,3 +5529,302 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( assert_eq!(records_before, records_after); } + +#[nexus_test] +async fn test_double_layer_with_read_only_region_delete( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create disk, snapshot, then two disks from those snapshots + // 2) Replace one of the region snapshots in that snapshot volume with a + // read-only region + // 3) Delete in the following order: disk, snapshot, then two disks from the + // snapshot - after each delete, verify that crucible resources were not + // prematurely deleted + // 6) At the end, assert that all Crucible resources were cleaned up + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let _another_disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "another-disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's targets, + // causing a read-only region to be created. 
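+    // After this replacement the snapshot volume references a mix of region
+    // snapshots and a read-only region, so the deletions below exercise the
+    // accounting for both kinds of read-only resource.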
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Finally, delete another-disk-from-snapshot + + NexusRequest::object_delete( + client, + &get_disk_url("another-disk-from-snapshot"), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete another-disk-from-snapshot"); + + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_double_layer_snapshot_with_read_only_region_delete_2( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create disk, then a snapshot + // 2) Replace two of the region snapshots in that snapshot volume with + // read-only regions + // 3) Create a disk from the snapshot + // 4) Replace the last of the region snapshots in that snapshot volume with + // a read-only region + // 5) Delete in the following order: disk, snapshot, then the disk from the + // snapshot - after each delete, verify that crucible resources were not + // prematurely deleted + // 6) At the end, assert that all Crucible resources were cleaned up + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Perform region snapshot replacement for two of the snapshot's targets, + // causing two read-only regions to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + wait_for_condition( + || { + let datastore = datastore.clone(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + let (dataset, region) = &allocated_regions[1]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Create a disk from the snapshot + + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Replace the last of the region snapshot targets + + let (dataset, region) = &allocated_regions[2]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Assert everything was cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} diff --git a/schema/crdb/crucible-ref-count-records/up09.sql b/schema/crdb/crucible-ref-count-records/up09.sql new file mode 100644 index 0000000000..be56ed401c --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up09.sql @@ -0,0 +1,2 @@ +CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only + on omicron.public.region (read_only); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 
ae4dd3e65b..2984c62dd5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -622,6 +622,9 @@ CREATE INDEX IF NOT EXISTS lookup_regions_missing_ports on omicron.public.region (id) WHERE port IS NULL; +CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only + on omicron.public.region (read_only); + /* * A snapshot of a region, within a dataset. */ From 221751a4d9b4cc981429f90a5d4b9df77bcf4f11 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 16 Oct 2024 21:22:27 +0000 Subject: [PATCH 04/24] be explicit in comment! --- nexus/db-model/src/volume_resource_usage.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs index 0a76b3718d..14fb935d25 100644 --- a/nexus/db-model/src/volume_resource_usage.rs +++ b/nexus/db-model/src/volume_resource_usage.rs @@ -34,8 +34,9 @@ impl_enum_type!( /// this column, and more over a simple integer works for reference counting but /// does not tell you _what_ volume that use is from. This can be determined /// (see omdb's validate volume references command) but it's information that is -/// tossed out, as Nexus knows what volumes use what resources! Instead, record -/// what read-only resources a volume uses here. +/// tossed out, as Nexus knows what volumes use what resources! Instead of +/// throwing away that knowledge and only incrementing and decrementing an +/// integer, record what read-only resources a volume uses in this table. /// /// Note: users should not use this object directly, and instead use the /// [`VolumeResourceUsage`] enum, which is type-safe and will convert to and From 00d9f38791c6629132f22fe35246740a3db92dbd Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 16 Oct 2024 21:34:26 +0000 Subject: [PATCH 05/24] reduce denting --- nexus/db-queries/src/db/datastore/volume.rs | 81 ++++++++++----------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index c5dd3327d9..83d4973cb0 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -1251,52 +1251,51 @@ impl DataStore { // First, grab read-write regions - they're not shared, but they are // not candidates for deletion if there are region snapshots - { - let mut read_write_targets = - Vec::with_capacity(3 * num_read_write_subvolumes); - read_write_resources_associated_with_volume( - &vcr, - &mut read_write_targets, - ); + let mut read_write_targets = + Vec::with_capacity(3 * num_read_write_subvolumes); - for target in read_write_targets { - let sub_err = OptionalError::new(); + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); - let maybe_region = Self::target_to_region( - conn, &sub_err, &target, false, // read-write - ) - .await - .map_err(|e| { - if let Some(sub_err) = sub_err.take() { - err.bail(SoftDeleteTransactionError::AddressParseError( - sub_err, - )) - } else { - e - } - })?; + for target in read_write_targets { + let sub_err = OptionalError::new(); - let Some(region) = maybe_region else { - return Err(err.bail( - SoftDeleteTransactionError::CouldNotFindResource( - format!("could not find resource for {target}"), - ), - )); - }; + let maybe_region = Self::target_to_region( + conn, &sub_err, &target, false, // read-write + ) + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(SoftDeleteTransactionError::AddressParseError( + sub_err, + 
)) + } else { + e + } + })?; - // Filter out regions that have any region-snapshots - let region_snapshot_count: i64 = { - use db::schema::region_snapshot::dsl; - dsl::region_snapshot - .filter(dsl::region_id.eq(region.id())) - .count() - .get_result_async::(conn) - .await? - }; + let Some(region) = maybe_region else { + return Err(err.bail( + SoftDeleteTransactionError::CouldNotFindResource( + format!("could not find resource for {target}"), + ), + )); + }; - if region_snapshot_count == 0 { - regions.push(region.id()); - } + // Filter out regions that have any region-snapshots + let region_snapshot_count: i64 = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::region_id.eq(region.id())) + .count() + .get_result_async::(conn) + .await? + }; + + if region_snapshot_count == 0 { + regions.push(region.id()); } } From 412d66564b52d254aef3bf7aa907b11444ae252c Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 16 Oct 2024 21:35:11 +0000 Subject: [PATCH 06/24] TODO be smart enough --- nexus/db-queries/src/db/datastore/volume.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 83d4973cb0..263cb75996 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -3121,7 +3121,7 @@ impl DataStore { ) }) })? - // XXX be smart enough to .filter the above query + // TODO be smart enough to .filter the above query .into_iter() .filter(|record| record.volume_id == volume_to_delete_id.0) .count(); From 0a45763a2806ff16cc3e046e906ab58e18d18dda Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 16 Oct 2024 21:35:50 +0000 Subject: [PATCH 07/24] use TryFrom instead of From, as then a database edit cannot panic Nexus --- nexus/db-model/src/volume_resource_usage.rs | 49 ++++++++++++--------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs index 14fb935d25..c55b053c62 100644 --- a/nexus/db-model/src/volume_resource_usage.rs +++ b/nexus/db-model/src/volume_resource_usage.rs @@ -102,31 +102,40 @@ impl VolumeResourceUsageRecord { } } -impl From for VolumeResourceUsage { - fn from(record: VolumeResourceUsageRecord) -> VolumeResourceUsage { +impl TryFrom for VolumeResourceUsage { + type Error = String; + + fn try_from( + record: VolumeResourceUsageRecord, + ) -> Result { match record.usage_type { VolumeResourceUsageType::ReadOnlyRegion => { - VolumeResourceUsage::ReadOnlyRegion { - region_id: record - .region_id - .expect("valid read-only region usage record"), - } + let Some(region_id) = record.region_id else { + return Err("valid read-only region usage record".into()); + }; + + Ok(VolumeResourceUsage::ReadOnlyRegion { region_id }) } VolumeResourceUsageType::RegionSnapshot => { - VolumeResourceUsage::RegionSnapshot { - dataset_id: record - .region_snapshot_dataset_id - .expect("valid region snapshot usage record"), - - region_id: record - .region_snapshot_region_id - .expect("valid region snapshot usage record"), - - snapshot_id: record - .region_snapshot_snapshot_id - .expect("valid region snapshot usage record"), - } + let Some(dataset_id) = record.region_snapshot_dataset_id else { + return Err("valid region snapshot usage record".into()); + }; + + let Some(region_id) = record.region_snapshot_region_id else { + return Err("valid region snapshot usage record".into()); + }; + + let Some(snapshot_id) = 
record.region_snapshot_snapshot_id + else { + return Err("valid region snapshot usage record".into()); + }; + + Ok(VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }) } } } From 89744ea0659c25616af2f309c9f22878c591693b Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 16 Oct 2024 21:59:28 +0000 Subject: [PATCH 08/24] fmt --- nexus/db-queries/src/db/datastore/volume.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 263cb75996..2a25c809bc 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -1278,9 +1278,9 @@ impl DataStore { let Some(region) = maybe_region else { return Err(err.bail( - SoftDeleteTransactionError::CouldNotFindResource( - format!("could not find resource for {target}"), - ), + SoftDeleteTransactionError::CouldNotFindResource(format!( + "could not find resource for {target}" + )), )); }; From bd2ef84d4d9c387d92e3ac007ab7d10f52dcd96f Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Wed, 16 Oct 2024 22:00:01 +0000 Subject: [PATCH 09/24] add CONSTRAINT to volume_resource_usage table to separate two enum uses add comment to table as well --- .../crdb/crucible-ref-count-records/up04.sql | 24 +++++++++++- schema/crdb/dbinit.sql | 37 ++++++++++++++++++- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/schema/crdb/crucible-ref-count-records/up04.sql b/schema/crdb/crucible-ref-count-records/up04.sql index 7d3bba3119..247f5d99ec 100644 --- a/schema/crdb/crucible-ref-count-records/up04.sql +++ b/schema/crdb/crucible-ref-count-records/up04.sql @@ -11,5 +11,27 @@ CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( region_snapshot_region_id UUID, region_snapshot_snapshot_id UUID, - PRIMARY KEY (usage_id) + PRIMARY KEY (usage_id), + + CONSTRAINT exactly_one_usage_source CHECK ( + ( + (usage_type = 'read_only_region') AND + (region_id IS NOT NULL) AND + ( + region_snapshot_dataset_id IS NULL AND + region_snapshot_region_id IS NULL AND + region_snapshot_snapshot_id IS NULL + ) + ) + OR + ( + (usage_type = 'region_snapshot') AND + (region_id IS NOT NULL) AND + ( + region_snapshot_dataset_id IS NOT NULL AND + region_snapshot_region_id IS NOT NULL AND + region_snapshot_snapshot_id IS NOT NULL + ) + ) + ) ); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 2984c62dd5..f813cac397 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4489,6 +4489,11 @@ CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM ( 'region_snapshot' ); +/* + * This table records when a Volume makes use of a read-only resource. When + * there are no more entries for a particular read-only resource, then that + * resource can be garbage collected. 
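+ *
+ * For example: a read-only region with some id R may only be cleaned up
+ * once no row here has region_id = R, and a region snapshot only once no
+ * row matches its (dataset_id, region_id, snapshot_id) triple.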
+ */ CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( usage_id UUID NOT NULL, @@ -4496,13 +4501,43 @@ CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( usage_type omicron.public.volume_resource_usage_type NOT NULL, + /* + * This column contains a non-NULL value when the usage type is read_only + * region + */ region_id UUID, + /* + * These columns contain non-NULL values when the usage type is region + * snapshot + */ region_snapshot_dataset_id UUID, region_snapshot_region_id UUID, region_snapshot_snapshot_id UUID, - PRIMARY KEY (usage_id) + PRIMARY KEY (usage_id), + + CONSTRAINT exactly_one_usage_source CHECK ( + ( + (usage_type = 'read_only_region') AND + (region_id IS NOT NULL) AND + ( + region_snapshot_dataset_id IS NULL AND + region_snapshot_region_id IS NULL AND + region_snapshot_snapshot_id IS NULL + ) + ) + OR + ( + (usage_type = 'region_snapshot') AND + (region_id IS NOT NULL) AND + ( + region_snapshot_dataset_id IS NOT NULL AND + region_snapshot_region_id IS NOT NULL AND + region_snapshot_snapshot_id IS NOT NULL + ) + ) + ) ); CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage ( From af6e6e570c7deddeac0706b380278a41c7b2115c Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 17 Oct 2024 15:12:04 +0000 Subject: [PATCH 10/24] constraint typo --- schema/crdb/crucible-ref-count-records/up04.sql | 2 +- schema/crdb/dbinit.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/schema/crdb/crucible-ref-count-records/up04.sql b/schema/crdb/crucible-ref-count-records/up04.sql index 247f5d99ec..c020499418 100644 --- a/schema/crdb/crucible-ref-count-records/up04.sql +++ b/schema/crdb/crucible-ref-count-records/up04.sql @@ -26,7 +26,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( OR ( (usage_type = 'region_snapshot') AND - (region_id IS NOT NULL) AND + (region_id IS NULL) AND ( region_snapshot_dataset_id IS NOT NULL AND region_snapshot_region_id IS NOT NULL AND diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index f813cac397..f253555ef5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4530,7 +4530,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( OR ( (usage_type = 'region_snapshot') AND - (region_id IS NOT NULL) AND + (region_id IS NULL) AND ( region_snapshot_dataset_id IS NOT NULL AND region_snapshot_region_id IS NOT NULL AND From 47cc31e8d4c4154773555042fab9526b9269a9bc Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 17 Oct 2024 17:26:02 +0000 Subject: [PATCH 11/24] assert read-only regions never have snapshots --- nexus/db-queries/src/db/datastore/volume.rs | 81 ++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 2a25c809bc..fa2ebe214b 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -1365,6 +1365,30 @@ impl DataStore { .await?; if region_usage_left == 0 { + // There are several factors that mean Nexus does _not_ + // have to check here for region snapshots taken of + // read-only regions: + // + // - When a Crucible Volume receives a flush, it is only + // propagated to the subvolumes of that Volume, not + // the read-only parent. 
There's a few reasons for + // this, but the main one is that a Crucible flush + // changes the on-disk data, and the directory that + // the downstairs of a read-only parent is serving out + // of may be on read-only storage, as is the case when + // serving out of a .zfs/snapshot/ directory. + // + // - Even if a Crucible flush _did_ propagate to the + // read-only parent, Nexus should never directly send + // the snapshot volume to a place where it will be + // constructed, meaning the read-only regions of the + // snapshot volume's subvolumes will never themselves + // receive a flush. + // + // If either of these factors change, then that check is + // required here. The `validate_volume_invariants` + // function will return an error if a read-only region + // has an associated region snapshot during testing. regions.push(region_id); } } @@ -3608,13 +3632,32 @@ impl DataStore { .await .unwrap(); - paginator = p.found_batch(&haystack, &|r| r.id()); + paginator = p.found_batch(&haystack, &|v| v.id()); for volume in haystack { Self::validate_volume_has_all_resources(&conn, volume).await?; } } + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + + while let Some(p) = paginator.next() { + use db::schema::region::dsl; + let haystack = + paginated(dsl::region, dsl::id, &p.current_pagparams()) + .select(Region::as_select()) + .get_results_async::(conn) + .await + .unwrap(); + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for region in haystack { + Self::validate_read_only_region_has_no_snapshots(&conn, region) + .await?; + } + } + Ok(()) } @@ -3692,6 +3735,42 @@ impl DataStore { Ok(()) } + + /// Assert that read-only regions do not have any associated region + /// snapshots (see associated comment in `soft_delete_volume_txn`) + async fn validate_read_only_region_has_no_snapshots( + conn: &async_bb8_diesel::Connection, + region: Region, + ) -> Result<(), diesel::result::Error> { + if !region.read_only() { + return Ok(()); + } + + use db::schema::volume_resource_usage::dsl; + + let matching_usage_records: Vec = + dsl::volume_resource_usage + .filter( + dsl::usage_type.eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_region_id.eq(region.id())) + .select(VolumeResourceUsageRecord::as_select()) + .get_results_async(conn) + .await? 
+ .into_iter() + .map(|r| r.try_into().unwrap()) + .collect(); + + if !matching_usage_records.is_empty() { + return Err(Self::volume_invariant_violated(format!( + "read-only region {} has matching usage records: {:?}", + region.id(), + matching_usage_records, + ))); + } + + Ok(()) + } } #[cfg(test)] From 3d16f26fce843980ff1ad0964fd44d69dcf77383 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 00:24:57 +0000 Subject: [PATCH 12/24] DO NOT UNWRAP FOR THE LOVE OF ALL THAT IS HOLY --- nexus/db-queries/src/db/datastore/volume.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index fa2ebe214b..dece9c4d22 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -3629,8 +3629,7 @@ impl DataStore { paginated(dsl::volume, dsl::id, &p.current_pagparams()) .select(Volume::as_select()) .get_results_async::(conn) - .await - .unwrap(); + .await?; paginator = p.found_batch(&haystack, &|v| v.id()); @@ -3647,8 +3646,7 @@ impl DataStore { paginated(dsl::region, dsl::id, &p.current_pagparams()) .select(Region::as_select()) .get_results_async::(conn) - .await - .unwrap(); + .await?; paginator = p.found_batch(&haystack, &|r| r.id()); @@ -3693,8 +3691,7 @@ impl DataStore { let maybe_region = DataStore::target_to_region( conn, &sub_err, &target, false, // read-write ) - .await - .unwrap(); + .await?; let Some(_region) = maybe_region else { return Err(Self::volume_invariant_violated(format!( @@ -3723,8 +3720,7 @@ impl DataStore { &sub_err, read_only_target, ) - .await - .unwrap(); + .await?; let Some(_usage) = maybe_usage else { return Err(Self::volume_invariant_violated(format!( From d4de663b70a3a8e71f20eb00f0995e16923dc4be Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 00:36:33 +0000 Subject: [PATCH 13/24] Implement a deleting flag for read-only regions They can also have the use-after-free problem that region snapshots do! 
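Like a region snapshot, a read-only region can be referenced by several
volumes at once, so there is a window where one volume's soft-delete decides
the region is unused and returns it for cleanup while a racing volume_create
takes a new reference to it. The deleting flag closes that window: the
soft-delete path flips it with a conditional update, so at most one caller
completes the transition and takes responsibility for cleanup, and the
lookups that create new usage records skip any region with deleting set.

A rough sketch of the invariant the flag provides (illustration only, with
invented names - the real implementation uses conditional UPDATEs inside the
volume transactions, as in the hunks below):

    #[derive(Clone, Copy, PartialEq)]
    enum RoRegionState { Live, Deleting }

    // mirrors `.filter(deleting.eq(false))` in the region lookup
    fn take_new_reference(state: RoRegionState) -> Option<()> {
        if state == RoRegionState::Live { Some(()) } else { None }
    }

    // mirrors the conditional UPDATE: only one caller wins the transition
    // and becomes responsible for garbage collecting the region
    fn release_last_reference(state: &mut RoRegionState) -> bool {
        if *state == RoRegionState::Live {
            *state = RoRegionState::Deleting;
            true
        } else {
            false
        }
    }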
--- nexus/db-model/src/region.rs | 8 + nexus/db-model/src/schema.rs | 2 + nexus/db-queries/src/db/datastore/volume.rs | 29 +- .../src/db/queries/region_allocation.rs | 5 +- .../output/region_allocate_distinct_sleds.sql | 21 +- .../output/region_allocate_random_sleds.sql | 21 +- ..._allocate_with_snapshot_distinct_sleds.sql | 21 +- ...on_allocate_with_snapshot_random_sleds.sql | 21 +- nexus/test-utils/src/background.rs | 2 +- .../integration_tests/volume_management.rs | 490 ++++++++++++++++++ .../crdb/crucible-ref-count-records/up10.sql | 1 + .../crdb/crucible-ref-count-records/up11.sql | 1 + schema/crdb/dbinit.sql | 4 +- 13 files changed, 591 insertions(+), 35 deletions(-) create mode 100644 schema/crdb/crucible-ref-count-records/up10.sql create mode 100644 schema/crdb/crucible-ref-count-records/up11.sql diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index e38c5543ef..02c7db8120 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -46,6 +46,10 @@ pub struct Region { // A region may be read-only read_only: bool, + + // Shared read-only regions require a "deleting" flag to avoid a + // use-after-free scenario + deleting: bool, } impl Region { @@ -67,6 +71,7 @@ impl Region { extent_count: extent_count as i64, port: Some(port.into()), read_only, + deleting: false, } } @@ -99,4 +104,7 @@ impl Region { pub fn read_only(&self) -> bool { self.read_only } + pub fn deleting(&self) -> bool { + self.deleting + } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index ca0bf3d3da..d15698afa4 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1049,6 +1049,8 @@ table! { port -> Nullable, read_only -> Bool, + + deleting -> Bool, } } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index dece9c4d22..3a522b1b53 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -199,6 +199,8 @@ impl DataStore { Ok(volume) } + /// Return a region by matching against the dataset's address and region's + /// port async fn target_to_region( conn: &async_bb8_diesel::Connection, err: &OptionalError, @@ -208,8 +210,6 @@ impl DataStore { let address: SocketAddrV6 = target.parse().map_err(|e| err.bail(e))?; let ip: db::model::Ipv6Addr = address.ip().into(); - // Match region by dataset id and target port - use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; @@ -224,6 +224,7 @@ impl DataStore { .eq(Some::(address.port().into())), ) .filter(region_dsl::read_only.eq(read_only)) + .filter(region_dsl::deleting.eq(false)) .select(Region::as_select()) .get_result_async::(conn) .await @@ -1389,6 +1390,28 @@ impl DataStore { // required here. The `validate_volume_invariants` // function will return an error if a read-only region // has an associated region snapshot during testing. + // + // However, don't forget to set `deleting`! These + // regions will be returned to the calling function for + // garbage collection. 
+ use db::schema::region::dsl; + let updated_rows = diesel::update(dsl::region) + .filter(dsl::id.eq(region_id)) + .filter(dsl::read_only.eq(true)) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting (region)".into(), + ) + )); + } + regions.push(region_id); } } @@ -1476,7 +1499,7 @@ impl DataStore { return Err(err.bail( SoftDeleteTransactionError::UnexpectedDatabaseUpdate( updated_rows, - "setting deleting".into(), + "setting deleting (region snapshot)".into(), ) )); } diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index dbf37fda2e..9eaf82ff31 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -255,7 +255,8 @@ pub fn allocation_query( ").param().sql(" AS blocks_per_extent, ").param().sql(" AS extent_count, NULL AS port, - ").param().sql(" AS read_only + ").param().sql(" AS read_only, + FALSE as deleting FROM shuffled_candidate_datasets") // Only select the *additional* number of candidate regions for the required // redundancy level @@ -368,7 +369,7 @@ pub fn allocation_query( .sql(" inserted_regions AS ( INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only) + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting) SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql(" FROM candidate_regions WHERE diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql index 4e7dde244b..aa70a96984 100644 --- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -101,7 +102,8 @@ WITH $9 AS blocks_per_extent, $10 AS extent_count, NULL AS port, - $11 AS read_only + $11 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -211,7 +213,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -223,7 +226,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -238,7 +242,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -295,7 +300,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -322,7 +328,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git 
a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index b2c164a6d9..f067ad7c74 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -99,7 +100,8 @@ WITH $8 AS blocks_per_extent, $9 AS extent_count, NULL AS port, - $10 AS read_only + $10 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -209,7 +211,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -221,7 +224,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -236,7 +240,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -293,7 +298,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -320,7 +326,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql index 97ee23f82e..5571e4f2bc 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -112,7 +113,8 @@ WITH $10 AS blocks_per_extent, $11 AS extent_count, NULL AS port, - $12 AS read_only + $12 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -222,7 +224,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -234,7 +237,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -249,7 +253,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -306,7 +311,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -333,7 +339,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git 
a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql index a1cc103594..c363bc666a 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -110,7 +111,8 @@ WITH $9 AS blocks_per_extent, $10 AS extent_count, NULL AS port, - $11 AS read_only + $11 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -220,7 +222,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -232,7 +235,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -247,7 +251,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -304,7 +309,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -331,7 +337,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 859796f556..b0474037a8 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -332,7 +332,7 @@ pub async fn run_replacement_tasks_to_completion( } }, &Duration::from_secs(1), - &Duration::from_secs(10), + &Duration::from_secs(20), ) .await .unwrap(); diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index ff88de52e0..767ed7e4aa 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -14,6 +14,7 @@ use http::StatusCode; use nexus_config::RegionAllocationStrategy; use nexus_db_model::RegionSnapshotReplacement; use nexus_db_model::RegionSnapshotReplacementState; +use nexus_db_model::Volume; use nexus_db_model::VolumeResourceUsage; use nexus_db_model::VolumeResourceUsageRecord; use nexus_db_queries::context::OpContext; @@ -5828,3 +5829,492 @@ async fn test_double_layer_snapshot_with_read_only_region_delete_2( assert!(disk_test.crucible_resources_deleted().await); } + +#[nexus_test] +async fn test_no_zombie_region_snapshots(cptestctx: &ControlPlaneTestContext) { + // 1) Create disk, then a snapshot + // 2) Delete the disk + // 3) Create a volume that uses the snapshot volume as a read-only parent + // 4) Test that a race of the following steps does not cause a region + // snapshot to have volume usage records even though it is marked for + // deletion: + // + // a) delete the snapshot volume + // b) delete the volume created in step 3 + // c) create another volume that uses the snapshot volume as a read-only + // parent + + let client = &cptestctx.external_client; + let apictx = 
&cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create disk, then a snapshot + + let _disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Delete the disk + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // Create a volume that uses the snapshot volume as a read-only parent + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let snapshot_volume: Volume = datastore + .volume_get(db_snapshot.volume_id) + .await + .expect("volume_get without error") + .expect("volume exists"); + + let snapshot_vcr: VolumeConstructionRequest = + serde_json::from_str(snapshot_volume.data()).unwrap(); + + let step_3_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_3_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_3_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(snapshot_vcr.clone())), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Soft-delete the snapshot volume + + let cr = datastore.soft_delete_volume(db_snapshot.volume_id).await.unwrap(); + + // Assert that no resources are returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Soft-delete the volume created as part of step 3 + + let cr = datastore.soft_delete_volume(step_3_volume_id).await.unwrap(); + + // Assert that region snapshots _are_ returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(!datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Pretend that there's a racing call to volume_create that has a volume + // that uses the snapshot volume as a read-only parent. This call should + // fail! 
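+    //
+    // (Expected mechanism, per the datastore changes in this series: the
+    // region snapshots backing the snapshot volume were marked `deleting`
+    // by the soft-delete above, so volume_create's usage-record lookup
+    // should no longer match them, and the create should fail rather than
+    // resurrect them.)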
+ + let racing_volume_id = Uuid::new_v4(); + let result = datastore + .volume_create(nexus_db_model::Volume::new( + racing_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: racing_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(snapshot_vcr.clone())), + }) + .unwrap(), + )) + .await; + + assert!(result.is_err()); +} + +#[nexus_test] +async fn test_no_zombie_read_only_regions(cptestctx: &ControlPlaneTestContext) { + // 1) Create a volume with three read-only regions + // 2) Create another volume that uses the first step's volume as a read-only + // parent + // 3) Test that a race of the following steps does not cause a region to + // have volume usage records: + // + // a) delete the step 1 volume + // b) delete the step 2 volume + // c) create another volume that uses the step 1 volume as a read-only + // parent + + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + // Create a volume with three read-only regions + + let step_1_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: step_1_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 3, + ) + .await + .unwrap(); + + // We're not sending allocation requests to any simulated crucible agent, so + // fill in a random port here. 
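+    //
+    // (The port matters because target_to_region matches a target against
+    // the dataset's address plus the region's port, so distinct ports give
+    // each region a distinct target.)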
+ for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + datastore.region_set_port(region.id(), 20000 + i as u16).await.unwrap(); + } + + let region_addrs: Vec = vec![ + datastore + .region_addr(datasets_and_regions[0].1.id()) + .await + .unwrap() + .unwrap(), + datastore + .region_addr(datasets_and_regions[1].1.id()) + .await + .unwrap() + .unwrap(), + datastore + .region_addr(datasets_and_regions[2].1.id()) + .await + .unwrap() + .unwrap(), + ]; + + // Assert they're all unique + + assert_ne!(region_addrs[0], region_addrs[1]); + assert_ne!(region_addrs[0], region_addrs[2]); + assert_ne!(region_addrs[1], region_addrs[2]); + + // Mimic what a snapshot volume has: three read-only regions in a volume's + // subvolume, not the read-only parent + + datastore + .volume_create(nexus_db_model::Volume::new( + step_1_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_1_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_addrs[0].to_string(), + region_addrs[1].to_string(), + region_addrs[2].to_string(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Create another volume that uses the first step's volume as a read-only + // parent + + let step_1_volume: Volume = datastore + .volume_get(step_1_volume_id) + .await + .expect("volume_get without error") + .expect("volume exists"); + + let step_1_vcr: VolumeConstructionRequest = + serde_json::from_str(step_1_volume.data()).unwrap(); + + let step_2_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_2_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_2_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(step_1_vcr.clone())), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Soft-delete the step 1 volume + + let cr = datastore.soft_delete_volume(step_1_volume_id).await.unwrap(); + + // Assert that no resources are returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Soft-delete the step 2 volume + + let cr = datastore.soft_delete_volume(step_2_volume_id).await.unwrap(); + + // Assert that the read-only regions _are_ returned for clean-up + + assert!(!datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Pretend that there's a racing call to volume_create that has a volume + // that uses the step 1 volume as a read-only parent. This call should + // fail! 
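+    //
+    // (The mechanism here is visible above: target_to_region filters on
+    // `deleting = false`, and the read-only regions were just marked
+    // deleting by the soft-delete, so volume_create cannot match them and
+    // the create returns an error.)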
+ + let racing_volume_id = Uuid::new_v4(); + let result = datastore + .volume_create(nexus_db_model::Volume::new( + racing_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: racing_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(step_1_vcr)), + }) + .unwrap(), + )) + .await; + + assert!(result.is_err()); +} + +#[nexus_test] +async fn test_no_zombie_read_write_regions( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create a volume with three read-write regions + // 2) Create another volume that uses the first step's volume as a read-only + // parent + // 3) Test that a race of the following steps does not cause a region to + // have volume usage records: + // + // a) delete the step 1 volume + // b) delete the step 2 volume + // c) create another volume that uses the step 1 volume as a read-only + // parent + + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + // Create a volume with three read-only regions + + let step_1_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: step_1_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 3, + ) + .await + .unwrap(); + + // We're not sending allocation requests to any simulated crucible agent, so + // fill in a random port here. 
+ for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + datastore.region_set_port(region.id(), 20000 + i as u16).await.unwrap(); + } + + let region_addrs: Vec = vec![ + datastore + .region_addr(datasets_and_regions[0].1.id()) + .await + .unwrap() + .unwrap(), + datastore + .region_addr(datasets_and_regions[1].1.id()) + .await + .unwrap() + .unwrap(), + datastore + .region_addr(datasets_and_regions[2].1.id()) + .await + .unwrap() + .unwrap(), + ]; + + // Assert they're all unique + + assert_ne!(region_addrs[0], region_addrs[1]); + assert_ne!(region_addrs[0], region_addrs[2]); + assert_ne!(region_addrs[1], region_addrs[2]); + + // Mimic what a snapshot volume has: three read-only regions in a volume's + // subvolume, not the read-only parent + + datastore + .volume_create(nexus_db_model::Volume::new( + step_1_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_1_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_addrs[0].to_string(), + region_addrs[1].to_string(), + region_addrs[2].to_string(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Create another volume that uses the first step's volume as a read-only + // parent + + let step_1_volume: Volume = datastore + .volume_get(step_1_volume_id) + .await + .expect("volume_get without error") + .expect("volume exists"); + + let step_1_vcr: VolumeConstructionRequest = + serde_json::from_str(step_1_volume.data()).unwrap(); + + let step_2_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_2_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_2_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(step_1_vcr.clone())), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Soft-delete the step 1 volume + + let cr = datastore.soft_delete_volume(step_1_volume_id).await.unwrap(); + + // Assert that no resources are returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Soft-delete the step 2 volume + + let cr = datastore.soft_delete_volume(step_2_volume_id).await.unwrap(); + + // Assert that the read-only regions _are_ returned for clean-up + + assert!(!datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Pretend that there's a racing call to volume_create that has a volume + // that uses the step 1 volume as a read-only parent. This call should + // fail! 
+ + let racing_volume_id = Uuid::new_v4(); + let result = datastore + .volume_create(nexus_db_model::Volume::new( + racing_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: racing_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(step_1_vcr)), + }) + .unwrap(), + )) + .await; + + assert!(result.is_err()); +} diff --git a/schema/crdb/crucible-ref-count-records/up10.sql b/schema/crdb/crucible-ref-count-records/up10.sql new file mode 100644 index 0000000000..5d7390e124 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up10.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region ADD COLUMN IF NOT EXISTS deleting BOOLEAN NOT NULL DEFAULT false; diff --git a/schema/crdb/crucible-ref-count-records/up11.sql b/schema/crdb/crucible-ref-count-records/up11.sql new file mode 100644 index 0000000000..ef7845c8d0 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up11.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.region ALTER COLUMN deleting DROP DEFAULT; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index f253555ef5..92a5ff4f7c 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -599,7 +599,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region ( port INT4, - read_only BOOL NOT NULL + read_only BOOL NOT NULL, + + deleting BOOL NOT NULL ); /* From 1b57804b957535e78884e73c293a250ff59a83d4 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 00:41:17 +0000 Subject: [PATCH 14/24] use REGION_REDUNDANCY_THRESHOLD instead of 3 --- nexus/db-queries/src/db/datastore/volume.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 3a522b1b53..e7f26fdbd4 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -8,6 +8,7 @@ use super::DataStore; use crate::db; use crate::db::datastore::OpContext; use crate::db::datastore::RunnableQuery; +use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; @@ -1244,16 +1245,18 @@ impl DataStore { let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); - let mut regions: Vec = - Vec::with_capacity(3 * num_read_write_subvolumes); + let mut regions: Vec = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); let mut region_snapshots: Vec = Vec::with_capacity(crucible_targets.read_only_targets.len()); // First, grab read-write regions - they're not shared, but they are // not candidates for deletion if there are region snapshots - let mut read_write_targets = - Vec::with_capacity(3 * num_read_write_subvolumes); + let mut read_write_targets = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); read_write_resources_associated_with_volume( &vcr, @@ -3700,8 +3703,9 @@ impl DataStore { let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); - let mut read_write_targets = - Vec::with_capacity(3 * num_read_write_subvolumes); + let mut read_write_targets = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); read_write_resources_associated_with_volume( &vcr, From eae5f310a9f4cae65b179abea01751918548e3e8 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 05:31:43 +0000 Subject: [PATCH 15/24] use an enum instead of bool --- 
nexus/db-queries/src/db/datastore/volume.rs | 24 +++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index e7f26fdbd4..704b38ddce 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -111,6 +111,11 @@ enum VolumeCreationError { CouldNotFindResource(String), } +enum RegionType { + ReadWrite, + ReadOnly, +} + impl DataStore { async fn volume_create_txn( conn: &async_bb8_diesel::Connection, @@ -206,7 +211,7 @@ impl DataStore { conn: &async_bb8_diesel::Connection, err: &OptionalError, target: &str, - read_only: bool, + region_type: RegionType, ) -> Result, diesel::result::Error> { let address: SocketAddrV6 = target.parse().map_err(|e| err.bail(e))?; let ip: db::model::Ipv6Addr = address.ip().into(); @@ -214,6 +219,11 @@ impl DataStore { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; + let read_only = match region_type { + RegionType::ReadWrite => false, + RegionType::ReadOnly => true, + }; + dataset_dsl::dataset .inner_join( region_dsl::region @@ -266,7 +276,7 @@ impl DataStore { conn, err, read_only_target, - true, // read-only + RegionType::ReadOnly, ) .await?; @@ -1267,7 +1277,10 @@ impl DataStore { let sub_err = OptionalError::new(); let maybe_region = Self::target_to_region( - conn, &sub_err, &target, false, // read-write + conn, + &sub_err, + &target, + RegionType::ReadWrite, ) .await .map_err(|e| { @@ -3716,7 +3729,10 @@ impl DataStore { let sub_err = OptionalError::new(); let maybe_region = DataStore::target_to_region( - conn, &sub_err, &target, false, // read-write + conn, + &sub_err, + &target, + RegionType::ReadWrite, ) .await?; From 0e0a639516312ae484c1eb7b563e27ae8fe4b20f Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 06:03:47 +0000 Subject: [PATCH 16/24] use SocketAddrV6 instead of passing around strings --- nexus/db-queries/src/db/datastore/volume.rs | 143 +++++++------------- 1 file changed, 51 insertions(+), 92 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 704b38ddce..c7a81d7bd9 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -108,7 +108,7 @@ enum VolumeCreationError { AddressParseError(#[from] AddrParseError), #[error("Could not match read-only resource to {0}")] - CouldNotFindResource(String), + CouldNotFindResource(SocketAddrV6), } enum RegionType { @@ -159,21 +159,15 @@ impl DataStore { use db::schema::volume_resource_usage::dsl as ru_dsl; for read_only_target in crucible_targets.read_only_targets { - let sub_err = OptionalError::new(); + let read_only_target = read_only_target.parse().map_err(|e| { + err.bail(VolumeCreationError::AddressParseError(e)) + })?; let maybe_usage = Self::read_only_target_to_volume_resource_usage( conn, - &sub_err, &read_only_target, ) - .await - .map_err(|e| { - if let Some(sub_err) = sub_err.take() { - err.bail(VolumeCreationError::AddressParseError(sub_err)) - } else { - e - } - })?; + .await?; match maybe_usage { Some(usage) => { @@ -209,12 +203,10 @@ impl DataStore { /// port async fn target_to_region( conn: &async_bb8_diesel::Connection, - err: &OptionalError, - target: &str, + target: &SocketAddrV6, region_type: RegionType, ) -> Result, diesel::result::Error> { - let address: SocketAddrV6 = target.parse().map_err(|e| err.bail(e))?; - let ip: db::model::Ipv6Addr = 
address.ip().into(); + let ip: db::model::Ipv6Addr = target.ip().into(); use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; @@ -232,7 +224,7 @@ impl DataStore { .filter(dataset_dsl::ip.eq(ip)) .filter( region_dsl::port - .eq(Some::(address.port().into())), + .eq(Some::(target.port().into())), ) .filter(region_dsl::read_only.eq(read_only)) .filter(region_dsl::deleting.eq(false)) @@ -244,8 +236,7 @@ impl DataStore { async fn read_only_target_to_volume_resource_usage( conn: &async_bb8_diesel::Connection, - err: &OptionalError, - read_only_target: &str, + read_only_target: &SocketAddrV6, ) -> Result, diesel::result::Error> { // Easy case: it's a region snapshot, and we can match by the snapshot // address directly @@ -274,7 +265,6 @@ impl DataStore { let maybe_region = Self::target_to_region( conn, - err, read_only_target, RegionType::ReadOnly, ) @@ -1274,25 +1264,14 @@ impl DataStore { ); for target in read_write_targets { - let sub_err = OptionalError::new(); - - let maybe_region = Self::target_to_region( - conn, - &sub_err, - &target, - RegionType::ReadWrite, - ) - .await - .map_err(|e| { - if let Some(sub_err) = sub_err.take() { - err.bail(SoftDeleteTransactionError::AddressParseError( - sub_err, - )) - } else { - e - } + let target = target.parse().map_err(|e| { + err.bail(SoftDeleteTransactionError::AddressParseError(e)) })?; + let maybe_region = + Self::target_to_region(conn, &target, RegionType::ReadWrite) + .await?; + let Some(region) = maybe_region else { return Err(err.bail( SoftDeleteTransactionError::CouldNotFindResource(format!( @@ -1319,23 +1298,15 @@ impl DataStore { for read_only_target in &crucible_targets.read_only_targets { use db::schema::volume_resource_usage::dsl as ru_dsl; - let sub_err = OptionalError::new(); + let read_only_target = read_only_target.parse().map_err(|e| { + err.bail(SoftDeleteTransactionError::AddressParseError(e)) + })?; let maybe_usage = Self::read_only_target_to_volume_resource_usage( conn, - &sub_err, - read_only_target, + &read_only_target, ) - .await - .map_err(|e| { - if let Some(sub_err) = sub_err.take() { - err.bail(SoftDeleteTransactionError::AddressParseError( - sub_err, - )) - } else { - e - } - })?; + .await?; let Some(usage) = maybe_usage else { return Err(err.bail( @@ -1504,7 +1475,7 @@ impl DataStore { .filter(dsl::snapshot_id.eq(snapshot_id)) .filter( dsl::snapshot_addr - .eq(read_only_target.clone()), + .eq(read_only_target.to_string()), ) .filter(dsl::deleting.eq(false)) .set(dsl::deleting.eq(true)) @@ -1765,7 +1736,12 @@ impl DataStore { .await?; if num_updated != 1 { - return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); + return Err(err.bail( + RemoveReadOnlyParentError::UnexpectedDatabaseUpdate( + num_updated, + 1 + ) + )); } // Update the volume resource usage record for @@ -1780,21 +1756,17 @@ impl DataStore { }; for read_only_target in crucible_targets.read_only_targets { - let sub_err = OptionalError::new(); + let read_only_target = read_only_target + .parse() + .map_err(|e| err.bail( + RemoveReadOnlyParentError::AddressParseError(e) + ))?; let maybe_usage = Self::read_only_target_to_volume_resource_usage( &conn, - &sub_err, &read_only_target, ) - .await - .map_err(|e| { - if let Some(sub_err) = sub_err.take() { - err.bail(RemoveReadOnlyParentError::AddressParseError(sub_err)) - } else { - e - } - })?; + .await?; let Some(usage) = maybe_usage else { return Err(err.bail( @@ -2519,8 +2491,6 @@ pub struct VolumeReplacementParams { #[derive(Debug, Clone, 
Copy)] pub struct VolumeWithTarget(pub Uuid); -// Note: it would be easier to pass around strings, but comparison could fail -// due to formatting issues, so pass around SocketAddrV6 here #[derive(Debug, Clone, Copy)] pub struct ExistingTarget(pub SocketAddrV6); @@ -3080,21 +3050,11 @@ impl DataStore { // could either be a read-only region or a region snapshot, // so determine what it is first - let sub_err = OptionalError::new(); let maybe_existing_usage = Self::read_only_target_to_volume_resource_usage( &conn, - &sub_err, - &existing.0.to_string(), + &existing.0, ) - .await - .map_err(|e| if let Some(sub_err) = sub_err.take() { - err.bail(VolumeReplaceSnapshotError::AddressParseError( - sub_err - )) - } else { - e - } - )?; + .await?; let Some(existing_usage) = maybe_existing_usage else { return Err(err.bail( @@ -3127,22 +3087,12 @@ impl DataStore { }) })?; - let sub_err = OptionalError::new(); let maybe_replacement_usage = Self::read_only_target_to_volume_resource_usage( &conn, - &sub_err, - &replacement.0.to_string(), + &replacement.0, ) - .await - .map_err(|e| if let Some(sub_err) = sub_err.take() { - err.bail(VolumeReplaceSnapshotError::AddressParseError( - sub_err - )) - } else { - e - } - )?; + .await?; let Some(replacement_usage) = maybe_replacement_usage else { return Err(err.bail( @@ -3726,11 +3676,17 @@ impl DataStore { ); for target in read_write_targets { - let sub_err = OptionalError::new(); + let target = match target.parse() { + Ok(t) => t, + Err(e) => { + return Err(Self::volume_invariant_violated(format!( + "could not parse {target}: {e}" + ))); + } + }; let maybe_region = DataStore::target_to_region( conn, - &sub_err, &target, RegionType::ReadWrite, ) @@ -3755,13 +3711,16 @@ impl DataStore { }; for read_only_target in &crucible_targets.read_only_targets { - let sub_err = OptionalError::new(); + let read_only_target = read_only_target.parse().map_err(|e| { + Self::volume_invariant_violated(format!( + "could not parse {read_only_target}: {e}" + )) + })?; let maybe_usage = DataStore::read_only_target_to_volume_resource_usage( conn, - &sub_err, - read_only_target, + &read_only_target, ) .await?; From 58169feb63c0a6f4d9f8079b37ab06b2f9dd58ea Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 06:19:17 +0000 Subject: [PATCH 17/24] fix bad wording --- nexus/db-queries/src/db/datastore/volume.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index c7a81d7bd9..7559c69bc8 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -3105,18 +3105,17 @@ impl DataStore { ); }; - // This function may be called with a replacement volume - // that is completely blank, to be filled in later by this - // function. `volume_create` will have been called but will - // not have added any volume resource usage records, because - // it was blank! + // The intention leaving this transaction is that the + // correct volume resource usage records exist, so: // - // The indention leaving this transaction is that the - // correct volume resource usage records exist, so if this - // is the case, create a new record. 
+ // - if no usage record existed for the replacement usage, + // then create a new record that points to the volume + // id (this can happen if the volume to delete was blank + // when coming into this function) // - // If the replacement volume usage records exist, then - // perform a swap instead. + // - if records exist for the "replacement" usage, then one + // of those will match the volume to delete id, so perform + // a swap instead to the volume id let existing_replacement_volume_usage_records = Self::volume_usage_records_for_resource_query( From 02ebcbe4f207c1f47a3586241065009cd826781b Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 06:25:19 +0000 Subject: [PATCH 18/24] correct 2 bit variant for v4 uuid --- schema/crdb/crucible-ref-count-records/up08.sql | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/schema/crdb/crucible-ref-count-records/up08.sql b/schema/crdb/crucible-ref-count-records/up08.sql index 9bcfebab76..a19559b6b9 100644 --- a/schema/crdb/crucible-ref-count-records/up08.sql +++ b/schema/crdb/crucible-ref-count-records/up08.sql @@ -1,3 +1,5 @@ +/* Construct a UUIDv4 deterministically from the existing data in the region + snapshot records to populate the volume resource usage records */ INSERT INTO volume_resource_usage ( SELECT ( @@ -6,7 +8,7 @@ INSERT INTO volume_resource_usage ( MD5(volume.id::TEXT || dataset_id::TEXT || region_id::TEXT || snapshot_id::TEXT || snapshot_addr || volume_references::TEXT) PLACING '4' from 13 ) - PLACING TO_HEX(volume_references) from 17 + PLACING '8' from 17 )::uuid ) as usage_id, volume.id AS volume_id, From b8cccb2c59ae826ac2e5feec6ac4cd37d7d34cb6 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 17:16:47 +0000 Subject: [PATCH 19/24] actually correct ipv6 --- .../integration_tests/volume_management.rs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 767ed7e4aa..887afff20f 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -2211,38 +2211,38 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101:7]:19016"), + String::from("[fd00:1122:3344:101::7]:19016"), ), ( zpool1.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:102:7]:19016"), + String::from("[fd00:1122:3344:102::7]:19016"), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103:7]:19016"), + String::from("[fd00:1122:3344:103::7]:19016"), ), // second snapshot-create ( zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101:7]:19016"), // duplicate! + String::from("[fd00:1122:3344:101::7]:19016"), // duplicate! 
), ( zpool3.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:104:7]:19016"), + String::from("[fd00:1122:3344:104::7]:19016"), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103:7]:19017"), + String::from("[fd00:1122:3344:103::7]:19017"), ), ]; @@ -5380,25 +5380,25 @@ async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101:7]:19016"), + String::from("[fd00:1122:3344:101::7]:19016"), ), ( zpool1.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:102:7]:19016"), + String::from("[fd00:1122:3344:102::7]:19016"), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103:7]:19016"), + String::from("[fd00:1122:3344:103::7]:19016"), ), ( zpool3.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:104:7]:19016"), + String::from("[fd00:1122:3344:104::7]:19016"), ), ]; From 3849fe8566c2d7b77ef09a57c746181a607ef53a Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 17:37:29 +0000 Subject: [PATCH 20/24] we do not need to panic --- nexus/db-queries/src/db/datastore/volume.rs | 43 +++++++++++++++++---- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 7559c69bc8..66f94d5c72 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -905,7 +905,9 @@ impl DataStore { // We don't support a pure Region VCR at the volume // level in the database, so this choice should // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); + return Err(err.bail(VolumeGetError::InvalidVolume( + String::from("Region not supported as a top level volume") + ))); } VolumeConstructionRequest::File { id: _, @@ -1164,6 +1166,9 @@ enum SoftDeleteTransactionError { #[error("Address parsing error during Volume soft-delete: {0}")] AddressParseError(#[from] AddrParseError), + + #[error("Invalid Volume: {0}")] + InvalidVolume(String), } impl DataStore { @@ -1243,7 +1248,18 @@ impl DataStore { // references, collecting the regions and region snapshots that were // freed up for deletion. - let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); + let num_read_write_subvolumes = match count_read_write_sub_volumes(&vcr) + { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + SoftDeleteTransactionError::InvalidVolume(format!( + "volume {} invalid: {e}", + volume.id() + )), + )); + } + }; let mut regions: Vec = Vec::with_capacity( REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, @@ -3305,8 +3321,10 @@ pub fn read_write_resources_associated_with_volume( } /// Return the number of read-write subvolumes in a VolumeConstructionRequest. -pub fn count_read_write_sub_volumes(vcr: &VolumeConstructionRequest) -> usize { - match vcr { +pub fn count_read_write_sub_volumes( + vcr: &VolumeConstructionRequest, +) -> anyhow::Result { + Ok(match vcr { VolumeConstructionRequest::Volume { sub_volumes, .. } => { sub_volumes.len() } @@ -3317,11 +3335,11 @@ pub fn count_read_write_sub_volumes(vcr: &VolumeConstructionRequest) -> usize { // We don't support a pure Region VCR at the volume // level in the database, so this choice should // never be encountered. 
- panic!("Region not supported as a top level volume"); + bail!("Region not supported as a top level volume"); } VolumeConstructionRequest::File { .. } => 0, - } + }) } /// Returns true if the sub-volumes of a Volume are all read-only @@ -3351,7 +3369,7 @@ pub fn volume_is_read_only( // We don't support a pure Region VCR at the volume // level in the database, so this choice should // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); + bail!("Region not supported as a top level volume"); } VolumeConstructionRequest::File { .. } => { @@ -3663,7 +3681,16 @@ impl DataStore { // validate all read/write resources still exist - let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); + let num_read_write_subvolumes = match count_read_write_sub_volumes(&vcr) + { + Ok(v) => v, + Err(e) => { + return Err(Self::volume_invariant_violated(format!( + "volume {} had error: {e}", + volume.id(), + ))); + } + }; let mut read_write_targets = Vec::with_capacity( REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, From ce56ca3e7e3a5d614d2f95a21db6d1256def99f4 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 24 Oct 2024 18:30:20 +0000 Subject: [PATCH 21/24] no lines longer than 80! pull more transaction bodies into "in_txn" functions --- nexus/db-queries/src/db/datastore/volume.rs | 2257 ++++++++++--------- 1 file changed, 1143 insertions(+), 1114 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 66f94d5c72..716d7c858a 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -116,8 +116,68 @@ enum RegionType { ReadOnly, } +#[derive(Debug, thiserror::Error)] +enum RemoveRopError { + #[error("Error removing read-only parent: {0}")] + Public(Error), + + #[error("Serde error removing read-only parent: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + + #[error("Address parsing error during ROP removal: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), +} + +#[derive(Debug, thiserror::Error)] +enum ReplaceRegionError { + #[error("Error from Volume region replacement: {0}")] + Public(Error), + + #[error("Serde error during Volume region replacement: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Region replacement error: {0}")] + RegionReplacementError(#[from] anyhow::Error), +} + +#[derive(Debug, thiserror::Error)] +enum ReplaceSnapshotError { + #[error("Error from Volume snapshot replacement: {0}")] + Public(Error), + + #[error("Serde error during Volume snapshot replacement: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Snapshot replacement error: {0}")] + SnapshotReplacementError(#[from] anyhow::Error), + + #[error("Replaced {0} targets, expected {1}")] + UnexpectedReplacedTargets(usize, usize), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + + #[error( + "Address parsing error during Volume snapshot \ + replacement: {0}" + )] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), + + #[error("Multiple volume resource usage records for {0}")] + MultipleResourceUsageRecords(String), +} + impl DataStore { - async fn volume_create_txn( + async fn volume_create_in_txn( 
conn: &async_bb8_diesel::Connection, err: OptionalError, volume: Volume, @@ -310,7 +370,7 @@ impl DataStore { let crucible_targets = crucible_targets.clone(); let volume = volume.clone(); async move { - Self::volume_create_txn( + Self::volume_create_in_txn( &conn, err, volume, @@ -503,9 +563,14 @@ impl DataStore { match volume_is_read_only(&vcr) { Ok(read_only) => { if !read_only { - return Err(VolumeGetError::CheckoutConditionFailed( - String::from("Non-read-only Volume Checkout for use Copy!") - )); + return Err( + VolumeGetError::CheckoutConditionFailed( + String::from( + "Non-read-only Volume Checkout \ + for use Copy!", + ), + ), + ); } Ok(()) @@ -544,49 +609,49 @@ impl DataStore { let runtime = instance.runtime(); match (runtime.propolis_id, runtime.dst_propolis_id) { (Some(_), Some(_)) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} is undergoing migration", - vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} is undergoing \ + migration", + vmm_id, + instance.id(), + ))) } (None, None) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} has no propolis ids", - vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} has no \ + propolis ids", + vmm_id, + instance.id(), + ))) } (Some(propolis_id), None) => { if propolis_id != vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} propolis id {} mismatch", + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance {} propolis \ + id {} mismatch", vmm_id, instance.id(), propolis_id, - ) - )); + ), + ), + ); } Ok(()) } (None, Some(dst_propolis_id)) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} has no propolis id but dst propolis id {}", - vmm_id, - instance.id(), - dst_propolis_id, - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} has no \ + propolis id but dst propolis id {}", + vmm_id, + instance.id(), + dst_propolis_id, + ))) } } } @@ -608,60 +673,67 @@ impl DataStore { let runtime = instance.runtime(); match (runtime.propolis_id, runtime.dst_propolis_id) { (Some(propolis_id), Some(dst_propolis_id)) => { - if propolis_id != vmm_id.into_untyped_uuid() || dst_propolis_id != target_vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} propolis id mismatches {} {}", - vmm_id, - target_vmm_id, - instance.id(), - propolis_id, - dst_propolis_id, - ) - )); + if propolis_id != vmm_id.into_untyped_uuid() + || dst_propolis_id + != target_vmm_id.into_untyped_uuid() + { + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} \ + propolis id mismatches {} {}", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + dst_propolis_id, + ), + ), + ); } Ok(()) } (None, None) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} has no propolis ids", - vmm_id, - target_vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceMigrate {} {}: instance {} has no \ + propolis ids", + vmm_id, + target_vmm_id, + instance.id(), + ))) } (Some(propolis_id), None) => { // XXX is this right? 
if propolis_id != vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} propolis id {} mismatch", - vmm_id, - target_vmm_id, - instance.id(), - propolis_id, - ) - )); + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} \ + propolis id {} mismatch", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + ), + ), + ); } Ok(()) } (None, Some(dst_propolis_id)) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} has no propolis id but dst propolis id {}", - vmm_id, - target_vmm_id, - instance.id(), - dst_propolis_id, - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceMigrate {} {}: instance {} has no \ + propolis id but dst propolis id {}", + vmm_id, + target_vmm_id, + instance.id(), + dst_propolis_id, + ))) } } } @@ -697,10 +769,11 @@ impl DataStore { // XXX this is a Nexus bug! return Err(VolumeGetError::CheckoutConditionFailed( format!( - "Pantry: instance {} backing disk {} does not exist?", + "Pantry: instance {} backing disk {} does not \ + exist?", attach_instance_id, disk.id(), - ) + ), )); }; @@ -723,6 +796,181 @@ impl DataStore { } } + async fn volume_checkout_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: Uuid, + reason: VolumeCheckoutReason, + ) -> Result { + use db::schema::volume::dsl; + + // Grab the volume in question. + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(conn) + .await?; + + // Turn the volume.data into the VolumeConstructionRequest + let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()) + .map_err(|e| err.bail(VolumeGetError::SerdeError(e)))?; + + // The VolumeConstructionRequest resulting from this checkout will have + // its generation numbers bumped, and as result will (if it has + // non-read-only sub-volumes) take over from previous read/write + // activations when sent to a place that will `construct` a new Volume. + // Depending on the checkout reason, prevent creating multiple + // read/write Upstairs acting on the same Volume, except where the take + // over is intended. + + let (maybe_disk, maybe_instance) = { + use db::schema::disk::dsl as disk_dsl; + use db::schema::instance::dsl as instance_dsl; + + let maybe_disk: Option = disk_dsl::disk + .filter(disk_dsl::time_deleted.is_null()) + .filter(disk_dsl::volume_id.eq(volume_id)) + .select(Disk::as_select()) + .get_result_async(conn) + .await + .optional()?; + + let maybe_instance: Option = + if let Some(disk) = &maybe_disk { + if let Some(attach_instance_id) = + disk.runtime().attach_instance_id + { + instance_dsl::instance + .filter(instance_dsl::time_deleted.is_null()) + .filter(instance_dsl::id.eq(attach_instance_id)) + .select(Instance::as_select()) + .get_result_async(conn) + .await + .optional()? + } else { + // Disk not attached to an instance + None + } + } else { + // Volume not associated with disk + None + }; + + (maybe_disk, maybe_instance) + }; + + if let Err(e) = Self::volume_checkout_allowed( + &reason, + &vcr, + maybe_disk, + maybe_instance, + ) + .await + { + return Err(err.bail(e)); + } + + // Look to see if the VCR is a Volume type, and if so, look at its + // sub_volumes. If they are of type Region, then we need to update their + // generation numbers and record that update back to the database. 
We + // return to the caller whatever the original volume data was we pulled + // from the database. + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + let mut update_needed = false; + let mut new_sv = Vec::new(); + for sv in sub_volumes { + match sv { + VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen, + } => { + update_needed = true; + new_sv.push(VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen: gen + 1, + }); + } + _ => { + new_sv.push(sv); + } + } + } + + // Only update the volume data if we found the type of volume + // that needed it. + if update_needed { + // Create a new VCR and fill in the contents from what the + // original volume had, but with our updated sub_volume + // records. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes: new_sv, + read_only_parent, + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(VolumeGetError::SerdeError(e)))?; + + // Update the original volume_id with the new volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await?; + + // This should update just one row. If it does not, then + // something is terribly wrong in the database. + if num_updated != 1 { + return Err(err.bail( + VolumeGetError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + } + } + VolumeConstructionRequest::Region { + block_size: _, + blocks_per_extent: _, + extent_count: _, + opts: _, + gen: _, + } => { + // We don't support a pure Region VCR at the volume level in the + // database, so this choice should never be encountered, but I + // want to know if it is. + return Err(err.bail(VolumeGetError::InvalidVolume( + String::from("Region not supported as a top level volume"), + ))); + } + VolumeConstructionRequest::File { + id: _, + block_size: _, + path: _, + } + | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => + {} + } + + Ok(volume) + } + /// Checkout a copy of the Volume from the database. /// This action (getting a copy) will increase the generation number /// of Volumes of the VolumeConstructionRequest::Volume type that have @@ -734,8 +982,6 @@ impl DataStore { volume_id: Uuid, reason: VolumeCheckoutReason, ) -> LookupResult { - use db::schema::volume::dsl; - // We perform a transaction here, to be sure that on completion // of this, the database contains an updated version of the // volume with the generation number incremented (for the volume @@ -749,178 +995,8 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - // Grab the volume in question. - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await?; - - // Turn the volume.data into the VolumeConstructionRequest - let vcr: VolumeConstructionRequest = - serde_json::from_str(volume.data()).map_err(|e| { - err.bail(VolumeGetError::SerdeError(e)) - })?; - - // The VolumeConstructionRequest resulting from this checkout will have its - // generation numbers bumped, and as result will (if it has non-read-only - // sub-volumes) take over from previous read/write activations when sent to a - // place that will `construct` a new Volume. 
Depending on the checkout reason, - // prevent creating multiple read/write Upstairs acting on the same Volume, - // except where the take over is intended. - - let (maybe_disk, maybe_instance) = { - use db::schema::instance::dsl as instance_dsl; - use db::schema::disk::dsl as disk_dsl; - - let maybe_disk: Option = disk_dsl::disk - .filter(disk_dsl::time_deleted.is_null()) - .filter(disk_dsl::volume_id.eq(volume_id)) - .select(Disk::as_select()) - .get_result_async(&conn) - .await - .optional()?; - - let maybe_instance: Option = if let Some(disk) = &maybe_disk { - if let Some(attach_instance_id) = disk.runtime().attach_instance_id { - instance_dsl::instance - .filter(instance_dsl::time_deleted.is_null()) - .filter(instance_dsl::id.eq(attach_instance_id)) - .select(Instance::as_select()) - .get_result_async(&conn) - .await - .optional()? - } else { - // Disk not attached to an instance - None - } - } else { - // Volume not associated with disk - None - }; - - (maybe_disk, maybe_instance) - }; - - if let Err(e) = Self::volume_checkout_allowed( - &reason, - &vcr, - maybe_disk, - maybe_instance, - ) - .await { - return Err(err.bail(e)); - } - - // Look to see if the VCR is a Volume type, and if so, look at - // its sub_volumes. If they are of type Region, then we need - // to update their generation numbers and record that update - // back to the database. We return to the caller whatever the - // original volume data was we pulled from the database. - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - let mut update_needed = false; - let mut new_sv = Vec::new(); - for sv in sub_volumes { - match sv { - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen, - } => { - update_needed = true; - new_sv.push( - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen: gen + 1, - }, - ); - } - _ => { - new_sv.push(sv); - } - } - } - - // Only update the volume data if we found the type - // of volume that needed it. - if update_needed { - // Create a new VCR and fill in the contents - // from what the original volume had, but with our - // updated sub_volume records. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes: new_sv, - read_only_parent, - }; - - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - err.bail(VolumeGetError::SerdeError(e)) - })?; - - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; - - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(err.bail( - VolumeGetError::UnexpectedDatabaseUpdate( - num_updated, - 1, - ), - )); - } - } - } - VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _, - } => { - // We don't support a pure Region VCR at the volume - // level in the database, so this choice should - // never be encountered, but I want to know if it is. 
- return Err(err.bail(VolumeGetError::InvalidVolume( - String::from("Region not supported as a top level volume") - ))); - } - VolumeConstructionRequest::File { - id: _, - block_size: _, - path: _, - } - | VolumeConstructionRequest::Url { - id: _, - block_size: _, - url: _, - } => {} - } - Ok(volume) + Self::volume_checkout_in_txn(&conn, err, volume_id, reason) + .await } }) .await @@ -932,7 +1008,10 @@ impl DataStore { } _ => { - return Error::internal_error(&format!("Transaction error: {}", err)); + return Error::internal_error(&format!( + "Transaction error: {}", + err + )); } } } @@ -977,11 +1056,12 @@ impl DataStore { VolumeConstructionRequest::Region { opts, .. } => { if !opts.read_only { // Only one volume can "own" a Region, and that volume's - // UUID is recorded in the region table accordingly. It is - // an error to make a copy of a volume construction request - // that references non-read-only Regions. + // UUID is recorded in the region table accordingly. It + // is an error to make a copy of a volume construction + // request that references non-read-only Regions. bail!( - "only one Volume can reference a Region non-read-only!" + "only one Volume can reference a Region \ + non-read-only!" ); } @@ -1035,13 +1115,13 @@ impl DataStore { let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("find_deleted_volume_regions") .transaction(&conn, |conn| async move { - Self::find_deleted_volume_regions_txn(&conn).await + Self::find_deleted_volume_regions_in_txn(&conn).await }) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - async fn find_deleted_volume_regions_txn( + async fn find_deleted_volume_regions_in_txn( conn: &async_bb8_diesel::Connection, ) -> Result)>, diesel::result::Error> { @@ -1051,7 +1131,7 @@ impl DataStore { use db::schema::volume::dsl as volume_dsl; // Find all read-write regions (read-only region cleanup is taken care - // of in soft_delete_volume_txn!) and their associated datasets + // of in soft_delete_volume_in_txn!) and their associated datasets let unfiltered_deleted_regions = region_dsl::region .filter(region_dsl::read_only.eq(false)) // the volume may be hard deleted, so use a left join here @@ -1153,7 +1233,7 @@ impl DataStore { } #[derive(Debug, thiserror::Error)] -enum SoftDeleteTransactionError { +enum SoftDeleteError { #[error("Serde error decreasing Crucible resources: {0}")] SerdeError(#[from] serde_json::Error), @@ -1173,10 +1253,10 @@ enum SoftDeleteTransactionError { impl DataStore { // See comment for `soft_delete_volume` - async fn soft_delete_volume_txn( + async fn soft_delete_volume_in_txn( conn: &async_bb8_diesel::Connection, volume_id: Uuid, - err: OptionalError, + err: OptionalError, ) -> Result { // Grab the volume, and check if the volume was already soft-deleted. // We have to guard against the case where this function is called @@ -1248,18 +1328,15 @@ impl DataStore { // references, collecting the regions and region snapshots that were // freed up for deletion. 
- let num_read_write_subvolumes = match count_read_write_sub_volumes(&vcr) - { - Ok(v) => v, - Err(e) => { - return Err(err.bail( - SoftDeleteTransactionError::InvalidVolume(format!( - "volume {} invalid: {e}", - volume.id() - )), - )); - } - }; + let num_read_write_subvolumes = + match count_read_write_sub_volumes(&vcr) { + Ok(v) => v, + Err(e) => { + return Err(err.bail(SoftDeleteError::InvalidVolume( + format!("volume {} invalid: {e}", volume.id()), + ))); + } + }; let mut regions: Vec = Vec::with_capacity( REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, @@ -1280,20 +1357,18 @@ impl DataStore { ); for target in read_write_targets { - let target = target.parse().map_err(|e| { - err.bail(SoftDeleteTransactionError::AddressParseError(e)) - })?; + let target = target + .parse() + .map_err(|e| err.bail(SoftDeleteError::AddressParseError(e)))?; let maybe_region = Self::target_to_region(conn, &target, RegionType::ReadWrite) .await?; let Some(region) = maybe_region else { - return Err(err.bail( - SoftDeleteTransactionError::CouldNotFindResource(format!( - "could not find resource for {target}" - )), - )); + return Err(err.bail(SoftDeleteError::CouldNotFindResource( + format!("could not find resource for {target}"), + ))); }; // Filter out regions that have any region-snapshots @@ -1314,9 +1389,9 @@ impl DataStore { for read_only_target in &crucible_targets.read_only_targets { use db::schema::volume_resource_usage::dsl as ru_dsl; - let read_only_target = read_only_target.parse().map_err(|e| { - err.bail(SoftDeleteTransactionError::AddressParseError(e)) - })?; + let read_only_target = read_only_target + .parse() + .map_err(|e| err.bail(SoftDeleteError::AddressParseError(e)))?; let maybe_usage = Self::read_only_target_to_volume_resource_usage( conn, @@ -1325,11 +1400,9 @@ impl DataStore { .await?; let Some(usage) = maybe_usage else { - return Err(err.bail( - SoftDeleteTransactionError::CouldNotFindResource(format!( - "could not find resource for {read_only_target}" - )), - )); + return Err(err.bail(SoftDeleteError::CouldNotFindResource( + format!("could not find resource for {read_only_target}"), + ))); }; // For each read-only resource, remove the associated volume @@ -1351,10 +1424,10 @@ impl DataStore { if updated_rows != 1 { return Err(err.bail( - SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + SoftDeleteError::UnexpectedDatabaseUpdate( updated_rows, "volume_resource_usage (region)".into(), - ) + ), )); } @@ -1408,10 +1481,10 @@ impl DataStore { if updated_rows != 1 { return Err(err.bail( - SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + SoftDeleteError::UnexpectedDatabaseUpdate( updated_rows, "setting deleting (region)".into(), - ) + ), )); } @@ -1449,11 +1522,12 @@ impl DataStore { if updated_rows != 1 { return Err(err.bail( - SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + SoftDeleteError::UnexpectedDatabaseUpdate( updated_rows, "volume_resource_usage \ - (region_snapshot)".into(), - ) + (region_snapshot)" + .into(), + ), )); } @@ -1482,30 +1556,27 @@ impl DataStore { if region_snapshot_usage_left == 0 { // Don't forget to set `deleting`! 
see: omicron#4095 - { - use db::schema::region_snapshot::dsl; - let updated_rows = - diesel::update(dsl::region_snapshot) - .filter(dsl::dataset_id.eq(dataset_id)) - .filter(dsl::region_id.eq(region_id)) - .filter(dsl::snapshot_id.eq(snapshot_id)) - .filter( - dsl::snapshot_addr - .eq(read_only_target.to_string()), - ) - .filter(dsl::deleting.eq(false)) - .set(dsl::deleting.eq(true)) - .execute_async(conn) - .await?; - - if updated_rows != 1 { - return Err(err.bail( - SoftDeleteTransactionError::UnexpectedDatabaseUpdate( - updated_rows, - "setting deleting (region snapshot)".into(), - ) - )); - } + use db::schema::region_snapshot::dsl; + let updated_rows = diesel::update(dsl::region_snapshot) + .filter(dsl::dataset_id.eq(dataset_id)) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::snapshot_id.eq(snapshot_id)) + .filter( + dsl::snapshot_addr + .eq(read_only_target.to_string()), + ) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting (region snapshot)".into(), + ), + )); } region_snapshots.push(RegionSnapshotV3 { @@ -1540,7 +1611,7 @@ impl DataStore { if updated_rows != 1 { return Err(err.bail( - SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + SoftDeleteError::UnexpectedDatabaseUpdate( updated_rows, "volume".into(), ), @@ -1569,7 +1640,7 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - Self::soft_delete_volume_txn(&conn, volume_id, err).await + Self::soft_delete_volume_in_txn(&conn, volume_id, err).await } }) .await @@ -1582,257 +1653,238 @@ impl DataStore { }) } - // Here we remove the read only parent from volume_id, and attach it - // to temp_volume_id. - // - // As this is part of a saga, it will be able to handle being replayed - // If we call this twice, any work done the first time through should - // not happen again, or be undone. - pub async fn volume_remove_rop( - &self, + async fn volume_remove_rop_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, volume_id: Uuid, temp_volume_id: Uuid, - ) -> Result { - #[derive(Debug, thiserror::Error)] - enum RemoveReadOnlyParentError { - #[error("Error removing read-only parent: {0}")] - Public(Error), + ) -> Result { + // Grab the volume in question. If the volume record was already deleted + // then we can just return. + let volume = { + use db::schema::volume::dsl; - #[error("Serde error removing read-only parent: {0}")] - SerdeError(#[from] serde_json::Error), + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(conn) + .await + .optional()?; - #[error("Updated {0} database rows, expected {1}")] - UnexpectedDatabaseUpdate(usize, usize), + let volume = if let Some(v) = volume { + v + } else { + // the volume does not exist, nothing to do. + return Ok(false); + }; - #[error("Address parsing error during ROP removal: {0}")] - AddressParseError(#[from] AddrParseError), + if volume.time_deleted.is_some() { + // this volume is deleted, so let whatever is deleting it clean + // it up. + return Ok(false); + } else { + // A volume record exists, and was not deleted, we can attempt + // to remove its read_only_parent. + volume + } + }; - #[error("Could not match read-only resource to {0}")] - CouldNotFindResource(String), - } + // If a read_only_parent exists, remove it from volume_id, and attach it + // to temp_volume_id. 
+ let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; - // In this single transaction: - // - Get the given volume from the volume_id from the database - // - Extract the volume.data into a VolumeConstructionRequest (VCR) - // - Create a new VCR, copying over anything from the original VCR, - // but, replacing the read_only_parent with None. - // - Put the new VCR into volume.data, then update the volume in the - // database. - // - Get the given volume from temp_volume_id from the database - // - Extract the temp volume.data into a VCR - // - Create a new VCR, copying over anything from the original VCR, - // but, replacing the read_only_parent with the read_only_parent - // data from original volume_id. - // - Put the new temp VCR into the temp volume.data, update the - // temp_volume in the database. - let err = OptionalError::new(); - let conn = self.pool_connection_unauthorized().await?; - self.transaction_retry_wrapper("volume_remove_rop") - .transaction(&conn, |conn| { - let err = err.clone(); - async move { - // Grab the volume in question. If the volume record was - // already deleted then we can just return. - let volume = { - use db::schema::volume::dsl; - - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await - .optional()?; + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + if read_only_parent.is_none() { + // This volume has no read_only_parent + Ok(false) + } else { + // Create a new VCR and fill in the contents from what the + // original volume had. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent: None, + }; - let volume = if let Some(v) = volume { - v - } else { - // the volume does not exist, nothing to do. - return Ok(false); - }; + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; - if volume.time_deleted.is_some() { - // this volume is deleted, so let whatever is - // deleting it clean it up. - return Ok(false); - } else { - // A volume record exists, and was not deleted, we - // can attempt to remove its read_only_parent. - volume - } + // Update the original volume_id with the new volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await?; + + // This should update just one row. If it does not, then + // something is terribly wrong in the database. + if num_updated != 1 { + return Err(err.bail( + RemoveRopError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + + // Make a new VCR, with the information from our + // temp_volume_id, but the read_only_parent from the + // original volume. + let rop_vcr = VolumeConstructionRequest::Volume { + id: temp_volume_id, + block_size, + sub_volumes: vec![], + read_only_parent, }; - // If a read_only_parent exists, remove it from volume_id, - // and attach it to temp_volume_id. - let vcr: VolumeConstructionRequest = - serde_json::from_str( - volume.data() + let rop_volume_data = serde_json::to_string(&rop_vcr) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; + + // Update the temp_volume_id with the volume data that + // contains the read_only_parent. 
+ let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(temp_volume_id)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(rop_volume_data)) + .execute_async(conn) + .await?; + + if num_updated != 1 { + return Err(err.bail( + RemoveRopError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + + // Update the volume resource usage record for every + // read-only resource in the ROP + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &rop_vcr, + &mut crucible_targets, + ); + crucible_targets + }; + + for read_only_target in crucible_targets.read_only_targets { + let read_only_target = + read_only_target.parse().map_err(|e| { + err.bail(RemoveRopError::AddressParseError(e)) + })?; + + let maybe_usage = + Self::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; + + let Some(usage) = maybe_usage else { + return Err(err.bail( + RemoveRopError::CouldNotFindResource(format!( + "could not find resource for \ + {read_only_target}" + )), + )); + }; + + Self::swap_volume_usage_records_for_resources( + conn, + usage, + volume_id, + temp_volume_id, ) + .await .map_err(|e| { - err.bail( - RemoveReadOnlyParentError::SerdeError( - e, + err.bail_retryable_or_else(e, |e| { + RemoveRopError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ), ) - ) + }) })?; + } - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - if read_only_parent.is_none() { - // This volume has no read_only_parent - Ok(false) - } else { - // Create a new VCR and fill in the contents - // from what the original volume had. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent: None, - }; - - let new_volume_data = - serde_json::to_string( - &new_vcr - ) - .map_err(|e| { - err.bail(RemoveReadOnlyParentError::SerdeError( - e, - )) - })?; - - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; - - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); - } - - // Make a new VCR, with the information from - // our temp_volume_id, but the read_only_parent - // from the original volume. - let rop_vcr = VolumeConstructionRequest::Volume { - id: temp_volume_id, - block_size, - sub_volumes: vec![], - read_only_parent, - }; - - let rop_volume_data = - serde_json::to_string( - &rop_vcr - ) - .map_err(|e| { - err.bail(RemoveReadOnlyParentError::SerdeError( - e, - )) - })?; - - // Update the temp_volume_id with the volume - // data that contains the read_only_parent. 
- let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(temp_volume_id)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(rop_volume_data)) - .execute_async(&conn) - .await?; - - if num_updated != 1 { - return Err(err.bail( - RemoveReadOnlyParentError::UnexpectedDatabaseUpdate( - num_updated, - 1 - ) - )); - } - - // Update the volume resource usage record for - // every read-only resource in the ROP - let crucible_targets = { - let mut crucible_targets = CrucibleTargets::default(); - read_only_resources_associated_with_volume( - &rop_vcr, - &mut crucible_targets, - ); - crucible_targets - }; - - for read_only_target in crucible_targets.read_only_targets { - let read_only_target = read_only_target - .parse() - .map_err(|e| err.bail( - RemoveReadOnlyParentError::AddressParseError(e) - ))?; - - let maybe_usage = Self::read_only_target_to_volume_resource_usage( - &conn, - &read_only_target, - ) - .await?; - - let Some(usage) = maybe_usage else { - return Err(err.bail( - RemoveReadOnlyParentError::CouldNotFindResource(format!( - "could not find resource for {read_only_target}" - )), - )); - }; - - Self::swap_volume_usage_records_for_resources( - &conn, - usage, - volume_id, - temp_volume_id, - ) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - RemoveReadOnlyParentError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; - } + // After read-only parent removal, validate invariants for + // all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; - // After read-only parent removal, validate - // invariants for all volumes - #[cfg(any(test, feature = "testing"))] - Self::validate_volume_invariants(&conn).await?; + Ok(true) + } + } - Ok(true) - } - } + VolumeConstructionRequest::File { .. } + | VolumeConstructionRequest::Region { .. } + | VolumeConstructionRequest::Url { .. } => { + // Volume has a format that does not contain ROPs + Ok(false) + } + } + } - VolumeConstructionRequest::File { .. } - | VolumeConstructionRequest::Region { .. } - | VolumeConstructionRequest::Url { .. } => { - // Volume has a format that does not contain ROPs - Ok(false) - } - } + // Here we remove the read only parent from volume_id, and attach it + // to temp_volume_id. + // + // As this is part of a saga, it will be able to handle being replayed + // If we call this twice, any work done the first time through should + // not happen again, or be undone. + pub async fn volume_remove_rop( + &self, + volume_id: Uuid, + temp_volume_id: Uuid, + ) -> Result { + // In this single transaction: + // - Get the given volume from the volume_id from the database + // - Extract the volume.data into a VolumeConstructionRequest (VCR) + // - Create a new VCR, copying over anything from the original VCR, + // but, replacing the read_only_parent with None. + // - Put the new VCR into volume.data, then update the volume in the + // database. + // - Get the given volume from temp_volume_id from the database + // - Extract the temp volume.data into a VCR + // - Create a new VCR, copying over anything from the original VCR, + // but, replacing the read_only_parent with the read_only_parent + // data from original volume_id. + // - Put the new temp VCR into the temp volume.data, update the + // temp_volume in the database. 
+ let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("volume_remove_rop") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::volume_remove_rop_in_txn( + &conn, + err, + volume_id, + temp_volume_id, + ) + .await } }) .await .map_err(|e| { if let Some(err) = err.take() { - return Error::internal_error(&format!("Transaction error: {}", err)); + return Error::internal_error(&format!( + "Transaction error: {}", + err + )); } public_error_from_diesel(e, ErrorHandler::Server) }) @@ -1917,9 +1969,9 @@ impl DataStore { if mismatched_record_type_count > 0 { return Err(err.bail(Error::conflict(&format!( - "existing repair type for id {} does not match {:?}!", - record.repair_id, - record.repair_type, + "existing repair type for id {} does not \ + match {:?}!", + record.repair_id, record.repair_type, )))); } @@ -1993,6 +2045,54 @@ impl DataStore { }) } + async fn upstairs_repair_progress_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + upstairs_id: TypedUuid, + repair_id: TypedUuid, + repair_progress: RepairProgress, + ) -> Result<(), diesel::result::Error> { + use db::schema::upstairs_repair_notification::dsl as notification_dsl; + use db::schema::upstairs_repair_progress::dsl; + + // Check that there is a repair id for the upstairs id + let matching_repair: Option = + notification_dsl::upstairs_repair_notification + .filter( + notification_dsl::repair_id + .eq(nexus_db_model::to_db_typed_uuid(repair_id)), + ) + .filter( + notification_dsl::upstairs_id + .eq(nexus_db_model::to_db_typed_uuid(upstairs_id)), + ) + .filter( + notification_dsl::notification_type + .eq(UpstairsRepairNotificationType::Started), + ) + .get_result_async(conn) + .await + .optional()?; + + if matching_repair.is_none() { + return Err(err.bail(Error::non_resourcetype_not_found(&format!( + "upstairs {upstairs_id} repair {repair_id} not found" + )))); + } + + diesel::insert_into(dsl::upstairs_repair_progress) + .values(UpstairsRepairProgress { + repair_id: repair_id.into(), + time: repair_progress.time, + current_item: repair_progress.current_item, + total_items: repair_progress.total_items, + }) + .execute_async(conn) + .await?; + + Ok(()) + } + /// Record Upstairs repair progress pub async fn upstairs_repair_progress( &self, @@ -2001,9 +2101,6 @@ impl DataStore { repair_id: TypedUuid, repair_progress: RepairProgress, ) -> Result<(), Error> { - use db::schema::upstairs_repair_notification::dsl as notification_dsl; - use db::schema::upstairs_repair_progress::dsl; - let conn = self.pool_connection_authorized(opctx).await?; let err = OptionalError::new(); @@ -2013,33 +2110,14 @@ impl DataStore { let err = err.clone(); async move { - // Check that there is a repair id for the upstairs id - let matching_repair: Option = - notification_dsl::upstairs_repair_notification - .filter(notification_dsl::repair_id.eq(nexus_db_model::to_db_typed_uuid(repair_id))) - .filter(notification_dsl::upstairs_id.eq(nexus_db_model::to_db_typed_uuid(upstairs_id))) - .filter(notification_dsl::notification_type.eq(UpstairsRepairNotificationType::Started)) - .get_result_async(&conn) - .await - .optional()?; - - if matching_repair.is_none() { - return Err(err.bail(Error::non_resourcetype_not_found(&format!( - "upstairs {upstairs_id} repair {repair_id} not found" - )))); - } - - diesel::insert_into(dsl::upstairs_repair_progress) - .values(UpstairsRepairProgress { - repair_id: repair_id.into(), - time: repair_progress.time, - current_item: 
repair_progress.current_item, - total_items: repair_progress.total_items, - }) - .execute_async(&conn) - .await?; - - Ok(()) + Self::upstairs_repair_progress_in_txn( + &conn, + err, + upstairs_id, + repair_id, + repair_progress, + ) + .await } }) .await @@ -2495,6 +2573,7 @@ fn read_only_target_in_vcr( Ok(false) } +#[derive(Clone)] pub struct VolumeReplacementParams { pub volume_id: Uuid, pub region_id: Uuid, @@ -2532,12 +2611,12 @@ pub enum VolumeReplaceResult { } impl DataStore { - /// Replace a read-write region in a Volume with a new region. - pub async fn volume_replace_region( - &self, + async fn volume_replace_region_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, existing: VolumeReplacementParams, replacement: VolumeReplacementParams, - ) -> Result { + ) -> Result { // In a single transaction: // // - set the existing region's volume id to the replacement's volume id @@ -2628,196 +2707,490 @@ impl DataStore { // referenced (via its socket address) in NEW_VOL. For an example, this // is done as part of the region replacement start saga. - #[derive(Debug, thiserror::Error)] - enum VolumeReplaceRegionError { - #[error("Error from Volume region replacement: {0}")] - Public(Error), + // Grab the old volume first + let maybe_old_volume = { + volume_dsl::volume + .filter(volume_dsl::id.eq(existing.volume_id)) + .select(Volume::as_select()) + .first_async::(conn) + .await + .optional() + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? + }; + + let old_volume = if let Some(old_volume) = maybe_old_volume { + old_volume + } else { + // Existing volume was hard-deleted, so return here. We can't + // perform the region replacement now, and this will short-circuit + // the rest of the process. - #[error("Serde error during Volume region replacement: {0}")] - SerdeError(#[from] serde_json::Error), + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + }; - #[error("Region replacement error: {0}")] - RegionReplacementError(#[from] anyhow::Error), + if old_volume.time_deleted.is_some() { + // Existing volume was soft-deleted, so return here for the same + // reason: the region replacement process should be short-circuited + // now. + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); } + + let old_vcr: VolumeConstructionRequest = + match serde_json::from_str(&old_volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(err.bail(ReplaceRegionError::SerdeError(e))); + } + }; + + // Does it look like this replacement already happened? 
+ let old_region_in_vcr = + match region_in_vcr(&old_vcr, &existing.region_addr) { + Ok(v) => v, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; + let new_region_in_vcr = + match region_in_vcr(&old_vcr, &replacement.region_addr) { + Ok(v) => v, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; + + if !old_region_in_vcr && new_region_in_vcr { + // It does seem like the replacement happened + return Ok(VolumeReplaceResult::AlreadyHappened); + } + + use db::schema::region::dsl as region_dsl; + use db::schema::volume::dsl as volume_dsl; + + // Set the existing region's volume id to the replacement's volume id + diesel::update(region_dsl::region) + .filter(region_dsl::id.eq(existing.region_id)) + .set(region_dsl::volume_id.eq(replacement.volume_id)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // Set the replacement region's volume id to the existing's volume id + diesel::update(region_dsl::region) + .filter(region_dsl::id.eq(replacement.region_id)) + .set(region_dsl::volume_id.eq(existing.volume_id)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // Update the existing volume's construction request to replace the + // existing region's SocketAddrV6 with the replacement region's + + // Copy the old volume's VCR, changing out the old region for the new. + let new_vcr = match replace_region_in_vcr( + &old_vcr, + existing.region_addr, + replacement.region_addr, + ) { + Ok(new_vcr) => new_vcr, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(ReplaceRegionError::SerdeError(e)))?; + + // Update the existing volume's data + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(existing.volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // After region replacement, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; + + Ok(VolumeReplaceResult::Done) + } + + /// Replace a read-write region in a Volume with a new region. + pub async fn volume_replace_region( + &self, + existing: VolumeReplacementParams, + replacement: VolumeReplacementParams, + ) -> Result { let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("volume_replace_region") .transaction(&conn, |conn| { let err = err.clone(); + let existing = existing.clone(); + let replacement = replacement.clone(); async move { - // Grab the old volume first - let maybe_old_volume = { - volume_dsl::volume - .filter(volume_dsl::id.eq(existing.volume_id)) - .select(Volume::as_select()) - .first_async::(&conn) - .await - .optional() - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })? 
- }; + Self::volume_replace_region_in_txn( + &conn, + err, + existing, + replacement, + ) + .await + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + ReplaceRegionError::Public(e) => e, - let old_volume = if let Some(old_volume) = maybe_old_volume { - old_volume - } else { - // Existing volume was hard-deleted, so return here. We - // can't perform the region replacement now, and this - // will short-circuit the rest of the process. + ReplaceRegionError::SerdeError(_) => { + Error::internal_error(&err.to_string()) + } + + ReplaceRegionError::RegionReplacementError(_) => { + Error::internal_error(&err.to_string()) + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + + async fn volume_replace_snapshot_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: VolumeWithTarget, + existing: ExistingTarget, + replacement: ReplacementTarget, + volume_to_delete_id: VolumeToDelete, + ) -> Result { + use db::schema::volume::dsl as volume_dsl; + use db::schema::volume_resource_usage::dsl as ru_dsl; + + // Grab the old volume first + let maybe_old_volume = { + volume_dsl::volume + .filter(volume_dsl::id.eq(volume_id.0)) + .select(Volume::as_select()) + .first_async::(conn) + .await + .optional() + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? + }; + + let old_volume = if let Some(old_volume) = maybe_old_volume { + old_volume + } else { + // Existing volume was hard-deleted, so return here. We can't + // perform the region replacement now, and this will short-circuit + // the rest of the process. + + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + }; + + if old_volume.time_deleted.is_some() { + // Existing volume was soft-deleted, so return here for the same + // reason: the region replacement process should be short-circuited + // now. + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + } + + let old_vcr: VolumeConstructionRequest = + match serde_json::from_str(&old_volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(err.bail(ReplaceSnapshotError::SerdeError(e))); + } + }; + + // Does it look like this replacement already happened? + let old_target_in_vcr = + match read_only_target_in_vcr(&old_vcr, &existing.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + ReplaceSnapshotError::SnapshotReplacementError(e), + )); + } + }; + + let new_target_in_vcr = + match read_only_target_in_vcr(&old_vcr, &replacement.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + ReplaceSnapshotError::SnapshotReplacementError(e), + )); + } + }; + + if !old_target_in_vcr && new_target_in_vcr { + // It does seem like the replacement happened + return Ok(VolumeReplaceResult::AlreadyHappened); + } + + // Update the existing volume's construction request to replace the + // existing target's SocketAddrV6 with the replacement target's + + // Copy the old volume's VCR, changing out the old target for the new. + let (new_vcr, replacements) = match replace_read_only_target_in_vcr( + &old_vcr, + existing, + replacement, + ) { + Ok(new_vcr) => new_vcr, + Err(e) => { + return Err( + err.bail(ReplaceSnapshotError::SnapshotReplacementError(e)) + ); + } + }; + + // Expect that this only happened once. If it happened multiple times, + // question everything: how would a snapshot be used twice?! 
+ + if replacements != 1 { + return Err(err.bail( + ReplaceSnapshotError::UnexpectedReplacedTargets( + replacements, + 1, + ), + )); + } + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(ReplaceSnapshotError::SerdeError(e)))?; + + // Update the existing volume's data + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id.0)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // Make a new VCR that will stash the target to delete. The values here + // don't matter, just that it gets fed into the volume_delete machinery + // later. + let vcr = VolumeConstructionRequest::Volume { + id: volume_to_delete_id.0, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: sled_agent_client::types::CrucibleOpts { + id: volume_to_delete_id.0, + target: vec![existing.0.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&vcr) + .map_err(|e| err.bail(ReplaceSnapshotError::SerdeError(e)))?; + + // Update the volume to delete data + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_to_delete_id.0)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(volume_data)) + .execute_async(conn) + .await?; - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - }; + if num_updated != 1 { + return Err(err.bail( + ReplaceSnapshotError::UnexpectedDatabaseUpdate(num_updated, 1), + )); + } - if old_volume.time_deleted.is_some() { - // Existing volume was soft-deleted, so return here for - // the same reason: the region replacement process - // should be short-circuited now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - } + // Update the appropriate volume resource usage records - it could + // either be a read-only region or a region snapshot, so determine what + // it is first - let old_vcr: VolumeConstructionRequest = - match serde_json::from_str(&old_volume.data()) { - Ok(vcr) => vcr, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::SerdeError(e))); - }, - }; + let maybe_existing_usage = + Self::read_only_target_to_volume_resource_usage(conn, &existing.0) + .await?; - // Does it look like this replacement already happened? 
- let old_region_in_vcr = match region_in_vcr(&old_vcr, &existing.region_addr) { - Ok(v) => v, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::RegionReplacementError(e))); - }, - }; - let new_region_in_vcr = match region_in_vcr(&old_vcr, &replacement.region_addr) { - Ok(v) => v, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::RegionReplacementError(e))); - }, - }; + let Some(existing_usage) = maybe_existing_usage else { + return Err(err.bail(ReplaceSnapshotError::CouldNotFindResource( + format!("could not find resource for {}", existing.0,), + ))); + }; - if !old_region_in_vcr && new_region_in_vcr { - // It does seem like the replacement happened - return Ok(VolumeReplaceResult::AlreadyHappened); - } + // The "existing" target moved into the volume to delete - use db::schema::region::dsl as region_dsl; - use db::schema::volume::dsl as volume_dsl; + Self::swap_volume_usage_records_for_resources( + conn, + existing_usage, + volume_id.0, + volume_to_delete_id.0, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; - // Set the existing region's volume id to the replacement's - // volume id - diesel::update(region_dsl::region) - .filter(region_dsl::id.eq(existing.region_id)) - .set(region_dsl::volume_id.eq(replacement.volume_id)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + let maybe_replacement_usage = + Self::read_only_target_to_volume_resource_usage( + conn, + &replacement.0, + ) + .await?; - // Set the replacement region's volume id to the existing's - // volume id - diesel::update(region_dsl::region) - .filter(region_dsl::id.eq(replacement.region_id)) - .set(region_dsl::volume_id.eq(existing.volume_id)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + let Some(replacement_usage) = maybe_replacement_usage else { + return Err(err.bail(ReplaceSnapshotError::CouldNotFindResource( + format!("could not find resource for {}", existing.0,), + ))); + }; - // Update the existing volume's construction request to - // replace the existing region's SocketAddrV6 with the - // replacement region's - - // Copy the old volume's VCR, changing out the old region - // for the new. 
- let new_vcr = match replace_region_in_vcr( - &old_vcr, - existing.region_addr, - replacement.region_addr, - ) { - Ok(new_vcr) => new_vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceRegionError::RegionReplacementError(e) - )); - } - }; + // The intention leaving this transaction is that the correct volume + // resource usage records exist, so: + // + // - if no usage record existed for the replacement usage, then create a + // new record that points to the volume id (this can happen if the + // volume to delete was blank when coming into this function) + // + // - if records exist for the "replacement" usage, then one of those + // will match the volume to delete id, so perform a swap instead to + // the volume id - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - err.bail(VolumeReplaceRegionError::SerdeError(e)) - })?; + let existing_replacement_volume_usage_records = + Self::volume_usage_records_for_resource_query( + replacement_usage.clone(), + ) + .load_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? + // TODO be smart enough to .filter the above query + .into_iter() + .filter(|record| record.volume_id == volume_to_delete_id.0) + .count(); - // Update the existing volume's data - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(existing.volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + // The "replacement" target moved into the volume - // After region replacement, validate invariants for all - // volumes - #[cfg(any(test, feature = "testing"))] - Self::validate_volume_invariants(&conn).await?; + if existing_replacement_volume_usage_records == 0 { + // No matching record + let new_record = + VolumeResourceUsageRecord::new(volume_id.0, replacement_usage); - Ok(VolumeReplaceResult::Done) - } - }) + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(new_record) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + } else if existing_replacement_volume_usage_records == 1 { + // One matching record: perform swap + Self::swap_volume_usage_records_for_resources( + conn, + replacement_usage, + volume_to_delete_id.0, + volume_id.0, + ) .await .map_err(|e| { - if let Some(err) = err.take() { - match err { - VolumeReplaceRegionError::Public(e) => e, + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + } else { + // More than one matching record! 
+ return Err(err.bail( + ReplaceSnapshotError::MultipleResourceUsageRecords(format!( + "{replacement_usage:?}" + )), + )); + } - VolumeReplaceRegionError::SerdeError(_) => { - Error::internal_error(&err.to_string()) - } + // After region snapshot replacement, validate invariants for all + // volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; - VolumeReplaceRegionError::RegionReplacementError(_) => { - Error::internal_error(&err.to_string()) - } - } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + Ok(VolumeReplaceResult::Done) } /// Replace a read-only target in a Volume with a new region @@ -2843,35 +3216,6 @@ impl DataStore { replacement: ReplacementTarget, volume_to_delete_id: VolumeToDelete, ) -> Result { - #[derive(Debug, thiserror::Error)] - enum VolumeReplaceSnapshotError { - #[error("Error from Volume snapshot replacement: {0}")] - Public(Error), - - #[error("Serde error during Volume snapshot replacement: {0}")] - SerdeError(#[from] serde_json::Error), - - #[error("Snapshot replacement error: {0}")] - SnapshotReplacementError(#[from] anyhow::Error), - - #[error("Replaced {0} targets, expected {1}")] - UnexpectedReplacedTargets(usize, usize), - - #[error("Updated {0} database rows, expected {1}")] - UnexpectedDatabaseUpdate(usize, usize), - - #[error( - "Address parsing error during Volume snapshot \ - replacement: {0}" - )] - AddressParseError(#[from] AddrParseError), - - #[error("Could not match read-only resource to {0}")] - CouldNotFindResource(String), - - #[error("Multiple volume resource usage records for {0}")] - MultipleResourceUsageRecords(String), - } let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; @@ -2880,354 +3224,38 @@ impl DataStore { let err = err.clone(); async move { - use db::schema::volume::dsl as volume_dsl; - use db::schema::volume_resource_usage::dsl as ru_dsl; - - // Grab the old volume first - let maybe_old_volume = { - volume_dsl::volume - .filter(volume_dsl::id.eq(volume_id.0)) - .select(Volume::as_select()) - .first_async::(&conn) - .await - .optional() - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })? - }; - - let old_volume = if let Some(old_volume) = maybe_old_volume { - old_volume - } else { - // Existing volume was hard-deleted, so return here. We - // can't perform the region replacement now, and this - // will short-circuit the rest of the process. - - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - }; - - if old_volume.time_deleted.is_some() { - // Existing volume was soft-deleted, so return here for - // the same reason: the region replacement process - // should be short-circuited now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - } - - let old_vcr: VolumeConstructionRequest = - match serde_json::from_str(&old_volume.data()) { - Ok(vcr) => vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SerdeError(e) - )); - }, - }; - - // Does it look like this replacement already happened? 
- let old_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &existing.0) { - Ok(v) => v, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - }, - }; - - let new_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &replacement.0) { - Ok(v) => v, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - }, - }; - - if !old_target_in_vcr && new_target_in_vcr { - // It does seem like the replacement happened - return Ok(VolumeReplaceResult::AlreadyHappened); - } - - // Update the existing volume's construction request to - // replace the existing target's SocketAddrV6 with the - // replacement target's - - // Copy the old volume's VCR, changing out the old target - // for the new. - let (new_vcr, replacements) = match replace_read_only_target_in_vcr( - &old_vcr, + Self::volume_replace_snapshot_in_txn( + &conn, + err, + volume_id, existing, replacement, - ) { - Ok(new_vcr) => new_vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - } - }; - - // Expect that this only happened once. If it happened - // multiple times, question everything: how would a snapshot - // be used twice?! - - if replacements != 1 { - return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedReplacedTargets( - replacements, 1, - ) - )); - } - - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; - - // Update the existing volume's data - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id.0)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; - - // Make a new VCR that will stash the target to delete. The - // values here don't matter, just that it gets fed into the - // volume_delete machinery later. 
- let vcr = VolumeConstructionRequest::Volume { - id: volume_to_delete_id.0, - block_size: 512, - sub_volumes: vec![ - VolumeConstructionRequest::Region { - block_size: 512, - blocks_per_extent: 1, - extent_count: 1, - gen: 1, - opts: sled_agent_client::types::CrucibleOpts { - id: volume_to_delete_id.0, - target: vec![ - existing.0.to_string(), - ], - lossy: false, - flush_timeout: None, - key: None, - cert_pem: None, - key_pem: None, - root_cert_pem: None, - control: None, - read_only: true, - }, - } - ], - read_only_parent: None, - }; - - let volume_data = serde_json::to_string(&vcr) - .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; - - // Update the volume to delete data - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_to_delete_id.0)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(volume_data)) - .execute_async(&conn) - .await?; - - if num_updated != 1 { - return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( - num_updated, 1, - ) - )); - } - - // Update the appropriate volume resource usage records - it - // could either be a read-only region or a region snapshot, - // so determine what it is first - - let maybe_existing_usage = Self::read_only_target_to_volume_resource_usage( - &conn, - &existing.0, - ) - .await?; - - let Some(existing_usage) = maybe_existing_usage else { - return Err(err.bail( - VolumeReplaceSnapshotError::CouldNotFindResource( - format!( - "could not find resource for {}", - existing.0, - ) - )) - ); - }; - - // The "existing" target moved into the volume to delete - - Self::swap_volume_usage_records_for_resources( - &conn, - existing_usage, - volume_id.0, - volume_to_delete_id.0, + volume_to_delete_id, ) .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; - - let maybe_replacement_usage = - Self::read_only_target_to_volume_resource_usage( - &conn, - &replacement.0, - ) - .await?; - - let Some(replacement_usage) = maybe_replacement_usage else { - return Err(err.bail( - VolumeReplaceSnapshotError::CouldNotFindResource( - format!( - "could not find resource for {}", - existing.0, - ) - )) - ); - }; - - // The intention leaving this transaction is that the - // correct volume resource usage records exist, so: - // - // - if no usage record existed for the replacement usage, - // then create a new record that points to the volume - // id (this can happen if the volume to delete was blank - // when coming into this function) - // - // - if records exist for the "replacement" usage, then one - // of those will match the volume to delete id, so perform - // a swap instead to the volume id - - let existing_replacement_volume_usage_records = - Self::volume_usage_records_for_resource_query( - replacement_usage.clone(), - ) - .load_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })? 
- // TODO be smart enough to .filter the above query - .into_iter() - .filter(|record| record.volume_id == volume_to_delete_id.0) - .count(); - - // The "replacement" target moved into the volume - - if existing_replacement_volume_usage_records == 0 { - // No matching record - let new_record = VolumeResourceUsageRecord::new( - volume_id.0, - replacement_usage, - ); - - diesel::insert_into(ru_dsl::volume_resource_usage) - .values(new_record) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; - } else if existing_replacement_volume_usage_records == 1 { - // One matching record: perform swap - Self::swap_volume_usage_records_for_resources( - &conn, - replacement_usage, - volume_to_delete_id.0, - volume_id.0, - ) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; - } else { - // More than one matching record! - return Err(err.bail( - VolumeReplaceSnapshotError::MultipleResourceUsageRecords( - format!("{replacement_usage:?}") - ) - )); - } - - // After region snapshot replacement, validate invariants - // for all volumes - #[cfg(any(test, feature = "testing"))] - Self::validate_volume_invariants(&conn).await?; - - Ok(VolumeReplaceResult::Done) } }) .await .map_err(|e| { if let Some(err) = err.take() { match err { - VolumeReplaceSnapshotError::Public(e) => e, - - VolumeReplaceSnapshotError::SerdeError(_) | - VolumeReplaceSnapshotError::SnapshotReplacementError(_) | - VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) | - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) | - VolumeReplaceSnapshotError::AddressParseError(_) | - VolumeReplaceSnapshotError::CouldNotFindResource(_) | - VolumeReplaceSnapshotError::MultipleResourceUsageRecords(_) => { - Error::internal_error(&err.to_string()) - } + ReplaceSnapshotError::Public(e) => e, + + ReplaceSnapshotError::SerdeError(_) + | ReplaceSnapshotError::SnapshotReplacementError(_) + | ReplaceSnapshotError::UnexpectedReplacedTargets( + _, + _, + ) + | ReplaceSnapshotError::UnexpectedDatabaseUpdate( + _, + _, + ) + | ReplaceSnapshotError::AddressParseError(_) + | ReplaceSnapshotError::CouldNotFindResource(_) + | ReplaceSnapshotError::MultipleResourceUsageRecords( + _, + ) => Error::internal_error(&err.to_string()), } } else { public_error_from_diesel(e, ErrorHandler::Server) @@ -3761,7 +3789,7 @@ impl DataStore { } /// Assert that read-only regions do not have any associated region - /// snapshots (see associated comment in `soft_delete_volume_txn`) + /// snapshots (see associated comment in `soft_delete_volume_in_txn`) async fn validate_read_only_region_has_no_snapshots( conn: &async_bb8_diesel::Connection, region: Region, @@ -3839,8 +3867,8 @@ mod tests { .await .unwrap(); - // Add old CrucibleResources json in the `resources_to_clean_up` column - - // this was before the `deleting` column / field was added to + // Add old CrucibleResources json in the `resources_to_clean_up` column + // - this was before the `deleting` column / field was added to // ResourceSnapshot. 
{ @@ -4012,7 +4040,8 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - region_addresses[0].clone(), // target to replace + // target to replace + region_addresses[0].clone(), region_addresses[1].clone(), region_addresses[2].clone(), ], From f52fa636ffe9481c0846c1297bfd9d57b67e5032 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Tue, 5 Nov 2024 21:57:23 +0000 Subject: [PATCH 22/24] expand on different old_region_in_vcr + new_region_in_vcr cases --- nexus/db-queries/src/db/datastore/volume.rs | 59 ++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index ce0dfc072a..4fdbc94e29 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -31,6 +31,7 @@ use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::DbConnection; use crate::transaction_retry::OptionalError; +use anyhow::anyhow; use anyhow::bail; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -2771,8 +2772,64 @@ impl DataStore { }; if !old_region_in_vcr && new_region_in_vcr { - // It does seem like the replacement happened + // It does seem like the replacement happened - if this function is + // called twice in a row then this can happen. return Ok(VolumeReplaceResult::AlreadyHappened); + } else if old_region_in_vcr && !new_region_in_vcr { + // The replacement hasn't happened yet, but can proceed + } else if old_region_in_vcr && new_region_in_vcr { + // Both the old region and new region exist in this VCR. Regions are + // not reused, so this is an illegal state: if the replacement of + // the old region occurred, then the new region would be present + // multiple times in the volume. We have to bail out here. + // + // The guards against this happening are: + // + // - only one replacement can occur for a volume at a time (due to + // the volume repair lock), and + // + // - region replacement does not delete the old region until the + // "region replacement finish" saga, which happens at the very end + // of the process. If it eagerly deleted the region, the crucible + // agent would be free to reuse the port for another region + // allocation, and an identical target (read: ip and port) could + // be confusing. Most of the time, we assume that the dataset + // containing that agent has been expunged, so the agent is gone, + // so this port reuse cannot occur + return Err(err.bail(ReplaceRegionError::RegionReplacementError( + anyhow!("old_region_in_vcr && new_region_in_vcr"), + ))); + } else if !old_region_in_vcr && !new_region_in_vcr { + // Neither the region we've been asked to replace or the new region + // is in the VCR. This is an illegal state, as this function would + // be performing a no-op. We have to bail out here. 
+ // + // The guard against this happening is again that only one + // replacement can occur for a volume at a time: if it was possible + // for multiple region replacements to occur, then both would be + // attempting to swap out the same old region for different new + // regions: + // + // region replacement one: + // + // volume_replace_region_in_txn( + // .., + // existing = [fd00:1122:3344:145::10]:40001, + // replacement = [fd00:1122:3344:322::4]:3956, + // ) + // + // region replacement two: + // + // volume_replace_region_in_txn( + // .., + // existing = [fd00:1122:3344:145::10]:40001, + // replacement = [fd00:1122:3344:fd1::123]:27001, + // ) + // + // The one that replaced second would always land in this branch. + return Err(err.bail(ReplaceRegionError::RegionReplacementError( + anyhow!("!old_region_in_vcr && !new_region_in_vcr"), + ))); } use db::schema::region::dsl as region_dsl; From ff590d6264d5686bed90a9674cfe63ac88fd5555 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 7 Nov 2024 16:15:02 +0000 Subject: [PATCH 23/24] retain previous find_deleted_volume_regions behaviour: only operate on soft-deleted volumes --- nexus/db-queries/src/db/datastore/volume.rs | 56 +++++++++++++++++++-- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 4fdbc94e29..4cd83fff3f 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -1174,13 +1174,13 @@ impl DataStore { for (dataset, region, region_snapshot, volume) in unfiltered_deleted_regions { - // only operate on soft or hard deleted volumes - let deleted = match &volume { + // only operate on soft deleted volumes + let soft_deleted = match &volume { Some(volume) => volume.time_deleted.is_some(), - None => true, + None => false, }; - if !deleted { + if !soft_deleted { continue; } @@ -5389,4 +5389,52 @@ mod tests { } ); } + + /// Assert that there are no "deleted" r/w regions found when the associated + /// volume hasn't been created yet. + #[tokio::test] + async fn test_no_find_deleted_region_for_no_volume() { + let logctx = + dev::test_setup_log("test_no_find_deleted_region_for_no_volume"); + let log = logctx.log.new(o!()); + let db = TestDatabase::new_with_datastore(&log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let _test_datasets = TestDatasets::create( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; + + let volume_id = Uuid::new_v4(); + + // Assert that allocating regions without creating the volume does not + // cause them to be returned as "deleted" regions, as this can cause + // sagas that allocate regions to race with the volume delete saga and + // cause premature region deletion. 
+ + let _datasets_and_regions = datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); + + let deleted_regions = datastore + .find_deleted_volume_regions() + .await + .expect("find_deleted_volume_regions"); + + assert!(deleted_regions.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } } From 2b511b17d082fcbca0cb853721fd04402c89da49 Mon Sep 17 00:00:00 2001 From: James MacMahon Date: Thu, 7 Nov 2024 21:53:35 +0000 Subject: [PATCH 24/24] non-jank idempotent query to populate volume_resource_usage --- .../crdb/crucible-ref-count-records/up08.sql | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/schema/crdb/crucible-ref-count-records/up08.sql b/schema/crdb/crucible-ref-count-records/up08.sql index a19559b6b9..49374301ec 100644 --- a/schema/crdb/crucible-ref-count-records/up08.sql +++ b/schema/crdb/crucible-ref-count-records/up08.sql @@ -2,15 +2,7 @@ snapshot records to populate the volume resource usage records */ INSERT INTO volume_resource_usage ( SELECT - ( - OVERLAY( - OVERLAY( - MD5(volume.id::TEXT || dataset_id::TEXT || region_id::TEXT || snapshot_id::TEXT || snapshot_addr || volume_references::TEXT) - PLACING '4' from 13 - ) - PLACING '8' from 17 - )::uuid - ) as usage_id, + gen_random_uuid() AS usage_id, volume.id AS volume_id, 'region_snapshot' AS usage_type, NULL AS region_id, @@ -22,10 +14,18 @@ INSERT INTO volume_resource_usage ( JOIN region_snapshot ON - volume.data like '%' || region_snapshot.snapshot_addr || '%' + volume.data like ('%' || region_snapshot.snapshot_addr || '%') + LEFT JOIN + volume_resource_usage + ON + volume_resource_usage.volume_id = volume.id AND + volume_resource_usage.usage_type = 'region_snapshot' AND + volume_resource_usage.region_id IS NULL AND + volume_resource_usage.region_snapshot_region_id = region_snapshot.region_id AND + volume_resource_usage.region_snapshot_dataset_id = region_snapshot.dataset_id AND + volume_resource_usage.region_snapshot_snapshot_id = region_snapshot.snapshot_id WHERE volume.time_deleted is NULL AND - region_snapshot.deleting = false -) -ON CONFLICT (usage_id) DO NOTHING -; + region_snapshot.deleting = false AND + volume_resource_usage.usage_id IS NULL +);
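Note on the up08.sql change above: idempotency now comes from the LEFT JOIN anti-join on the natural key of the usage record rather than from the MD5-derived `usage_id` plus `ON CONFLICT`, so `gen_random_uuid()` is safe to use for the surrogate key. A minimal sketch of that insert-if-absent shape, using simplified hypothetical `usage` and `source` tables rather than the real schema:

    /*
     * Sketch only: `usage` and `source` stand in for volume_resource_usage
     * and the volume/region_snapshot join. Rows are inserted only when no
     * row with the same natural key already exists, so re-running the
     * statement adds nothing, even though usage_id is random on each run.
     */
    INSERT INTO usage (usage_id, volume_id, snapshot_id)
    SELECT
        gen_random_uuid(),
        source.volume_id,
        source.snapshot_id
    FROM
        source
    LEFT JOIN
        usage
    ON
        usage.volume_id = source.volume_id AND
        usage.snapshot_id = source.snapshot_id
    WHERE
        usage.usage_id IS NULL;

Re-running the migration after a partial or repeated application therefore inserts no duplicate rows.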
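More generally, the volume.rs changes in this series follow one shape throughout: the body of each retryable transaction moves into an associated `*_in_txn` function that takes the connection plus an `OptionalError`, the public method becomes a thin `transaction_retry_wrapper` call that clones its parameters for each attempt (hence the new `#[derive(Clone)]` on `VolumeReplacementParams`), and `err.take()` afterwards separates stashed domain errors from retryable database errors. The toy below illustrates only that control flow; `Params`, `OptionalErrorSlot`, `run_with_retries`, and `replace_in_txn` are stand-ins invented for the sketch, not the real Nexus types.

    // Self-contained toy, not the omicron APIs: a shared error slot lets the
    // transaction body report a non-retryable domain failure that the caller
    // can tell apart from a generic (retryable) database error.
    use std::sync::{Arc, Mutex};

    #[derive(Clone)]
    struct Params {
        volume_id: u64,
    }

    #[derive(Clone, Default)]
    struct OptionalErrorSlot(Arc<Mutex<Option<String>>>);

    impl OptionalErrorSlot {
        // Stash the domain error and hand back something to fail the attempt.
        fn bail(&self, msg: &str) -> String {
            *self.0.lock().unwrap() = Some(msg.to_string());
            format!("non-retryable: {msg}")
        }

        fn is_set(&self) -> bool {
            self.0.lock().unwrap().is_some()
        }

        fn take(&self) -> Option<String> {
            self.0.lock().unwrap().take()
        }
    }

    // Stand-in for transaction_retry_wrapper: rerun the body on retryable
    // failures, but stop as soon as a domain error has been stashed.
    fn run_with_retries<F>(
        attempts: usize,
        err: &OptionalErrorSlot,
        mut body: F,
    ) -> Result<(), String>
    where
        F: FnMut() -> Result<(), String>,
    {
        let mut last = Err(String::from("never ran"));
        for _ in 0..attempts {
            last = body();
            if last.is_ok() || err.is_set() {
                break;
            }
        }
        last
    }

    // Stand-in for a *_in_txn body: bail with a domain error when the volume
    // it was asked to operate on no longer exists.
    fn replace_in_txn(
        err: &OptionalErrorSlot,
        params: Params,
    ) -> Result<(), String> {
        if params.volume_id == 0 {
            return Err(err.bail("existing volume was deleted"));
        }
        Ok(())
    }

    fn main() {
        let err = OptionalErrorSlot::default();
        let params = Params { volume_id: 0 };

        // Each attempt gets its own copy of the inputs, mirroring the
        // existing.clone() / replacement.clone() calls in the diff above.
        let result = run_with_retries(3, &err, || {
            replace_in_txn(&err, params.clone())
        });

        // A stashed domain error takes precedence over whatever generic
        // error the final attempt returned.
        match err.take() {
            Some(domain) => println!("domain error: {domain}"),
            None => println!("database error: {result:?}"),
        }
    }

With `volume_id: 0` the toy prints the stashed domain error; with a nonzero id the body succeeds on the first attempt.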