diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 9c43d0d670..605cd9384e 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -2564,26 +2564,22 @@ async fn cmd_db_region_find_deleted( struct Row { dataset_id: Uuid, region_id: Uuid, - region_snapshot_id: String, - volume_id: Uuid, + volume_id: String, } let rows: Vec = datasets_regions_volumes .into_iter() .map(|row| { - let (dataset, region, region_snapshot, volume) = row; + let (dataset, region, volume) = row; Row { dataset_id: dataset.id(), region_id: region.id(), - region_snapshot_id: if let Some(region_snapshot) = - region_snapshot - { - region_snapshot.snapshot_id.to_string() + volume_id: if let Some(volume) = volume { + volume.id().to_string() } else { String::from("") }, - volume_id: volume.id(), } }) .collect(); diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 9137b0b3c3..66723e0b18 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -109,6 +109,7 @@ mod vmm; mod vni; mod volume; mod volume_repair; +mod volume_resource_usage; mod vpc; mod vpc_firewall_rule; mod vpc_route; @@ -219,6 +220,7 @@ pub use vmm_state::*; pub use vni::*; pub use volume::*; pub use volume_repair::*; +pub use volume_resource_usage::*; pub use vpc::*; pub use vpc_firewall_rule::*; pub use vpc_route::*; diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index e38c5543ef..02c7db8120 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -46,6 +46,10 @@ pub struct Region { // A region may be read-only read_only: bool, + + // Shared read-only regions require a "deleting" flag to avoid a + // use-after-free scenario + deleting: bool, } impl Region { @@ -67,6 +71,7 @@ impl Region { extent_count: extent_count as i64, port: Some(port.into()), read_only, + deleting: false, } } @@ -99,4 +104,7 @@ impl Region { pub fn read_only(&self) -> bool { self.read_only } + pub fn deleting(&self) -> bool { + self.deleting + } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 8a97e43639..6ca7171793 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1075,6 +1075,8 @@ table! { port -> Nullable, read_only -> Bool, + + deleting -> Bool, } } @@ -2087,3 +2089,19 @@ joinable!(instance_ssh_key -> instance (instance_id)); allow_tables_to_appear_in_same_query!(sled, sled_instance); joinable!(network_interface -> probe (parent_id)); + +table! { + volume_resource_usage (usage_id) { + usage_id -> Uuid, + + volume_id -> Uuid, + + usage_type -> crate::VolumeResourceUsageTypeEnum, + + region_id -> Nullable, + + region_snapshot_dataset_id -> Nullable, + region_snapshot_region_id -> Nullable, + region_snapshot_snapshot_id -> Nullable, + } +} diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 14eba1cb1c..70450a7776 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. 
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(113, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(114, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
     // |  leaving the first copy as an example for the next person.
     // v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(114, "crucible-ref-count-records"),
     KnownVersion::new(113, "add-tx-eq"),
     KnownVersion::new(112, "blueprint-dataset"),
     KnownVersion::new(111, "drop-omicron-zone-underlay-address"),
diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs
new file mode 100644
index 0000000000..c55b053c62
--- /dev/null
+++ b/nexus/db-model/src/volume_resource_usage.rs
@@ -0,0 +1,142 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use super::impl_enum_type;
+use crate::schema::volume_resource_usage;
+use uuid::Uuid;
+
+impl_enum_type!(
+    #[derive(SqlType, Debug, QueryId)]
+    #[diesel(
+        postgres_type(name = "volume_resource_usage_type", schema = "public")
+    )]
+    pub struct VolumeResourceUsageTypeEnum;
+
+    #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq, Eq, Hash)]
+    #[diesel(sql_type = VolumeResourceUsageTypeEnum)]
+    pub enum VolumeResourceUsageType;
+
+    ReadOnlyRegion => b"read_only_region"
+    RegionSnapshot => b"region_snapshot"
+);
+
+/// Crucible volumes are created by layering read-write regions over a hierarchy
+/// of read-only resources. Originally only a region snapshot could be used as a
+/// read-only resource for a volume. With the introduction of read-only regions
+/// (created during the region snapshot replacement process) this is no longer
+/// true.
+///
+/// Read-only resources can be used by many volumes, and because of this they
+/// need to have a reference count so they can be deleted when they're not
+/// referenced anymore. The region_snapshot table used a `volume_references`
+/// column, which counts how many uses there are. The region table does not have
+/// this column, and moreover a simple integer works for reference counting but
+/// does not tell you _what_ volume that use is from. This can be determined
+/// (see omdb's validate volume references command) but it's information that is
+/// tossed out, as Nexus knows what volumes use what resources! Instead of
+/// throwing away that knowledge and only incrementing and decrementing an
+/// integer, record what read-only resources a volume uses in this table.
+///
+/// Note: users should not use this object directly, and instead use the
+/// [`VolumeResourceUsage`] enum, which is type-safe and will convert to and
+/// from a [`VolumeResourceUsageRecord`] when interacting with the DB.
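As a rough sketch of the record/enum round trip the doc comment above describes (the struct, `new()`, and `TryFrom` impls follow immediately below in this file; the IDs here are made up and the `nexus_db_model` crate path is an assumption, not part of this patch):

    use nexus_db_model::{VolumeResourceUsage, VolumeResourceUsageRecord};
    use uuid::Uuid;

    fn round_trip_example() {
        let volume_id = Uuid::new_v4();

        // Record that `volume_id` uses one region snapshot (hypothetical IDs).
        let usage = VolumeResourceUsage::RegionSnapshot {
            dataset_id: Uuid::new_v4(),
            region_id: Uuid::new_v4(),
            snapshot_id: Uuid::new_v4(),
        };
        let record = VolumeResourceUsageRecord::new(volume_id, usage);

        // Converting back is fallible: TryFrom checks that the nullable
        // columns matching the record's usage_type are actually populated.
        let usage = VolumeResourceUsage::try_from(record)
            .expect("record should convert back to a usage");
        let _ = usage;
    }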
+#[derive(
+    Queryable, Insertable, Debug, Clone, Selectable, PartialEq, Eq, Hash,
+)]
+#[diesel(table_name = volume_resource_usage)]
+pub struct VolumeResourceUsageRecord {
+    pub usage_id: Uuid,
+
+    pub volume_id: Uuid,
+
+    pub usage_type: VolumeResourceUsageType,
+
+    pub region_id: Option<Uuid>,
+
+    pub region_snapshot_dataset_id: Option<Uuid>,
+    pub region_snapshot_region_id: Option<Uuid>,
+    pub region_snapshot_snapshot_id: Option<Uuid>,
+}
+
+#[derive(Debug, Clone)]
+pub enum VolumeResourceUsage {
+    ReadOnlyRegion { region_id: Uuid },
+
+    RegionSnapshot { dataset_id: Uuid, region_id: Uuid, snapshot_id: Uuid },
+}
+
+impl VolumeResourceUsageRecord {
+    pub fn new(volume_id: Uuid, usage: VolumeResourceUsage) -> Self {
+        match usage {
+            VolumeResourceUsage::ReadOnlyRegion { region_id } => {
+                VolumeResourceUsageRecord {
+                    usage_id: Uuid::new_v4(),
+                    volume_id,
+                    usage_type: VolumeResourceUsageType::ReadOnlyRegion,
+
+                    region_id: Some(region_id),
+
+                    region_snapshot_dataset_id: None,
+                    region_snapshot_region_id: None,
+                    region_snapshot_snapshot_id: None,
+                }
+            }
+
+            VolumeResourceUsage::RegionSnapshot {
+                dataset_id,
+                region_id,
+                snapshot_id,
+            } => VolumeResourceUsageRecord {
+                usage_id: Uuid::new_v4(),
+                volume_id,
+                usage_type: VolumeResourceUsageType::RegionSnapshot,
+
+                region_id: None,
+
+                region_snapshot_dataset_id: Some(dataset_id),
+                region_snapshot_region_id: Some(region_id),
+                region_snapshot_snapshot_id: Some(snapshot_id),
+            },
+        }
+    }
+}
+
+impl TryFrom<VolumeResourceUsageRecord> for VolumeResourceUsage {
+    type Error = String;
+
+    fn try_from(
+        record: VolumeResourceUsageRecord,
+    ) -> Result<Self, Self::Error> {
+        match record.usage_type {
+            VolumeResourceUsageType::ReadOnlyRegion => {
+                let Some(region_id) = record.region_id else {
+                    return Err("valid read-only region usage record".into());
+                };
+
+                Ok(VolumeResourceUsage::ReadOnlyRegion { region_id })
+            }
+
+            VolumeResourceUsageType::RegionSnapshot => {
+                let Some(dataset_id) = record.region_snapshot_dataset_id else {
+                    return Err("valid region snapshot usage record".into());
+                };
+
+                let Some(region_id) = record.region_snapshot_region_id else {
+                    return Err("valid region snapshot usage record".into());
+                };
+
+                let Some(snapshot_id) = record.region_snapshot_snapshot_id
+                else {
+                    return Err("valid region snapshot usage record".into());
+                };
+
+                Ok(VolumeResourceUsage::RegionSnapshot {
+                    dataset_id,
+                    region_id,
+                    snapshot_id,
+                })
+            }
+        }
+    }
+}
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index 5344ffc1d9..059d43b8c7 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -811,7 +811,7 @@ mod test {
     }
 
     #[derive(Debug)]
-    struct TestDatasets {
+    pub(crate) struct TestDatasets {
         // eligible and ineligible aren't currently used, but are probably handy
         // for the future.
#[allow(dead_code)] @@ -828,7 +828,7 @@ mod test { type SledToDatasetMap = HashMap>; impl TestDatasets { - async fn create( + pub(crate) async fn create( opctx: &OpContext, datastore: Arc, num_eligible_sleds: usize, diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index df733af5b5..da32494d6f 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -264,7 +264,7 @@ impl DataStore { block_size, blocks_per_extent, extent_count, - read_only: false, + read_only: maybe_snapshot_id.is_some(), }, allocation_strategy, num_regions_required, @@ -362,6 +362,12 @@ impl DataStore { ) .execute_async(&conn).await?; } + + // Whenever a region is hard-deleted, validate invariants + // for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(()) } }) diff --git a/nexus/db-queries/src/db/datastore/region_snapshot.rs b/nexus/db-queries/src/db/datastore/region_snapshot.rs index 242560a415..4a5db14bd6 100644 --- a/nexus/db-queries/src/db/datastore/region_snapshot.rs +++ b/nexus/db-queries/src/db/datastore/region_snapshot.rs @@ -64,14 +64,25 @@ impl DataStore { ) -> DeleteResult { use db::schema::region_snapshot::dsl; - diesel::delete(dsl::region_snapshot) + let conn = self.pool_connection_unauthorized().await?; + + let result = diesel::delete(dsl::region_snapshot) .filter(dsl::dataset_id.eq(dataset_id)) .filter(dsl::region_id.eq(region_id)) .filter(dsl::snapshot_id.eq(snapshot_id)) - .execute_async(&*self.pool_connection_unauthorized().await?) + .execute_async(&*conn) .await .map(|_rows_deleted| ()) - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)); + + // Whenever a region snapshot is hard-deleted, validate invariants for + // all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + result } /// Find region snapshots on expunged disks diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 9d4900e2a4..b9f6b589d7 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -322,4 +322,23 @@ impl DataStore { .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + /// Get a snapshot, returning None if it does not exist (instead of a + /// NotFound error). 
+ pub async fn snapshot_get( + &self, + opctx: &OpContext, + snapshot_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_authorized(opctx).await?; + + use db::schema::snapshot::dsl; + dsl::snapshot + .filter(dsl::id.eq(snapshot_id)) + .select(Snapshot::as_select()) + .first_async(&*conn) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 88b40fcf64..4cd83fff3f 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -7,6 +7,8 @@ use super::DataStore; use crate::db; use crate::db::datastore::OpContext; +use crate::db::datastore::RunnableQuery; +use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; @@ -22,11 +24,14 @@ use crate::db::model::UpstairsRepairNotification; use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::UpstairsRepairProgress; use crate::db::model::Volume; +use crate::db::model::VolumeResourceUsage; +use crate::db::model::VolumeResourceUsageRecord; +use crate::db::model::VolumeResourceUsageType; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; -use crate::db::queries::volume::*; use crate::db::DbConnection; use crate::transaction_retry::OptionalError; +use anyhow::anyhow; use anyhow::bail; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -53,6 +58,7 @@ use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; use std::collections::VecDeque; +use std::net::AddrParseError; use std::net::SocketAddr; use std::net::SocketAddrV6; use uuid::Uuid; @@ -91,19 +97,252 @@ enum VolumeGetError { InvalidVolume(String), } +#[derive(Debug, thiserror::Error)] +enum VolumeCreationError { + #[error("Error from Volume creation: {0}")] + Public(Error), + + #[error("Serde error during Volume creation: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Address parsing error during Volume creation: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(SocketAddrV6), +} + +enum RegionType { + ReadWrite, + ReadOnly, +} + +#[derive(Debug, thiserror::Error)] +enum RemoveRopError { + #[error("Error removing read-only parent: {0}")] + Public(Error), + + #[error("Serde error removing read-only parent: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + + #[error("Address parsing error during ROP removal: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), +} + +#[derive(Debug, thiserror::Error)] +enum ReplaceRegionError { + #[error("Error from Volume region replacement: {0}")] + Public(Error), + + #[error("Serde error during Volume region replacement: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Region replacement error: {0}")] + RegionReplacementError(#[from] anyhow::Error), +} + +#[derive(Debug, thiserror::Error)] +enum ReplaceSnapshotError { + #[error("Error from Volume snapshot replacement: {0}")] + Public(Error), + + #[error("Serde error during Volume snapshot replacement: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Snapshot replacement error: {0}")] + 
SnapshotReplacementError(#[from] anyhow::Error), + + #[error("Replaced {0} targets, expected {1}")] + UnexpectedReplacedTargets(usize, usize), + + #[error("Updated {0} database rows, expected {1}")] + UnexpectedDatabaseUpdate(usize, usize), + + #[error( + "Address parsing error during Volume snapshot \ + replacement: {0}" + )] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), + + #[error("Multiple volume resource usage records for {0}")] + MultipleResourceUsageRecords(String), +} + impl DataStore { - pub async fn volume_create(&self, volume: Volume) -> CreateResult { + async fn volume_create_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume: Volume, + crucible_targets: CrucibleTargets, + ) -> Result { use db::schema::volume::dsl; - #[derive(Debug, thiserror::Error)] - enum VolumeCreationError { - #[error("Error from Volume creation: {0}")] - Public(Error), + let maybe_volume: Option = dsl::volume + .filter(dsl::id.eq(volume.id())) + .select(Volume::as_select()) + .first_async(conn) + .await + .optional()?; + + // If the volume existed already, return it and do not increase usage + // counts. + if let Some(volume) = maybe_volume { + return Ok(volume); + } + + let volume: Volume = diesel::insert_into(dsl::volume) + .values(volume.clone()) + .returning(Volume::as_returning()) + .get_result_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeCreationError::Public(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Volume, + volume.id().to_string().as_str(), + ), + )) + }) + })?; + + // Increase the usage count for the read-only Crucible resources + use db::schema::volume_resource_usage::dsl as ru_dsl; + + for read_only_target in crucible_targets.read_only_targets { + let read_only_target = read_only_target.parse().map_err(|e| { + err.bail(VolumeCreationError::AddressParseError(e)) + })?; + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; + + match maybe_usage { + Some(usage) => { + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(VolumeResourceUsageRecord::new( + volume.id(), + usage, + )) + .execute_async(conn) + .await?; + } + + None => { + // Something went wrong, bail out - we can't create this + // Volume if we can't record its resource usage correctly + return Err(err.bail( + VolumeCreationError::CouldNotFindResource( + read_only_target, + ), + )); + } + } + } + + // After volume creation, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + + Ok(volume) + } + + /// Return a region by matching against the dataset's address and region's + /// port + async fn target_to_region( + conn: &async_bb8_diesel::Connection, + target: &SocketAddrV6, + region_type: RegionType, + ) -> Result, diesel::result::Error> { + let ip: db::model::Ipv6Addr = target.ip().into(); + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl as region_dsl; + + let read_only = match region_type { + RegionType::ReadWrite => false, + RegionType::ReadOnly => true, + }; + + dataset_dsl::dataset + .inner_join( + region_dsl::region + .on(region_dsl::dataset_id.eq(dataset_dsl::id)), + ) + .filter(dataset_dsl::ip.eq(ip)) + .filter( + region_dsl::port + .eq(Some::(target.port().into())), + ) + .filter(region_dsl::read_only.eq(read_only)) + .filter(region_dsl::deleting.eq(false)) + 
.select(Region::as_select()) + .get_result_async::(conn) + .await + .optional() + } + + async fn read_only_target_to_volume_resource_usage( + conn: &async_bb8_diesel::Connection, + read_only_target: &SocketAddrV6, + ) -> Result, diesel::result::Error> { + // Easy case: it's a region snapshot, and we can match by the snapshot + // address directly + + let maybe_region_snapshot = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::snapshot_addr.eq(read_only_target.to_string())) + .filter(dsl::deleting.eq(false)) + .select(RegionSnapshot::as_select()) + .get_result_async::(conn) + .await + .optional()? + }; + + if let Some(region_snapshot) = maybe_region_snapshot { + return Ok(Some(VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + })); + } + + // Less easy case: it's a read-only region, and we have to match by + // dataset ip and region port + + let maybe_region = Self::target_to_region( + conn, + read_only_target, + RegionType::ReadOnly, + ) + .await?; - #[error("Serde error during Volume creation: {0}")] - SerdeError(#[from] serde_json::Error), + if let Some(region) = maybe_region { + return Ok(Some(VolumeResourceUsage::ReadOnlyRegion { + region_id: region.id(), + })); } + // If the resource was hard-deleted, or in the off chance that the + // region didn't have an assigned port, return None here. + Ok(None) + } + + pub async fn volume_create(&self, volume: Volume) -> CreateResult { // Grab all the targets that the volume construction request references. // Do this outside the transaction, as the data inside volume doesn't // change and this would simply add to the transaction time. @@ -132,67 +371,13 @@ impl DataStore { let crucible_targets = crucible_targets.clone(); let volume = volume.clone(); async move { - let maybe_volume: Option = dsl::volume - .filter(dsl::id.eq(volume.id())) - .select(Volume::as_select()) - .first_async(&conn) - .await - .optional()?; - - // If the volume existed already, return it and do not increase - // usage counts. - if let Some(volume) = maybe_volume { - return Ok(volume); - } - - // TODO do we need on_conflict do_nothing here? if the transaction - // model is read-committed, the SELECT above could return nothing, - // and the INSERT here could still result in a conflict. - // - // See also https://github.com/oxidecomputer/omicron/issues/1168 - let volume: Volume = diesel::insert_into(dsl::volume) - .values(volume.clone()) - .on_conflict(dsl::id) - .do_nothing() - .returning(Volume::as_returning()) - .get_result_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Volume, - volume.id().to_string().as_str(), - ), - ), - ) - }) - })?; - - // Increase the usage count for Crucible resources according to the - // contents of the volume. - - // Increase the number of uses for each referenced region snapshot. 
- use db::schema::region_snapshot::dsl as rs_dsl; - for read_only_target in &crucible_targets.read_only_targets - { - diesel::update(rs_dsl::region_snapshot) - .filter( - rs_dsl::snapshot_addr - .eq(read_only_target.clone()), - ) - .filter(rs_dsl::deleting.eq(false)) - .set( - rs_dsl::volume_references - .eq(rs_dsl::volume_references + 1), - ) - .execute_async(&conn) - .await?; - } - - Ok(volume) + Self::volume_create_in_txn( + &conn, + err, + volume, + crucible_targets, + ) + .await } }) .await @@ -200,10 +385,22 @@ impl DataStore { if let Some(err) = err.take() { match err { VolumeCreationError::Public(err) => err, + VolumeCreationError::SerdeError(err) => { Error::internal_error(&format!( - "Transaction error: {}", - err + "SerdeError error: {err}" + )) + } + + VolumeCreationError::CouldNotFindResource(s) => { + Error::internal_error(&format!( + "CouldNotFindResource error: {s}" + )) + } + + VolumeCreationError::AddressParseError(err) => { + Error::internal_error(&format!( + "AddressParseError error: {err}" )) } } @@ -213,18 +410,27 @@ impl DataStore { }) } - /// Return a `Option` based on id, even if it's soft deleted. - pub async fn volume_get( - &self, + async fn volume_get_impl( + conn: &async_bb8_diesel::Connection, volume_id: Uuid, - ) -> LookupResult> { + ) -> Result, diesel::result::Error> { use db::schema::volume::dsl; dsl::volume .filter(dsl::id.eq(volume_id)) .select(Volume::as_select()) - .first_async::(&*self.pool_connection_unauthorized().await?) + .first_async::(conn) .await .optional() + } + + /// Return a `Option` based on id, even if it's soft deleted. + pub async fn volume_get( + &self, + volume_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_unauthorized().await?; + Self::volume_get_impl(&conn, volume_id) + .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -241,6 +447,105 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + fn volume_usage_records_for_resource_query( + resource: VolumeResourceUsage, + ) -> impl RunnableQuery { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + dsl::volume_resource_usage + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .into_boxed() + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => dsl::volume_resource_usage + .filter( + dsl::usage_type.eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .into_boxed(), + } + } + + /// For a given VolumeResourceUsage, return all found usage records for it. 
+ pub async fn volume_usage_records_for_resource( + &self, + resource: VolumeResourceUsage, + ) -> ListResultVec { + let conn = self.pool_connection_unauthorized().await?; + + Self::volume_usage_records_for_resource_query(resource) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// When moving a resource from one volume to another, call this to update + /// the corresponding volume resource usage record + pub async fn swap_volume_usage_records_for_resources( + conn: &async_bb8_diesel::Connection, + resource: VolumeResourceUsage, + from_volume_id: Uuid, + to_volume_id: Uuid, + ) -> Result<(), diesel::result::Error> { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + } + + Ok(()) + } + async fn volume_checkout_allowed( reason: &VolumeCheckoutReason, vcr: &VolumeConstructionRequest, @@ -259,9 +564,14 @@ impl DataStore { match volume_is_read_only(&vcr) { Ok(read_only) => { if !read_only { - return Err(VolumeGetError::CheckoutConditionFailed( - String::from("Non-read-only Volume Checkout for use Copy!") - )); + return Err( + VolumeGetError::CheckoutConditionFailed( + String::from( + "Non-read-only Volume Checkout \ + for use Copy!", + ), + ), + ); } Ok(()) @@ -300,49 +610,49 @@ impl DataStore { let runtime = instance.runtime(); match (runtime.propolis_id, runtime.dst_propolis_id) { (Some(_), Some(_)) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} is undergoing migration", - vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} is undergoing \ + migration", + vmm_id, + instance.id(), + ))) } (None, None) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} has no propolis ids", - vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} has no \ + propolis ids", + vmm_id, + instance.id(), + ))) } (Some(propolis_id), None) => { if propolis_id != vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} propolis id {} mismatch", + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceStart {}: instance {} propolis \ + id {} mismatch", vmm_id, instance.id(), propolis_id, - ) - )); + ), + ), + ); } Ok(()) } (None, Some(dst_propolis_id)) => { - 
Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceStart {}: instance {} has no propolis id but dst propolis id {}", - vmm_id, - instance.id(), - dst_propolis_id, - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceStart {}: instance {} has no \ + propolis id but dst propolis id {}", + vmm_id, + instance.id(), + dst_propolis_id, + ))) } } } @@ -364,60 +674,67 @@ impl DataStore { let runtime = instance.runtime(); match (runtime.propolis_id, runtime.dst_propolis_id) { (Some(propolis_id), Some(dst_propolis_id)) => { - if propolis_id != vmm_id.into_untyped_uuid() || dst_propolis_id != target_vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} propolis id mismatches {} {}", - vmm_id, - target_vmm_id, - instance.id(), - propolis_id, - dst_propolis_id, - ) - )); + if propolis_id != vmm_id.into_untyped_uuid() + || dst_propolis_id + != target_vmm_id.into_untyped_uuid() + { + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} \ + propolis id mismatches {} {}", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + dst_propolis_id, + ), + ), + ); } Ok(()) } (None, None) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} has no propolis ids", - vmm_id, - target_vmm_id, - instance.id(), - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceMigrate {} {}: instance {} has no \ + propolis ids", + vmm_id, + target_vmm_id, + instance.id(), + ))) } (Some(propolis_id), None) => { // XXX is this right? if propolis_id != vmm_id.into_untyped_uuid() { - return Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} propolis id {} mismatch", - vmm_id, - target_vmm_id, - instance.id(), - propolis_id, - ) - )); + return Err( + VolumeGetError::CheckoutConditionFailed( + format!( + "InstanceMigrate {} {}: instance {} \ + propolis id {} mismatch", + vmm_id, + target_vmm_id, + instance.id(), + propolis_id, + ), + ), + ); } Ok(()) } (None, Some(dst_propolis_id)) => { - Err(VolumeGetError::CheckoutConditionFailed( - format!( - "InstanceMigrate {} {}: instance {} has no propolis id but dst propolis id {}", - vmm_id, - target_vmm_id, - instance.id(), - dst_propolis_id, - ) - )) + Err(VolumeGetError::CheckoutConditionFailed(format!( + "InstanceMigrate {} {}: instance {} has no \ + propolis id but dst propolis id {}", + vmm_id, + target_vmm_id, + instance.id(), + dst_propolis_id, + ))) } } } @@ -453,10 +770,11 @@ impl DataStore { // XXX this is a Nexus bug! return Err(VolumeGetError::CheckoutConditionFailed( format!( - "Pantry: instance {} backing disk {} does not exist?", + "Pantry: instance {} backing disk {} does not \ + exist?", attach_instance_id, disk.id(), - ) + ), )); }; @@ -479,6 +797,181 @@ impl DataStore { } } + async fn volume_checkout_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: Uuid, + reason: VolumeCheckoutReason, + ) -> Result { + use db::schema::volume::dsl; + + // Grab the volume in question. 
+ let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(conn) + .await?; + + // Turn the volume.data into the VolumeConstructionRequest + let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()) + .map_err(|e| err.bail(VolumeGetError::SerdeError(e)))?; + + // The VolumeConstructionRequest resulting from this checkout will have + // its generation numbers bumped, and as result will (if it has + // non-read-only sub-volumes) take over from previous read/write + // activations when sent to a place that will `construct` a new Volume. + // Depending on the checkout reason, prevent creating multiple + // read/write Upstairs acting on the same Volume, except where the take + // over is intended. + + let (maybe_disk, maybe_instance) = { + use db::schema::disk::dsl as disk_dsl; + use db::schema::instance::dsl as instance_dsl; + + let maybe_disk: Option = disk_dsl::disk + .filter(disk_dsl::time_deleted.is_null()) + .filter(disk_dsl::volume_id.eq(volume_id)) + .select(Disk::as_select()) + .get_result_async(conn) + .await + .optional()?; + + let maybe_instance: Option = + if let Some(disk) = &maybe_disk { + if let Some(attach_instance_id) = + disk.runtime().attach_instance_id + { + instance_dsl::instance + .filter(instance_dsl::time_deleted.is_null()) + .filter(instance_dsl::id.eq(attach_instance_id)) + .select(Instance::as_select()) + .get_result_async(conn) + .await + .optional()? + } else { + // Disk not attached to an instance + None + } + } else { + // Volume not associated with disk + None + }; + + (maybe_disk, maybe_instance) + }; + + if let Err(e) = Self::volume_checkout_allowed( + &reason, + &vcr, + maybe_disk, + maybe_instance, + ) + .await + { + return Err(err.bail(e)); + } + + // Look to see if the VCR is a Volume type, and if so, look at its + // sub_volumes. If they are of type Region, then we need to update their + // generation numbers and record that update back to the database. We + // return to the caller whatever the original volume data was we pulled + // from the database. + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + let mut update_needed = false; + let mut new_sv = Vec::new(); + for sv in sub_volumes { + match sv { + VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen, + } => { + update_needed = true; + new_sv.push(VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen: gen + 1, + }); + } + _ => { + new_sv.push(sv); + } + } + } + + // Only update the volume data if we found the type of volume + // that needed it. + if update_needed { + // Create a new VCR and fill in the contents from what the + // original volume had, but with our updated sub_volume + // records. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes: new_sv, + read_only_parent, + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(VolumeGetError::SerdeError(e)))?; + + // Update the original volume_id with the new volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await?; + + // This should update just one row. If it does not, then + // something is terribly wrong in the database. 
+ if num_updated != 1 { + return Err(err.bail( + VolumeGetError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + } + } + VolumeConstructionRequest::Region { + block_size: _, + blocks_per_extent: _, + extent_count: _, + opts: _, + gen: _, + } => { + // We don't support a pure Region VCR at the volume level in the + // database, so this choice should never be encountered, but I + // want to know if it is. + return Err(err.bail(VolumeGetError::InvalidVolume( + String::from("Region not supported as a top level volume"), + ))); + } + VolumeConstructionRequest::File { + id: _, + block_size: _, + path: _, + } + | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => + {} + } + + Ok(volume) + } + /// Checkout a copy of the Volume from the database. /// This action (getting a copy) will increase the generation number /// of Volumes of the VolumeConstructionRequest::Volume type that have @@ -490,8 +983,6 @@ impl DataStore { volume_id: Uuid, reason: VolumeCheckoutReason, ) -> LookupResult { - use db::schema::volume::dsl; - // We perform a transaction here, to be sure that on completion // of this, the database contains an updated version of the // volume with the generation number incremented (for the volume @@ -505,176 +996,8 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - // Grab the volume in question. - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await?; - - // Turn the volume.data into the VolumeConstructionRequest - let vcr: VolumeConstructionRequest = - serde_json::from_str(volume.data()).map_err(|e| { - err.bail(VolumeGetError::SerdeError(e)) - })?; - - // The VolumeConstructionRequest resulting from this checkout will have its - // generation numbers bumped, and as result will (if it has non-read-only - // sub-volumes) take over from previous read/write activations when sent to a - // place that will `construct` a new Volume. Depending on the checkout reason, - // prevent creating multiple read/write Upstairs acting on the same Volume, - // except where the take over is intended. - - let (maybe_disk, maybe_instance) = { - use db::schema::instance::dsl as instance_dsl; - use db::schema::disk::dsl as disk_dsl; - - let maybe_disk: Option = disk_dsl::disk - .filter(disk_dsl::time_deleted.is_null()) - .filter(disk_dsl::volume_id.eq(volume_id)) - .select(Disk::as_select()) - .get_result_async(&conn) - .await - .optional()?; - - let maybe_instance: Option = if let Some(disk) = &maybe_disk { - if let Some(attach_instance_id) = disk.runtime().attach_instance_id { - instance_dsl::instance - .filter(instance_dsl::time_deleted.is_null()) - .filter(instance_dsl::id.eq(attach_instance_id)) - .select(Instance::as_select()) - .get_result_async(&conn) - .await - .optional()? - } else { - // Disk not attached to an instance - None - } - } else { - // Volume not associated with disk - None - }; - - (maybe_disk, maybe_instance) - }; - - if let Err(e) = Self::volume_checkout_allowed( - &reason, - &vcr, - maybe_disk, - maybe_instance, - ) - .await { - return Err(err.bail(e)); - } - - // Look to see if the VCR is a Volume type, and if so, look at - // its sub_volumes. If they are of type Region, then we need - // to update their generation numbers and record that update - // back to the database. We return to the caller whatever the - // original volume data was we pulled from the database. 
- match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - let mut update_needed = false; - let mut new_sv = Vec::new(); - for sv in sub_volumes { - match sv { - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen, - } => { - update_needed = true; - new_sv.push( - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen: gen + 1, - }, - ); - } - _ => { - new_sv.push(sv); - } - } - } - - // Only update the volume data if we found the type - // of volume that needed it. - if update_needed { - // Create a new VCR and fill in the contents - // from what the original volume had, but with our - // updated sub_volume records. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes: new_sv, - read_only_parent, - }; - - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - err.bail(VolumeGetError::SerdeError(e)) - })?; - - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; - - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(err.bail( - VolumeGetError::UnexpectedDatabaseUpdate( - num_updated, - 1, - ), - )); - } - } - } - VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _, - } => { - // We don't support a pure Region VCR at the volume - // level in the database, so this choice should - // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); - } - VolumeConstructionRequest::File { - id: _, - block_size: _, - path: _, - } - | VolumeConstructionRequest::Url { - id: _, - block_size: _, - url: _, - } => {} - } - Ok(volume) + Self::volume_checkout_in_txn(&conn, err, volume_id, reason) + .await } }) .await @@ -686,7 +1009,10 @@ impl DataStore { } _ => { - return Error::internal_error(&format!("Transaction error: {}", err)); + return Error::internal_error(&format!( + "Transaction error: {}", + err + )); } } } @@ -731,11 +1057,12 @@ impl DataStore { VolumeConstructionRequest::Region { opts, .. } => { if !opts.read_only { // Only one volume can "own" a Region, and that volume's - // UUID is recorded in the region table accordingly. It is - // an error to make a copy of a volume construction request - // that references non-read-only Regions. + // UUID is recorded in the region table accordingly. It + // is an error to make a copy of a volume construction + // request that references non-read-only Regions. bail!( - "only one Volume can reference a Region non-read-only!" + "only one Volume can reference a Region \ + non-read-only!" ); } @@ -781,18 +1108,35 @@ impl DataStore { } /// Find regions for deleted volumes that do not have associated region - /// snapshots. 
+ /// snapshots and are not being used by any other non-deleted volumes, and + /// return them for garbage collection pub async fn find_deleted_volume_regions( &self, - ) -> ListResultVec<(Dataset, Region, Option, Volume)> { + ) -> ListResultVec<(Dataset, Region, Option)> { + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("find_deleted_volume_regions") + .transaction(&conn, |conn| async move { + Self::find_deleted_volume_regions_in_txn(&conn).await + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + async fn find_deleted_volume_regions_in_txn( + conn: &async_bb8_diesel::Connection, + ) -> Result)>, diesel::result::Error> + { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; use db::schema::volume::dsl as volume_dsl; - // Find all regions and datasets - region_dsl::region - .inner_join( + // Find all read-write regions (read-only region cleanup is taken care + // of in soft_delete_volume_in_txn!) and their associated datasets + let unfiltered_deleted_regions = region_dsl::region + .filter(region_dsl::read_only.eq(false)) + // the volume may be hard deleted, so use a left join here + .left_join( volume_dsl::volume.on(region_dsl::volume_id.eq(volume_dsl::id)), ) .inner_join( @@ -800,33 +1144,70 @@ impl DataStore { .on(region_dsl::dataset_id.eq(dataset_dsl::id)), ) // where there either are no region snapshots, or the region - // snapshot volume references have gone to zero + // snapshot volume has deleted = true .left_join( dsl::region_snapshot.on(dsl::region_id .eq(region_dsl::id) .and(dsl::dataset_id.eq(dataset_dsl::id))), ) - .filter( - dsl::volume_references - .eq(0) - // Despite the SQL specifying that this column is NOT NULL, - // this null check is required for this function to work! - // It's possible that the left join of region_snapshot above - // could join zero rows, making this null. - .or(dsl::volume_references.is_null()), - ) - // where the volume has already been soft-deleted - .filter(volume_dsl::time_deleted.is_not_null()) + .filter(dsl::deleting.eq(true).or(dsl::deleting.is_null())) // and return them (along with the volume so it can be hard deleted) .select(( Dataset::as_select(), Region::as_select(), Option::::as_select(), - Volume::as_select(), + // Diesel can't express a difference between + // + // a) the volume record existing and the nullable + // volume.time_deleted column being set to null + // b) the volume record does not exist (null due to left join) + // + // so return an Option and check below + Option::::as_select(), )) - .load_async(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .load_async(conn) + .await?; + + let mut deleted_regions = + Vec::with_capacity(unfiltered_deleted_regions.len()); + + for (dataset, region, region_snapshot, volume) in + unfiltered_deleted_regions + { + // only operate on soft deleted volumes + let soft_deleted = match &volume { + Some(volume) => volume.time_deleted.is_some(), + None => false, + }; + + if !soft_deleted { + continue; + } + + if region_snapshot.is_some() { + // We cannot delete this region: the presence of the region + // snapshot record means that the Crucible agent's snapshot has + // not been deleted yet (as the lifetime of the region snapshot + // record is equal to or longer than the lifetime of the + // Crucible agent's snapshot). 
+ // + // This condition can occur when multiple volume delete sagas + // run concurrently: one will decrement the crucible resources + // (but hasn't made the appropriate DELETE calls to the + // appropriate Agents to tombstone the running snapshots and + // snapshots yet), and the other will be in the "delete freed + // regions" saga node trying to delete the region. Without this + // check, This race results in the Crucible Agent returning + // "must delete snapshots first" and causing saga unwinds. + // + // Another volume delete will pick up this region and remove it. + continue; + } + + deleted_regions.push((dataset, region, volume)); + } + + Ok(deleted_regions) } pub async fn read_only_resources_associated_with_volume( @@ -853,20 +1234,30 @@ impl DataStore { } #[derive(Debug, thiserror::Error)] -enum SoftDeleteTransactionError { +enum SoftDeleteError { #[error("Serde error decreasing Crucible resources: {0}")] SerdeError(#[from] serde_json::Error), - #[error("Updated {0} database rows, expected 1")] - UnexpectedDatabaseUpdate(usize), + #[error("Updated {0} database rows in {1}, expected 1")] + UnexpectedDatabaseUpdate(usize, String), + + // XXX is this an error? delete volume anyway, else we're stuck? + #[error("Could not match resource to {0}")] + CouldNotFindResource(String), + + #[error("Address parsing error during Volume soft-delete: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Invalid Volume: {0}")] + InvalidVolume(String), } impl DataStore { // See comment for `soft_delete_volume` - async fn soft_delete_volume_txn( + async fn soft_delete_volume_in_txn( conn: &async_bb8_diesel::Connection, volume_id: Uuid, - err: OptionalError, + err: OptionalError, ) -> Result { // Grab the volume, and check if the volume was already soft-deleted. // We have to guard against the case where this function is called @@ -923,8 +1314,8 @@ impl DataStore { .map_err(|e| err.bail(e.into()))?; // Grab all the targets that the volume construction request references. - // Do this _inside_ the transaction, as the read-only parent data inside - // volume can change as a result of region snapshot replacement. + // Do this _inside_ the transaction, as the data inside volumes can + // change as a result of region / region snapshot replacement. let crucible_targets = { let mut crucible_targets = CrucibleTargets::default(); read_only_resources_associated_with_volume( @@ -934,52 +1325,270 @@ impl DataStore { crucible_targets }; - // Decrease the number of references for each region snapshot that a - // volume references, returning the updated rows. - let updated_region_snapshots = ConditionallyDecreaseReferences::new( - crucible_targets.read_only_targets, - ) - .get_results_async::(conn) - .await?; + // Decrease the number of references for each resource that a volume + // references, collecting the regions and region snapshots that were + // freed up for deletion. + + let num_read_write_subvolumes = + match count_read_write_sub_volumes(&vcr) { + Ok(v) => v, + Err(e) => { + return Err(err.bail(SoftDeleteError::InvalidVolume( + format!("volume {} invalid: {e}", volume.id()), + ))); + } + }; - // Return all regions for the volume to be cleaned up. 
- let regions: Vec = { - use db::schema::region::dsl as region_dsl; - use db::schema::region_snapshot::dsl as region_snapshot_dsl; + let mut regions: Vec = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); - region_dsl::region - .left_join( - region_snapshot_dsl::region_snapshot - .on(region_snapshot_dsl::region_id.eq(region_dsl::id)), - ) - .filter( - region_snapshot_dsl::volume_references - .eq(0) - .or(region_snapshot_dsl::volume_references.is_null()), - ) - .filter(region_dsl::volume_id.eq(volume_id)) - .select(Region::as_select()) - .get_results_async(conn) - .await? - .into_iter() - .map(|region| region.id()) - .collect() - }; + let mut region_snapshots: Vec = + Vec::with_capacity(crucible_targets.read_only_targets.len()); - // Return the region snapshots that had their volume_references updated - // to 0 (and had the deleting flag set) to be cleaned up. - let region_snapshots = updated_region_snapshots - .into_iter() - .filter(|region_snapshot| { - region_snapshot.volume_references == 0 - && region_snapshot.deleting - }) - .map(|region_snapshot| RegionSnapshotV3 { - dataset: region_snapshot.dataset_id, - region: region_snapshot.region_id, - snapshot: region_snapshot.snapshot_id, - }) - .collect(); + // First, grab read-write regions - they're not shared, but they are + // not candidates for deletion if there are region snapshots + let mut read_write_targets = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); + + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let target = target + .parse() + .map_err(|e| err.bail(SoftDeleteError::AddressParseError(e)))?; + + let maybe_region = + Self::target_to_region(conn, &target, RegionType::ReadWrite) + .await?; + + let Some(region) = maybe_region else { + return Err(err.bail(SoftDeleteError::CouldNotFindResource( + format!("could not find resource for {target}"), + ))); + }; + + // Filter out regions that have any region-snapshots + let region_snapshot_count: i64 = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::region_id.eq(region.id())) + .count() + .get_result_async::(conn) + .await? + }; + + if region_snapshot_count == 0 { + regions.push(region.id()); + } + } + + for read_only_target in &crucible_targets.read_only_targets { + use db::schema::volume_resource_usage::dsl as ru_dsl; + + let read_only_target = read_only_target + .parse() + .map_err(|e| err.bail(SoftDeleteError::AddressParseError(e)))?; + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; + + let Some(usage) = maybe_usage else { + return Err(err.bail(SoftDeleteError::CouldNotFindResource( + format!("could not find resource for {read_only_target}"), + ))); + }; + + // For each read-only resource, remove the associated volume + // resource usage record for this volume. Only return a resource for + // deletion if no more associated volume usage records are found. 
+ match usage { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::ReadOnlyRegion, + ), + ) + .filter(ru_dsl::region_id.eq(Some(region_id))) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage (region)".into(), + ), + )); + } + + let region_usage_left = ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(ru_dsl::region_id.eq(region_id)) + .count() + .get_result_async::(conn) + .await?; + + if region_usage_left == 0 { + // There are several factors that mean Nexus does _not_ + // have to check here for region snapshots taken of + // read-only regions: + // + // - When a Crucible Volume receives a flush, it is only + // propagated to the subvolumes of that Volume, not + // the read-only parent. There's a few reasons for + // this, but the main one is that a Crucible flush + // changes the on-disk data, and the directory that + // the downstairs of a read-only parent is serving out + // of may be on read-only storage, as is the case when + // serving out of a .zfs/snapshot/ directory. + // + // - Even if a Crucible flush _did_ propagate to the + // read-only parent, Nexus should never directly send + // the snapshot volume to a place where it will be + // constructed, meaning the read-only regions of the + // snapshot volume's subvolumes will never themselves + // receive a flush. + // + // If either of these factors change, then that check is + // required here. The `validate_volume_invariants` + // function will return an error if a read-only region + // has an associated region snapshot during testing. + // + // However, don't forget to set `deleting`! These + // regions will be returned to the calling function for + // garbage collection. 
+ use db::schema::region::dsl; + let updated_rows = diesel::update(dsl::region) + .filter(dsl::id.eq(region_id)) + .filter(dsl::read_only.eq(true)) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting (region)".into(), + ), + )); + } + + regions.push(region_id); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage \ + (region_snapshot)" + .into(), + ), + )); + } + + let region_snapshot_usage_left = + ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .count() + .get_result_async::(conn) + .await?; + + if region_snapshot_usage_left == 0 { + // Don't forget to set `deleting`! see: omicron#4095 + use db::schema::region_snapshot::dsl; + let updated_rows = diesel::update(dsl::region_snapshot) + .filter(dsl::dataset_id.eq(dataset_id)) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::snapshot_id.eq(snapshot_id)) + .filter( + dsl::snapshot_addr + .eq(read_only_target.to_string()), + ) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting (region snapshot)".into(), + ), + )); + } + + region_snapshots.push(RegionSnapshotV3 { + dataset: dataset_id, + region: region_id, + snapshot: snapshot_id, + }); + } + } + } + } let resources_to_delete = CrucibleResources::V3(CrucibleResourcesV3 { regions, @@ -1003,13 +1612,18 @@ impl DataStore { if updated_rows != 1 { return Err(err.bail( - SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + SoftDeleteError::UnexpectedDatabaseUpdate( updated_rows, + "volume".into(), ), )); } } + // After volume deletion, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(&conn).await?; + Ok(resources_to_delete) } @@ -1027,7 +1641,7 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - Self::soft_delete_volume_txn(&conn, volume_id, err).await + Self::soft_delete_volume_in_txn(&conn, volume_id, err).await } }) .await @@ -1040,6 +1654,191 @@ impl DataStore { }) } + async fn volume_remove_rop_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: Uuid, + temp_volume_id: Uuid, + ) -> Result { + // Grab the volume in question. If the volume record was already deleted + // then we can just return. 
+ let volume = { + use db::schema::volume::dsl; + + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(conn) + .await + .optional()?; + + let volume = if let Some(v) = volume { + v + } else { + // the volume does not exist, nothing to do. + return Ok(false); + }; + + if volume.time_deleted.is_some() { + // this volume is deleted, so let whatever is deleting it clean + // it up. + return Ok(false); + } else { + // A volume record exists, and was not deleted, we can attempt + // to remove its read_only_parent. + volume + } + }; + + // If a read_only_parent exists, remove it from volume_id, and attach it + // to temp_volume_id. + let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; + + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + if read_only_parent.is_none() { + // This volume has no read_only_parent + Ok(false) + } else { + // Create a new VCR and fill in the contents from what the + // original volume had. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent: None, + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; + + // Update the original volume_id with the new volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await?; + + // This should update just one row. If it does not, then + // something is terribly wrong in the database. + if num_updated != 1 { + return Err(err.bail( + RemoveRopError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + + // Make a new VCR, with the information from our + // temp_volume_id, but the read_only_parent from the + // original volume. + let rop_vcr = VolumeConstructionRequest::Volume { + id: temp_volume_id, + block_size, + sub_volumes: vec![], + read_only_parent, + }; + + let rop_volume_data = serde_json::to_string(&rop_vcr) + .map_err(|e| err.bail(RemoveRopError::SerdeError(e)))?; + + // Update the temp_volume_id with the volume data that + // contains the read_only_parent. 
+ let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(temp_volume_id)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(rop_volume_data)) + .execute_async(conn) + .await?; + + if num_updated != 1 { + return Err(err.bail( + RemoveRopError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } + + // Update the volume resource usage record for every + // read-only resource in the ROP + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &rop_vcr, + &mut crucible_targets, + ); + crucible_targets + }; + + for read_only_target in crucible_targets.read_only_targets { + let read_only_target = + read_only_target.parse().map_err(|e| { + err.bail(RemoveRopError::AddressParseError(e)) + })?; + + let maybe_usage = + Self::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; + + let Some(usage) = maybe_usage else { + return Err(err.bail( + RemoveRopError::CouldNotFindResource(format!( + "could not find resource for \ + {read_only_target}" + )), + )); + }; + + Self::swap_volume_usage_records_for_resources( + conn, + usage, + volume_id, + temp_volume_id, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + RemoveRopError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ), + ) + }) + })?; + } + + // After read-only parent removal, validate invariants for + // all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; + + Ok(true) + } + } + + VolumeConstructionRequest::File { .. } + | VolumeConstructionRequest::Region { .. } + | VolumeConstructionRequest::Url { .. } => { + // Volume has a format that does not contain ROPs + Ok(false) + } + } + } + // Here we remove the read only parent from volume_id, and attach it // to temp_volume_id. // @@ -1051,15 +1850,6 @@ impl DataStore { volume_id: Uuid, temp_volume_id: Uuid, ) -> Result { - #[derive(Debug, thiserror::Error)] - enum RemoveReadOnlyParentError { - #[error("Serde error removing read only parent: {0}")] - SerdeError(#[from] serde_json::Error), - - #[error("Updated {0} database rows, expected {1}")] - UnexpectedDatabaseUpdate(usize, usize), - } - // In this single transaction: // - Get the given volume from the volume_id from the database // - Extract the volume.data into a VolumeConstructionRequest (VCR) @@ -1080,147 +1870,22 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - // Grab the volume in question. If the volume record was already - // deleted then we can just return. - let volume = { - use db::schema::volume::dsl; - - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await - .optional()?; - - let volume = if let Some(v) = volume { - v - } else { - // the volume does not exist, nothing to do. - return Ok(false); - }; - - if volume.time_deleted.is_some() { - // this volume is deleted, so let whatever is deleting - // it clean it up. - return Ok(false); - } else { - // A volume record exists, and was not deleted, we - // can attempt to remove its read_only_parent. - volume - } - }; - - // If a read_only_parent exists, remove it from volume_id, and - // attach it to temp_volume_id. 
- let vcr: VolumeConstructionRequest = - serde_json::from_str( - volume.data() - ) - .map_err(|e| { - err.bail( - RemoveReadOnlyParentError::SerdeError( - e, - ) - ) - })?; - - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - if read_only_parent.is_none() { - // This volume has no read_only_parent - Ok(false) - } else { - // Create a new VCR and fill in the contents - // from what the original volume had. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent: None, - }; - - let new_volume_data = - serde_json::to_string( - &new_vcr - ) - .map_err(|e| { - err.bail(RemoveReadOnlyParentError::SerdeError( - e, - )) - })?; - - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; - - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); - } - - // Make a new VCR, with the information from - // our temp_volume_id, but the read_only_parent - // from the original volume. - let rop_vcr = VolumeConstructionRequest::Volume { - id: temp_volume_id, - block_size, - sub_volumes: vec![], - read_only_parent, - }; - let rop_volume_data = - serde_json::to_string( - &rop_vcr - ) - .map_err(|e| { - err.bail(RemoveReadOnlyParentError::SerdeError( - e, - )) - })?; - // Update the temp_volume_id with the volume - // data that contains the read_only_parent. - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(temp_volume_id)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(rop_volume_data)) - .execute_async(&conn) - .await?; - if num_updated != 1 { - return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); - } - Ok(true) - } - } - VolumeConstructionRequest::File { id: _, block_size: _, path: _ } - | VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _ } - | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { - // Volume has a format that does not contain ROPs - Ok(false) - } - } + Self::volume_remove_rop_in_txn( + &conn, + err, + volume_id, + temp_volume_id, + ) + .await } }) .await .map_err(|e| { if let Some(err) = err.take() { - return Error::internal_error(&format!("Transaction error: {}", err)); + return Error::internal_error(&format!( + "Transaction error: {}", + err + )); } public_error_from_diesel(e, ErrorHandler::Server) }) @@ -1305,9 +1970,9 @@ impl DataStore { if mismatched_record_type_count > 0 { return Err(err.bail(Error::conflict(&format!( - "existing repair type for id {} does not match {:?}!", - record.repair_id, - record.repair_type, + "existing repair type for id {} does not \ + match {:?}!", + record.repair_id, record.repair_type, )))); } @@ -1368,17 +2033,65 @@ impl DataStore { .execute_async(&conn) .await?; - Ok(()) - } - }) - .await - .map_err(|e| { - if let Some(err) = err.take() { - err - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + 
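The hunks above and below repeat one refactor: each retryable transaction body moves out of its closure into an associated `*_in_txn` function that takes the pooled connection and an `OptionalError`, while the public method shrinks to a `transaction_retry_wrapper` shell. A rough sketch of that shape, using illustrative names (`do_thing`, `DoThingError`, and the `DbConnection` alias) that are not part of this patch:

    // Sketch only, in the style of the surrounding impl; not code from this change.
    async fn do_thing_in_txn(
        conn: &async_bb8_diesel::Connection<DbConnection>,
        err: OptionalError<DoThingError>,
        volume_id: Uuid,
    ) -> Result<(), diesel::result::Error> {
        // Queries run against `conn`; non-retryable failures call
        // `err.bail(..)` so the typed error survives the rollback.
        Ok(())
    }

    pub async fn do_thing(&self, volume_id: Uuid) -> Result<(), Error> {
        let err = OptionalError::new();
        let conn = self.pool_connection_unauthorized().await?;

        self.transaction_retry_wrapper("do_thing")
            .transaction(&conn, |conn| {
                let err = err.clone();
                async move {
                    Self::do_thing_in_txn(&conn, err, volume_id).await
                }
            })
            .await
            .map_err(|e| {
                if let Some(err) = err.take() {
                    // `DoThingError` would be a thiserror enum like the ones
                    // introduced above.
                    Error::internal_error(&format!("Transaction error: {}", err))
                } else {
                    public_error_from_diesel(e, ErrorHandler::Server)
                }
            })
    }

The `err.bail(..)` / `err.take()` pair is what carries a typed error out of the retry wrapper, which otherwise only surfaces `diesel::result::Error`.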
+ async fn upstairs_repair_progress_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + upstairs_id: TypedUuid, + repair_id: TypedUuid, + repair_progress: RepairProgress, + ) -> Result<(), diesel::result::Error> { + use db::schema::upstairs_repair_notification::dsl as notification_dsl; + use db::schema::upstairs_repair_progress::dsl; + + // Check that there is a repair id for the upstairs id + let matching_repair: Option = + notification_dsl::upstairs_repair_notification + .filter( + notification_dsl::repair_id + .eq(nexus_db_model::to_db_typed_uuid(repair_id)), + ) + .filter( + notification_dsl::upstairs_id + .eq(nexus_db_model::to_db_typed_uuid(upstairs_id)), + ) + .filter( + notification_dsl::notification_type + .eq(UpstairsRepairNotificationType::Started), + ) + .get_result_async(conn) + .await + .optional()?; + + if matching_repair.is_none() { + return Err(err.bail(Error::non_resourcetype_not_found(&format!( + "upstairs {upstairs_id} repair {repair_id} not found" + )))); + } + + diesel::insert_into(dsl::upstairs_repair_progress) + .values(UpstairsRepairProgress { + repair_id: repair_id.into(), + time: repair_progress.time, + current_item: repair_progress.current_item, + total_items: repair_progress.total_items, }) + .execute_async(conn) + .await?; + + Ok(()) } /// Record Upstairs repair progress @@ -1389,9 +2102,6 @@ impl DataStore { repair_id: TypedUuid, repair_progress: RepairProgress, ) -> Result<(), Error> { - use db::schema::upstairs_repair_notification::dsl as notification_dsl; - use db::schema::upstairs_repair_progress::dsl; - let conn = self.pool_connection_authorized(opctx).await?; let err = OptionalError::new(); @@ -1401,33 +2111,14 @@ impl DataStore { let err = err.clone(); async move { - // Check that there is a repair id for the upstairs id - let matching_repair: Option = - notification_dsl::upstairs_repair_notification - .filter(notification_dsl::repair_id.eq(nexus_db_model::to_db_typed_uuid(repair_id))) - .filter(notification_dsl::upstairs_id.eq(nexus_db_model::to_db_typed_uuid(upstairs_id))) - .filter(notification_dsl::notification_type.eq(UpstairsRepairNotificationType::Started)) - .get_result_async(&conn) - .await - .optional()?; - - if matching_repair.is_none() { - return Err(err.bail(Error::non_resourcetype_not_found(&format!( - "upstairs {upstairs_id} repair {repair_id} not found" - )))); - } - - diesel::insert_into(dsl::upstairs_repair_progress) - .values(UpstairsRepairProgress { - repair_id: repair_id.into(), - time: repair_progress.time, - current_item: repair_progress.current_item, - total_items: repair_progress.total_items, - }) - .execute_async(&conn) - .await?; - - Ok(()) + Self::upstairs_repair_progress_in_txn( + &conn, + err, + upstairs_id, + repair_id, + repair_progress, + ) + .await } }) .await @@ -1608,7 +2299,7 @@ pub struct CrucibleResourcesV2 { pub snapshots_to_delete: Vec, } -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] pub struct RegionSnapshotV3 { dataset: Uuid, region: Uuid, @@ -1883,6 +2574,7 @@ fn read_only_target_in_vcr( Ok(false) } +#[derive(Clone)] pub struct VolumeReplacementParams { pub volume_id: Uuid, pub region_id: Uuid, @@ -1920,12 +2612,12 @@ pub enum VolumeReplaceResult { } impl DataStore { - /// Replace a read-write region in a Volume with a new region. 
- pub async fn volume_replace_region( - &self, + async fn volume_replace_region_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, existing: VolumeReplacementParams, replacement: VolumeReplacementParams, - ) -> Result { + ) -> Result { // In a single transaction: // // - set the existing region's volume id to the replacement's volume id @@ -2016,191 +2708,546 @@ impl DataStore { // referenced (via its socket address) in NEW_VOL. For an example, this // is done as part of the region replacement start saga. - #[derive(Debug, thiserror::Error)] - enum VolumeReplaceRegionError { - #[error("Error from Volume region replacement: {0}")] - Public(Error), + // Grab the old volume first + let maybe_old_volume = { + volume_dsl::volume + .filter(volume_dsl::id.eq(existing.volume_id)) + .select(Volume::as_select()) + .first_async::(conn) + .await + .optional() + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? + }; + + let old_volume = if let Some(old_volume) = maybe_old_volume { + old_volume + } else { + // Existing volume was hard-deleted, so return here. We can't + // perform the region replacement now, and this will short-circuit + // the rest of the process. + + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + }; + + if old_volume.time_deleted.is_some() { + // Existing volume was soft-deleted, so return here for the same + // reason: the region replacement process should be short-circuited + // now. + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + } + + let old_vcr: VolumeConstructionRequest = + match serde_json::from_str(&old_volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(err.bail(ReplaceRegionError::SerdeError(e))); + } + }; - #[error("Serde error during Volume region replacement: {0}")] - SerdeError(#[from] serde_json::Error), + // Does it look like this replacement already happened? + let old_region_in_vcr = + match region_in_vcr(&old_vcr, &existing.region_addr) { + Ok(v) => v, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; + let new_region_in_vcr = + match region_in_vcr(&old_vcr, &replacement.region_addr) { + Ok(v) => v, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; - #[error("Region replacement error: {0}")] - RegionReplacementError(#[from] anyhow::Error), + if !old_region_in_vcr && new_region_in_vcr { + // It does seem like the replacement happened - if this function is + // called twice in a row then this can happen. + return Ok(VolumeReplaceResult::AlreadyHappened); + } else if old_region_in_vcr && !new_region_in_vcr { + // The replacement hasn't happened yet, but can proceed + } else if old_region_in_vcr && new_region_in_vcr { + // Both the old region and new region exist in this VCR. Regions are + // not reused, so this is an illegal state: if the replacement of + // the old region occurred, then the new region would be present + // multiple times in the volume. We have to bail out here. + // + // The guards against this happening are: + // + // - only one replacement can occur for a volume at a time (due to + // the volume repair lock), and + // + // - region replacement does not delete the old region until the + // "region replacement finish" saga, which happens at the very end + // of the process. 
If it eagerly deleted the region, the crucible
+            //   agent would be free to reuse the port for another region
+            //   allocation, and an identical target (read: ip and port) could
+            //   be confusing. Most of the time, we assume that the dataset
+            //   containing that agent has been expunged, so the agent is gone,
+            //   so this port reuse cannot occur.
+            return Err(err.bail(ReplaceRegionError::RegionReplacementError(
+                anyhow!("old_region_in_vcr && new_region_in_vcr"),
+            )));
+        } else if !old_region_in_vcr && !new_region_in_vcr {
+            // Neither the region we've been asked to replace nor the new
+            // region is in the VCR. This is an illegal state, as this function
+            // would be performing a no-op. We have to bail out here.
+            //
+            // The guard against this happening is again that only one
+            // replacement can occur for a volume at a time: if it was possible
+            // for multiple region replacements to occur, then both would be
+            // attempting to swap out the same old region for different new
+            // regions:
+            //
+            // region replacement one:
+            //
+            //   volume_replace_region_in_txn(
+            //       ..,
+            //       existing = [fd00:1122:3344:145::10]:40001,
+            //       replacement = [fd00:1122:3344:322::4]:3956,
+            //   )
+            //
+            // region replacement two:
+            //
+            //   volume_replace_region_in_txn(
+            //       ..,
+            //       existing = [fd00:1122:3344:145::10]:40001,
+            //       replacement = [fd00:1122:3344:fd1::123]:27001,
+            //   )
+            //
+            // The one that replaced second would always land in this branch.
+            return Err(err.bail(ReplaceRegionError::RegionReplacementError(
+                anyhow!("!old_region_in_vcr && !new_region_in_vcr"),
+            )));
+        }
+
+        use db::schema::region::dsl as region_dsl;
+        use db::schema::volume::dsl as volume_dsl;
+
+        // Set the existing region's volume id to the replacement's volume id
+        diesel::update(region_dsl::region)
+            .filter(region_dsl::id.eq(existing.region_id))
+            .set(region_dsl::volume_id.eq(replacement.volume_id))
+            .execute_async(conn)
+            .await
+            .map_err(|e| {
+                err.bail_retryable_or_else(e, |e| {
+                    ReplaceRegionError::Public(public_error_from_diesel(
+                        e,
+                        ErrorHandler::Server,
+                    ))
+                })
+            })?;
+
+        // Set the replacement region's volume id to the existing's volume id
+        diesel::update(region_dsl::region)
+            .filter(region_dsl::id.eq(replacement.region_id))
+            .set(region_dsl::volume_id.eq(existing.volume_id))
+            .execute_async(conn)
+            .await
+            .map_err(|e| {
+                err.bail_retryable_or_else(e, |e| {
+                    ReplaceRegionError::Public(public_error_from_diesel(
+                        e,
+                        ErrorHandler::Server,
+                    ))
+                })
+            })?;
+
+        // Update the existing volume's construction request to replace the
+        // existing region's SocketAddrV6 with the replacement region's
+
+        // Copy the old volume's VCR, changing out the old region for the new.
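+        // (Errors from the rewrite are mapped to RegionReplacementError
+        // below.)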
+ let new_vcr = match replace_region_in_vcr( + &old_vcr, + existing.region_addr, + replacement.region_addr, + ) { + Ok(new_vcr) => new_vcr, + Err(e) => { + return Err( + err.bail(ReplaceRegionError::RegionReplacementError(e)) + ); + } + }; + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(ReplaceRegionError::SerdeError(e)))?; + + // Update the existing volume's data + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(existing.volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceRegionError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // After region replacement, validate invariants for all volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; + + Ok(VolumeReplaceResult::Done) + } + + /// Replace a read-write region in a Volume with a new region. + pub async fn volume_replace_region( + &self, + existing: VolumeReplacementParams, + replacement: VolumeReplacementParams, + ) -> Result { let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("volume_replace_region") .transaction(&conn, |conn| { let err = err.clone(); + let existing = existing.clone(); + let replacement = replacement.clone(); async move { - // Grab the old volume first - let maybe_old_volume = { - volume_dsl::volume - .filter(volume_dsl::id.eq(existing.volume_id)) - .select(Volume::as_select()) - .first_async::(&conn) - .await - .optional() - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })? - }; - - let old_volume = if let Some(old_volume) = maybe_old_volume { - old_volume - } else { - // Existing volume was hard-deleted, so return here. We - // can't perform the region replacement now, and this - // will short-circuit the rest of the process. + Self::volume_replace_region_in_txn( + &conn, + err, + existing, + replacement, + ) + .await + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + ReplaceRegionError::Public(e) => e, - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - }; + ReplaceRegionError::SerdeError(_) => { + Error::internal_error(&err.to_string()) + } - if old_volume.time_deleted.is_some() { - // Existing volume was soft-deleted, so return here for - // the same reason: the region replacement process - // should be short-circuited now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + ReplaceRegionError::RegionReplacementError(_) => { + Error::internal_error(&err.to_string()) + } } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } - let old_vcr: VolumeConstructionRequest = - match serde_json::from_str(&old_volume.data()) { - Ok(vcr) => vcr, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::SerdeError(e))); - }, - }; + async fn volume_replace_snapshot_in_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume_id: VolumeWithTarget, + existing: ExistingTarget, + replacement: ReplacementTarget, + volume_to_delete_id: VolumeToDelete, + ) -> Result { + use db::schema::volume::dsl as volume_dsl; + use db::schema::volume_resource_usage::dsl as ru_dsl; - // Does it look like this replacement already happened? 
- let old_region_in_vcr = match region_in_vcr(&old_vcr, &existing.region_addr) { - Ok(v) => v, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::RegionReplacementError(e))); - }, - }; - let new_region_in_vcr = match region_in_vcr(&old_vcr, &replacement.region_addr) { - Ok(v) => v, - Err(e) => { - return Err(err.bail(VolumeReplaceRegionError::RegionReplacementError(e))); - }, - }; + // Grab the old volume first + let maybe_old_volume = { + volume_dsl::volume + .filter(volume_dsl::id.eq(volume_id.0)) + .select(Volume::as_select()) + .first_async::(conn) + .await + .optional() + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })? + }; - if !old_region_in_vcr && new_region_in_vcr { - // It does seem like the replacement happened - return Ok(VolumeReplaceResult::AlreadyHappened); - } + let old_volume = if let Some(old_volume) = maybe_old_volume { + old_volume + } else { + // Existing volume was hard-deleted, so return here. We can't + // perform the region replacement now, and this will short-circuit + // the rest of the process. - use db::schema::region::dsl as region_dsl; - use db::schema::volume::dsl as volume_dsl; + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + }; - // Set the existing region's volume id to the replacement's - // volume id - diesel::update(region_dsl::region) - .filter(region_dsl::id.eq(existing.region_id)) - .set(region_dsl::volume_id.eq(replacement.volume_id)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + if old_volume.time_deleted.is_some() { + // Existing volume was soft-deleted, so return here for the same + // reason: the region replacement process should be short-circuited + // now. + return Ok(VolumeReplaceResult::ExistingVolumeDeleted); + } - // Set the replacement region's volume id to the existing's - // volume id - diesel::update(region_dsl::region) - .filter(region_dsl::id.eq(replacement.region_id)) - .set(region_dsl::volume_id.eq(existing.volume_id)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + let old_vcr: VolumeConstructionRequest = + match serde_json::from_str(&old_volume.data()) { + Ok(vcr) => vcr, + Err(e) => { + return Err(err.bail(ReplaceSnapshotError::SerdeError(e))); + } + }; - // Update the existing volume's construction request to - // replace the existing region's SocketAddrV6 with the - // replacement region's - - // Copy the old volume's VCR, changing out the old region - // for the new. - let new_vcr = match replace_region_in_vcr( - &old_vcr, - existing.region_addr, - replacement.region_addr, - ) { - Ok(new_vcr) => new_vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceRegionError::RegionReplacementError(e) - )); - } - }; + // Does it look like this replacement already happened? 
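+        // (If only the replacement target is present, this call is treated
+        // as an idempotent retry and returns `AlreadyHappened` below.)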
+ let old_target_in_vcr = + match read_only_target_in_vcr(&old_vcr, &existing.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + ReplaceSnapshotError::SnapshotReplacementError(e), + )); + } + }; - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - err.bail(VolumeReplaceRegionError::SerdeError(e)) - })?; + let new_target_in_vcr = + match read_only_target_in_vcr(&old_vcr, &replacement.0) { + Ok(v) => v, + Err(e) => { + return Err(err.bail( + ReplaceSnapshotError::SnapshotReplacementError(e), + )); + } + }; - // Update the existing volume's data - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(existing.volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceRegionError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; + if !old_target_in_vcr && new_target_in_vcr { + // It does seem like the replacement happened + return Ok(VolumeReplaceResult::AlreadyHappened); + } - Ok(VolumeReplaceResult::Done) - } + // Update the existing volume's construction request to replace the + // existing target's SocketAddrV6 with the replacement target's + + // Copy the old volume's VCR, changing out the old target for the new. + let (new_vcr, replacements) = match replace_read_only_target_in_vcr( + &old_vcr, + existing, + replacement, + ) { + Ok(new_vcr) => new_vcr, + Err(e) => { + return Err( + err.bail(ReplaceSnapshotError::SnapshotReplacementError(e)) + ); + } + }; + + // Expect that this only happened once. If it happened multiple times, + // question everything: how would a snapshot be used twice?! + + if replacements != 1 { + return Err(err.bail( + ReplaceSnapshotError::UnexpectedReplacedTargets( + replacements, + 1, + ), + )); + } + + let new_volume_data = serde_json::to_string(&new_vcr) + .map_err(|e| err.bail(ReplaceSnapshotError::SerdeError(e)))?; + + // Update the existing volume's data + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id.0)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + + // Make a new VCR that will stash the target to delete. The values here + // don't matter, just that it gets fed into the volume_delete machinery + // later. 
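+        // (The usage record for the existing target is swapped over to the
+        // volume-to-delete further down, so deleting that volume later
+        // releases the old resource.)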
+        let vcr = VolumeConstructionRequest::Volume {
+            id: volume_to_delete_id.0,
+            block_size: 512,
+            sub_volumes: vec![VolumeConstructionRequest::Region {
+                block_size: 512,
+                blocks_per_extent: 1,
+                extent_count: 1,
+                gen: 1,
+                opts: sled_agent_client::types::CrucibleOpts {
+                    id: volume_to_delete_id.0,
+                    target: vec![existing.0.to_string()],
+                    lossy: false,
+                    flush_timeout: None,
+                    key: None,
+                    cert_pem: None,
+                    key_pem: None,
+                    root_cert_pem: None,
+                    control: None,
+                    read_only: true,
+                },
+            }],
+            read_only_parent: None,
+        };
+
+        let volume_data = serde_json::to_string(&vcr)
+            .map_err(|e| err.bail(ReplaceSnapshotError::SerdeError(e)))?;
+
+        // Update the volume to delete data
+        let num_updated = diesel::update(volume_dsl::volume)
+            .filter(volume_dsl::id.eq(volume_to_delete_id.0))
+            .filter(volume_dsl::time_deleted.is_null())
+            .set(volume_dsl::data.eq(volume_data))
+            .execute_async(conn)
+            .await?;
+
+        if num_updated != 1 {
+            return Err(err.bail(
+                ReplaceSnapshotError::UnexpectedDatabaseUpdate(num_updated, 1),
+            ));
+        }
+
+        // Update the appropriate volume resource usage records - it could
+        // either be a read-only region or a region snapshot, so determine what
+        // it is first
+
+        let maybe_existing_usage =
+            Self::read_only_target_to_volume_resource_usage(conn, &existing.0)
+                .await?;
+
+        let Some(existing_usage) = maybe_existing_usage else {
+            return Err(err.bail(ReplaceSnapshotError::CouldNotFindResource(
+                format!("could not find resource for {}", existing.0,),
+            )));
+        };
+
+        // The "existing" target moved into the volume to delete
+
+        Self::swap_volume_usage_records_for_resources(
+            conn,
+            existing_usage,
+            volume_id.0,
+            volume_to_delete_id.0,
+        )
+        .await
+        .map_err(|e| {
+            err.bail_retryable_or_else(e, |e| {
+                ReplaceSnapshotError::Public(public_error_from_diesel(
+                    e,
+                    ErrorHandler::Server,
+                ))
+            })
+        })?;
+
+        let maybe_replacement_usage =
+            Self::read_only_target_to_volume_resource_usage(
+                conn,
+                &replacement.0,
+            )
+            .await?;
+
+        let Some(replacement_usage) = maybe_replacement_usage else {
+            return Err(err.bail(ReplaceSnapshotError::CouldNotFindResource(
+                format!("could not find resource for {}", replacement.0,),
+            )));
+        };
+
+        // The intention leaving this transaction is that the correct volume
+        // resource usage records exist, so:
+        //
+        // - if no usage record existed for the replacement usage, then create a
+        //   new record that points to the volume id (this can happen if the
+        //   volume to delete was blank when coming into this function)
+        //
+        // - if records exist for the "replacement" usage, then one of those
+        //   will match the volume to delete id, so perform a swap instead to
+        //   the volume id
+
+        let existing_replacement_volume_usage_records =
+            Self::volume_usage_records_for_resource_query(
+                replacement_usage.clone(),
+            )
+            .load_async(conn)
+            .await
+            .map_err(|e| {
+                err.bail_retryable_or_else(e, |e| {
+                    ReplaceSnapshotError::Public(public_error_from_diesel(
+                        e,
+                        ErrorHandler::Server,
+                    ))
+                })
+            })?
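+            // Count how many of those records already belong to the
+            // volume-to-delete: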
+ // TODO be smart enough to .filter the above query + .into_iter() + .filter(|record| record.volume_id == volume_to_delete_id.0) + .count(); + + // The "replacement" target moved into the volume + + if existing_replacement_volume_usage_records == 0 { + // No matching record + let new_record = + VolumeResourceUsageRecord::new(volume_id.0, replacement_usage); + + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(new_record) + .execute_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + } else if existing_replacement_volume_usage_records == 1 { + // One matching record: perform swap + Self::swap_volume_usage_records_for_resources( + conn, + replacement_usage, + volume_to_delete_id.0, + volume_id.0, + ) .await .map_err(|e| { - if let Some(err) = err.take() { - match err { - VolumeReplaceRegionError::Public(e) => e, + err.bail_retryable_or_else(e, |e| { + ReplaceSnapshotError::Public(public_error_from_diesel( + e, + ErrorHandler::Server, + )) + }) + })?; + } else { + // More than one matching record! + return Err(err.bail( + ReplaceSnapshotError::MultipleResourceUsageRecords(format!( + "{replacement_usage:?}" + )), + )); + } - VolumeReplaceRegionError::SerdeError(_) => { - Error::internal_error(&err.to_string()) - } + // After region snapshot replacement, validate invariants for all + // volumes + #[cfg(any(test, feature = "testing"))] + Self::validate_volume_invariants(conn).await?; - VolumeReplaceRegionError::RegionReplacementError(_) => { - Error::internal_error(&err.to_string()) - } - } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + Ok(VolumeReplaceResult::Done) } /// Replace a read-only target in a Volume with a new region @@ -2226,235 +3273,46 @@ impl DataStore { replacement: ReplacementTarget, volume_to_delete_id: VolumeToDelete, ) -> Result { - #[derive(Debug, thiserror::Error)] - enum VolumeReplaceSnapshotError { - #[error("Error from Volume snapshot replacement: {0}")] - Public(Error), - - #[error("Serde error during Volume snapshot replacement: {0}")] - SerdeError(#[from] serde_json::Error), - - #[error("Snapshot replacement error: {0}")] - SnapshotReplacementError(#[from] anyhow::Error), - - #[error("Replaced {0} targets, expected {1}")] - UnexpectedReplacedTargets(usize, usize), - - #[error("Updated {0} database rows, expected {1}")] - UnexpectedDatabaseUpdate(usize, usize), - } let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("volume_replace_snapshot") .transaction(&conn, |conn| { let err = err.clone(); - async move { - use db::schema::volume::dsl as volume_dsl; - - // Grab the old volume first - let maybe_old_volume = { - volume_dsl::volume - .filter(volume_dsl::id.eq(volume_id.0)) - .select(Volume::as_select()) - .first_async::(&conn) - .await - .optional() - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })? - }; - - let old_volume = if let Some(old_volume) = maybe_old_volume { - old_volume - } else { - // Existing volume was hard-deleted, so return here. We - // can't perform the region replacement now, and this - // will short-circuit the rest of the process. 
- return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - }; - - if old_volume.time_deleted.is_some() { - // Existing volume was soft-deleted, so return here for - // the same reason: the region replacement process - // should be short-circuited now. - return Ok(VolumeReplaceResult::ExistingVolumeDeleted); - } - - let old_vcr: VolumeConstructionRequest = - match serde_json::from_str(&old_volume.data()) { - Ok(vcr) => vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SerdeError(e) - )); - }, - }; - - // Does it look like this replacement already happened? - let old_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &existing.0) { - Ok(v) => v, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - }, - }; - - let new_target_in_vcr = match read_only_target_in_vcr(&old_vcr, &replacement.0) { - Ok(v) => v, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - }, - }; - - if !old_target_in_vcr && new_target_in_vcr { - // It does seem like the replacement happened - return Ok(VolumeReplaceResult::AlreadyHappened); - } - - // Update the existing volume's construction request to - // replace the existing target's SocketAddrV6 with the - // replacement target's - - // Copy the old volume's VCR, changing out the old target - // for the new. - let (new_vcr, replacements) = match replace_read_only_target_in_vcr( - &old_vcr, + async move { + Self::volume_replace_snapshot_in_txn( + &conn, + err, + volume_id, existing, replacement, - ) { - Ok(new_vcr) => new_vcr, - Err(e) => { - return Err(err.bail( - VolumeReplaceSnapshotError::SnapshotReplacementError(e) - )); - } - }; - - // Expect that this only happened once. If it happened - // multiple times, question everything: how would a snapshot - // be used twice?! - - if replacements != 1 { - return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedReplacedTargets( - replacements, 1, - ) - )); - } - - let new_volume_data = serde_json::to_string( - &new_vcr, + volume_to_delete_id, ) - .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; - - // Update the existing volume's data - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id.0)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeReplaceSnapshotError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - ) - }) - })?; - - // Make a new VCR that will stash the target to delete. The - // values here don't matter, just that it gets fed into the - // volume_delete machinery later. 
- let vcr = VolumeConstructionRequest::Volume { - id: volume_to_delete_id.0, - block_size: 512, - sub_volumes: vec![ - VolumeConstructionRequest::Region { - block_size: 512, - blocks_per_extent: 1, - extent_count: 1, - gen: 1, - opts: sled_agent_client::types::CrucibleOpts { - id: volume_to_delete_id.0, - target: vec![ - existing.0.to_string(), - ], - lossy: false, - flush_timeout: None, - key: None, - cert_pem: None, - key_pem: None, - root_cert_pem: None, - control: None, - read_only: true, - }, - } - ], - read_only_parent: None, - }; - - let volume_data = serde_json::to_string(&vcr) - .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; - - // Update the volume to delete data - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_to_delete_id.0)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(volume_data)) - .execute_async(&conn) - .await?; - - if num_updated != 1 { - return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( - num_updated, 1, - ) - )); - } - - Ok(VolumeReplaceResult::Done) + .await } }) .await .map_err(|e| { if let Some(err) = err.take() { match err { - VolumeReplaceSnapshotError::Public(e) => e, - - VolumeReplaceSnapshotError::SerdeError(_) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::SnapshotReplacementError(_) => { - Error::internal_error(&err.to_string()) - } + ReplaceSnapshotError::Public(e) => e, - VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) => { - Error::internal_error(&err.to_string()) - } + ReplaceSnapshotError::SerdeError(_) + | ReplaceSnapshotError::SnapshotReplacementError(_) + | ReplaceSnapshotError::UnexpectedReplacedTargets( + _, + _, + ) + | ReplaceSnapshotError::UnexpectedDatabaseUpdate( + _, + _, + ) + | ReplaceSnapshotError::AddressParseError(_) + | ReplaceSnapshotError::CouldNotFindResource(_) + | ReplaceSnapshotError::MultipleResourceUsageRecords( + _, + ) => Error::internal_error(&err.to_string()), } } else { public_error_from_diesel(e, ErrorHandler::Server) @@ -2463,7 +3321,7 @@ impl DataStore { } } -/// Return the targets from a VolumeConstructionRequest. +/// Return the read-only targets from a VolumeConstructionRequest. /// /// The targets of a volume construction request map to resources. pub fn read_only_resources_associated_with_volume( @@ -2508,6 +3366,67 @@ pub fn read_only_resources_associated_with_volume( } } +/// Return the read-write targets from a VolumeConstructionRequest. +/// +/// The targets of a volume construction request map to resources. +pub fn read_write_resources_associated_with_volume( + vcr: &VolumeConstructionRequest, + targets: &mut Vec, +) { + let mut parts: VecDeque<&VolumeConstructionRequest> = VecDeque::new(); + parts.push_back(&vcr); + + while let Some(vcr_part) = parts.pop_front() { + match vcr_part { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + for sub_volume in sub_volumes { + parts.push_back(sub_volume); + } + + // No need to look under read-only parent + } + + VolumeConstructionRequest::Url { .. } => { + // no action required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if !opts.read_only { + for target in &opts.target { + targets.push(target.clone()); + } + } + } + + VolumeConstructionRequest::File { .. 
} => { + // no action required + } + } + } +} + +/// Return the number of read-write subvolumes in a VolumeConstructionRequest. +pub fn count_read_write_sub_volumes( + vcr: &VolumeConstructionRequest, +) -> anyhow::Result { + Ok(match vcr { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + sub_volumes.len() + } + + VolumeConstructionRequest::Url { .. } => 0, + + VolumeConstructionRequest::Region { .. } => { + // We don't support a pure Region VCR at the volume + // level in the database, so this choice should + // never be encountered. + bail!("Region not supported as a top level volume"); + } + + VolumeConstructionRequest::File { .. } => 0, + }) +} + /// Returns true if the sub-volumes of a Volume are all read-only pub fn volume_is_read_only( vcr: &VolumeConstructionRequest, @@ -2535,7 +3454,7 @@ pub fn volume_is_read_only( // We don't support a pure Region VCR at the volume // level in the database, so this choice should // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); + bail!("Region not supported as a top level volume"); } VolumeConstructionRequest::File { .. } => { @@ -2713,66 +3632,253 @@ fn find_matching_rw_regions_in_volume( } } - VolumeConstructionRequest::Url { .. } => { - // nothing required + VolumeConstructionRequest::Url { .. } => { + // nothing required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if !opts.read_only { + for target in &opts.target { + let parsed_target: SocketAddrV6 = target.parse()?; + if parsed_target.ip() == ip { + matched_targets.push(parsed_target); + } + } + } + } + + VolumeConstructionRequest::File { .. } => { + // nothing required + } + } + } + + Ok(()) +} + +impl DataStore { + pub async fn find_volumes_referencing_socket_addr( + &self, + opctx: &OpContext, + address: SocketAddr, + ) -> ListResultVec { + opctx.check_complex_operations_allowed()?; + + let mut volumes = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = self.pool_connection_authorized(opctx).await?; + + let needle = address.to_string(); + + while let Some(p) = paginator.next() { + use db::schema::volume::dsl; + + let haystack = + paginated(dsl::volume, dsl::id, &p.current_pagparams()) + .select(Volume::as_select()) + .get_results_async::(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for volume in haystack { + if volume.data().contains(&needle) { + volumes.push(volume); + } + } + } + + Ok(volumes) + } +} + +// Add some validation that runs only for tests +#[cfg(any(test, feature = "testing"))] +impl DataStore { + fn volume_invariant_violated(msg: String) -> diesel::result::Error { + diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::CheckViolation, + Box::new(msg), + ) + } + + /// Tests each Volume to see if invariants hold + /// + /// If an invariant is violated, this function returns a `CheckViolation` + /// with the text of what invariant was violated. 
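+    ///
+    /// Note that the enclosing impl block is gated on
+    /// `cfg(any(test, feature = "testing"))`, so these checks are compiled
+    /// out of production builds.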
+ pub(crate) async fn validate_volume_invariants( + conn: &async_bb8_diesel::Connection, + ) -> Result<(), diesel::result::Error> { + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + + while let Some(p) = paginator.next() { + use db::schema::volume::dsl; + let haystack = + paginated(dsl::volume, dsl::id, &p.current_pagparams()) + .select(Volume::as_select()) + .get_results_async::(conn) + .await?; + + paginator = p.found_batch(&haystack, &|v| v.id()); + + for volume in haystack { + Self::validate_volume_has_all_resources(&conn, volume).await?; + } + } + + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + + while let Some(p) = paginator.next() { + use db::schema::region::dsl; + let haystack = + paginated(dsl::region, dsl::id, &p.current_pagparams()) + .select(Region::as_select()) + .get_results_async::(conn) + .await?; + + paginator = p.found_batch(&haystack, &|r| r.id()); + + for region in haystack { + Self::validate_read_only_region_has_no_snapshots(&conn, region) + .await?; + } + } + + Ok(()) + } + + /// Assert that the resources that comprise non-deleted volumes have not + /// been prematurely deleted. + async fn validate_volume_has_all_resources( + conn: &async_bb8_diesel::Connection, + volume: Volume, + ) -> Result<(), diesel::result::Error> { + if volume.time_deleted.is_some() { + // Do not need to validate resources for soft-deleted volumes + return Ok(()); + } + + let vcr: VolumeConstructionRequest = + serde_json::from_str(&volume.data()).unwrap(); + + // validate all read/write resources still exist + + let num_read_write_subvolumes = match count_read_write_sub_volumes(&vcr) + { + Ok(v) => v, + Err(e) => { + return Err(Self::volume_invariant_violated(format!( + "volume {} had error: {e}", + volume.id(), + ))); } + }; - VolumeConstructionRequest::Region { opts, .. } => { - if !opts.read_only { - for target in &opts.target { - let parsed_target: SocketAddrV6 = target.parse()?; - if parsed_target.ip() == ip { - matched_targets.push(parsed_target); - } - } + let mut read_write_targets = Vec::with_capacity( + REGION_REDUNDANCY_THRESHOLD * num_read_write_subvolumes, + ); + + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let target = match target.parse() { + Ok(t) => t, + Err(e) => { + return Err(Self::volume_invariant_violated(format!( + "could not parse {target}: {e}" + ))); } - } + }; - VolumeConstructionRequest::File { .. 
} => { - // nothing required - } + let maybe_region = DataStore::target_to_region( + conn, + &target, + RegionType::ReadWrite, + ) + .await?; + + let Some(_region) = maybe_region else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {target}" + ))); + }; } - } - Ok(()) -} + // validate all read-only resources still exist -impl DataStore { - pub async fn find_volumes_referencing_socket_addr( - &self, - opctx: &OpContext, - address: SocketAddr, - ) -> ListResultVec { - opctx.check_complex_operations_allowed()?; + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &vcr, + &mut crucible_targets, + ); + crucible_targets + }; - let mut volumes = Vec::new(); - let mut paginator = Paginator::new(SQL_BATCH_SIZE); - let conn = self.pool_connection_authorized(opctx).await?; + for read_only_target in &crucible_targets.read_only_targets { + let read_only_target = read_only_target.parse().map_err(|e| { + Self::volume_invariant_violated(format!( + "could not parse {read_only_target}: {e}" + )) + })?; - let needle = address.to_string(); + let maybe_usage = + DataStore::read_only_target_to_volume_resource_usage( + conn, + &read_only_target, + ) + .await?; - while let Some(p) = paginator.next() { - use db::schema::volume::dsl; + let Some(_usage) = maybe_usage else { + return Err(Self::volume_invariant_violated(format!( + "could not find resource for {read_only_target}" + ))); + }; + } - let haystack = - paginated(dsl::volume, dsl::id, &p.current_pagparams()) - .select(Volume::as_select()) - .get_results_async::(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + Ok(()) + } - paginator = p.found_batch(&haystack, &|r| r.id()); + /// Assert that read-only regions do not have any associated region + /// snapshots (see associated comment in `soft_delete_volume_in_txn`) + async fn validate_read_only_region_has_no_snapshots( + conn: &async_bb8_diesel::Connection, + region: Region, + ) -> Result<(), diesel::result::Error> { + if !region.read_only() { + return Ok(()); + } - for volume in haystack { - if volume.data().contains(&needle) { - volumes.push(volume); - } - } + use db::schema::volume_resource_usage::dsl; + + let matching_usage_records: Vec = + dsl::volume_resource_usage + .filter( + dsl::usage_type.eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_region_id.eq(region.id())) + .select(VolumeResourceUsageRecord::as_select()) + .get_results_async(conn) + .await? 
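+                // (each raw row converts back into the typed
+                // `VolumeResourceUsage` used in the error message below)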
+ .into_iter() + .map(|r| r.try_into().unwrap()) + .collect(); + + if !matching_usage_records.is_empty() { + return Err(Self::volume_invariant_violated(format!( + "read-only region {} has matching usage records: {:?}", + region.id(), + matching_usage_records, + ))); } - Ok(volumes) + Ok(()) } } @@ -2780,7 +3886,13 @@ impl DataStore { mod tests { use super::*; + use crate::db::datastore::test::TestDatasets; + use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; use crate::db::pub_test_utils::TestDatabase; + use nexus_config::RegionAllocationStrategy; + use nexus_db_model::SqlU16; + use nexus_types::external_api::params::DiskSource; + use omicron_common::api::external::ByteCount; use omicron_test_utils::dev; use sled_agent_client::types::CrucibleOpts; @@ -2811,8 +3923,8 @@ mod tests { .await .unwrap(); - // Add old CrucibleResources json in the `resources_to_clean_up` column - - // this was before the `deleting` column / field was added to + // Add old CrucibleResources json in the `resources_to_clean_up` column + // - this was before the `deleting` column / field was added to // ResourceSnapshot. { @@ -2894,47 +4006,82 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_region"); let log = logctx.log.new(o!()); let db = TestDatabase::new_with_datastore(&log).await; + let opctx = db.opctx(); let datastore = db.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); - // Insert four Region records (three, plus one additionally allocated) + let _test_datasets = TestDatasets::create( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); - let new_volume_id = Uuid::new_v4(); - - let mut region_and_volume_ids = [ - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), new_volume_id), - ]; + let volume_to_delete_id = Uuid::new_v4(); - { - let conn = datastore.pool_connection_for_tests().await.unwrap(); + let datasets_and_regions = datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); - for i in 0..4 { - let (_, volume_id) = region_and_volume_ids[i]; + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); - let region = Region::new( - Uuid::new_v4(), // dataset id - volume_id, - 512_i64.try_into().unwrap(), - 10, - 10, - 10001, - false, - ); + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); - region_and_volume_ids[i].0 = region.id(); + let address: SocketAddrV6 = + datastore.region_addr(region.id()).await.unwrap().unwrap(); - use nexus_db_model::schema::region::dsl; - diesel::insert_into(dsl::region) - .values(region.clone()) - .execute_async(&*conn) - .await - .unwrap(); - } + region_addresses.push(address.to_string()); } + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + 
region.extent_count(), + 111, + false, // read-write + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + let _volume = datastore .volume_create(nexus_db_model::Volume::new( volume_id, @@ -2949,9 +4096,10 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // target to replace - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + // target to replace + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -2972,26 +4120,19 @@ mod tests { // Replace one - let target = region_and_volume_ids[0]; - let replacement = region_and_volume_ids[3]; - let volume_replace_region_result = datastore .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, ) .await @@ -3018,9 +4159,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd55:1122:3344:101::1]:11111"), // replaced - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + replacement_region_addr.to_string(), // replaced + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3041,19 +4182,15 @@ mod tests { .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, ) .await @@ -3080,9 +4217,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // back to what it was - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), // back to what it was + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3107,13 +4244,127 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_snapshot"); let log = logctx.log.new(o!()); let db = TestDatabase::new_with_datastore(&log).await; + let opctx = db.opctx(); let datastore = db.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); - // Insert two volumes: one with the target to replace, and one temporary - // "volume to delete" that's blank. 
+ let _test_datasets = TestDatasets::create( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); let volume_to_delete_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); + + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); + + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); + + let address: SocketAddrV6 = + datastore.region_addr(region.id()).await.unwrap().unwrap(); + + region_addresses.push(address.to_string()); + } + + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + region.extent_count(), + 111, + true, // read-only + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + let region_snapshots = [ + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + ), + ]; + + datastore + .region_snapshot_create(region_snapshots[0].clone()) + .await + .unwrap(); + datastore + .region_snapshot_create(region_snapshots[1].clone()) + .await + .unwrap(); + datastore + .region_snapshot_create(region_snapshots[2].clone()) + .await + .unwrap(); + + // Insert two volumes: one with the target to replace, and one temporary + // "volume to delete" that's blank. Validate the pre-replacement volume + // resource usage records. 
+ let rop_id = Uuid::new_v4(); datastore @@ -3130,9 +4381,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3154,9 +4405,9 @@ mod tests { id: rop_id, target: vec![ // target to replace - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3175,6 +4426,22 @@ mod tests { .await .unwrap(); + for region_snapshot in ®ion_snapshots { + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + datastore .volume_create(nexus_db_model::Volume::new( volume_to_delete_id, @@ -3189,21 +4456,33 @@ mod tests { .await .unwrap(); + // `volume_create` above was called with a blank volume, so no usage + // record will have been created for the read-only region + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + // Do the replacement let volume_replace_snapshot_result = datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd00:1122:3344:104::1]:400".parse().unwrap()), - ReplacementTarget( - "[fd55:1122:3344:101::1]:111".parse().unwrap(), - ), + ExistingTarget(address_1.parse().unwrap()), + ReplacementTarget(replacement_region_addr), VolumeToDelete(volume_to_delete_id), ) .await .unwrap(); - assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done,); + assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done); // Ensure the shape of the resulting VCRs @@ -3225,9 +4504,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3249,9 +4528,9 @@ mod tests { id: rop_id, target: vec![ // target replaced - String::from("[fd55:1122:3344:101::1]:111"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + replacement_region_addr.to_string(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3291,7 +4570,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replaced target stashed here - String::from("[fd00:1122:3344:104::1]:400"), + address_1.clone(), ], lossy: false, flush_timeout: None, @@ -3307,15 +4586,54 @@ mod tests { }, ); + // Validate the post-replacement volume resource usage records + + for (i, region_snapshot) in region_snapshots.iter().enumerate() { + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, 
+ ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + + match i { + 0 => { + assert_eq!(usage[0].volume_id, volume_to_delete_id); + } + + 1 | 2 => { + assert_eq!(usage[0].volume_id, volume_id); + } + + _ => panic!("out of range"), + } + } + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + // Now undo the replacement. Note volume ID is not swapped. let volume_replace_snapshot_result = datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd55:1122:3344:101::1]:111".parse().unwrap()), - ReplacementTarget( - "[fd00:1122:3344:104::1]:400".parse().unwrap(), - ), + ExistingTarget(replacement_region_addr), + ReplacementTarget(address_1.parse().unwrap()), VolumeToDelete(volume_to_delete_id), ) .await @@ -3342,9 +4660,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3366,9 +4684,7 @@ mod tests { id: rop_id, target: vec![ // back to what it was - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1, address_2, address_3, ], lossy: false, flush_timeout: None, @@ -3408,7 +4724,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replacement stashed here - String::from("[fd55:1122:3344:101::1]:111"), + replacement_region_addr.to_string(), ], lossy: false, flush_timeout: None, @@ -3424,6 +4740,36 @@ mod tests { }, ); + // Validate the post-post-replacement volume resource usage records + + for region_snapshot in ®ion_snapshots { + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); + db.terminate().await; logctx.cleanup_successful(); } @@ -3438,6 +4784,41 @@ mod tests { let volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + )) + .await + .unwrap(); + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + )) + .await + .unwrap(); + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + )) + .await + .unwrap(); + // case where the needle is found datastore @@ -3456,9 +4837,9 @@ mod 
tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2, + address_3, ], lossy: false, flush_timeout: None, @@ -3480,7 +4861,7 @@ mod tests { let volumes = datastore .find_volumes_referencing_socket_addr( &opctx, - "[fd00:1122:3344:104::1]:400".parse().unwrap(), + address_1.parse().unwrap(), ) .await .unwrap(); @@ -4008,4 +5389,52 @@ mod tests { } ); } + + /// Assert that there are no "deleted" r/w regions found when the associated + /// volume hasn't been created yet. + #[tokio::test] + async fn test_no_find_deleted_region_for_no_volume() { + let logctx = + dev::test_setup_log("test_no_find_deleted_region_for_no_volume"); + let log = logctx.log.new(o!()); + let db = TestDatabase::new_with_datastore(&log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let _test_datasets = TestDatasets::create( + &opctx, + datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; + + let volume_id = Uuid::new_v4(); + + // Assert that allocating regions without creating the volume does not + // cause them to be returned as "deleted" regions, as this can cause + // sagas that allocate regions to race with the volume delete saga and + // cause premature region deletion. + + let _datasets_and_regions = datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); + + let deleted_regions = datastore + .find_deleted_volume_regions() + .await + .expect("find_deleted_volume_regions"); + + assert!(deleted_regions.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index 02800e3a3c..5f34c7cfb3 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -14,7 +14,6 @@ pub mod network_interface; pub mod oximeter; pub mod region_allocation; pub mod virtual_provisioning_collection_update; -pub mod volume; pub mod vpc; pub mod vpc_subnet; diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index efdc9a21fb..a9130d87f7 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -255,7 +255,8 @@ pub fn allocation_query( ").param().sql(" AS blocks_per_extent, ").param().sql(" AS extent_count, NULL AS port, - ").param().sql(" AS read_only + ").param().sql(" AS read_only, + FALSE as deleting FROM shuffled_candidate_datasets") // Only select the *additional* number of candidate regions for the required // redundancy level @@ -368,7 +369,7 @@ pub fn allocation_query( .sql(" inserted_regions AS ( INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only) + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting) SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql(" FROM candidate_regions WHERE diff --git a/nexus/db-queries/src/db/queries/volume.rs b/nexus/db-queries/src/db/queries/volume.rs deleted file mode 100644 index e7fe832a82..0000000000 --- 
a/nexus/db-queries/src/db/queries/volume.rs +++ /dev/null @@ -1,114 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Helper queries for working with volumes. - -use crate::db; -use crate::db::pool::DbConnection; -use diesel::expression::is_aggregate; -use diesel::expression::ValidGrouping; -use diesel::pg::Pg; -use diesel::query_builder::AstPass; -use diesel::query_builder::Query; -use diesel::query_builder::QueryFragment; -use diesel::query_builder::QueryId; -use diesel::sql_types; -use diesel::Column; -use diesel::Expression; -use diesel::QueryResult; -use diesel::RunQueryDsl; - -/// Produces a query fragment that will conditionally reduce the volume -/// references for region_snapshot rows whose snapshot_addr column is part of a -/// list. -/// -/// The output should look like: -/// -/// ```sql -/// update region_snapshot set -/// volume_references = volume_references - 1, -/// deleting = case when volume_references = 1 -/// then true -/// else false -/// end -/// where -/// snapshot_addr in ('a1', 'a2', 'a3') and -/// volume_references >= 1 and -/// deleting = false -/// returning * -/// ``` -#[must_use = "Queries must be executed"] -pub struct ConditionallyDecreaseReferences { - snapshot_addrs: Vec, -} - -impl ConditionallyDecreaseReferences { - pub fn new(snapshot_addrs: Vec) -> Self { - Self { snapshot_addrs } - } -} - -impl QueryId for ConditionallyDecreaseReferences { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} - -impl QueryFragment for ConditionallyDecreaseReferences { - fn walk_ast<'a>(&'a self, mut out: AstPass<'_, 'a, Pg>) -> QueryResult<()> { - use db::schema::region_snapshot::dsl; - - out.push_sql("UPDATE "); - dsl::region_snapshot.walk_ast(out.reborrow())?; - out.push_sql(" SET "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" - 1, "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = CASE WHEN "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = 1 THEN TRUE ELSE FALSE END WHERE "); - out.push_identifier(dsl::snapshot_addr::NAME)?; - out.push_sql(" IN ("); - - // If self.snapshot_addrs is empty, this query fragment will - // intentionally not update any region_snapshot rows. 
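The query fragment being deleted here kept a single `volume_references` counter per region snapshot and flipped `deleting` once that counter reached zero, keyed by snapshot address. Below is a minimal, std-only sketch of that old behavior (illustrative names only, not the datastore code), including the intentional no-op for an empty address list that the comment above calls out:

```rust
// Simplified, in-memory model of the reference counting the deleted SQL
// fragment expressed: decrement `volume_references` for every region
// snapshot whose address is in the given list, and mark the row `deleting`
// once the count reaches zero. Names and types are illustrative.

use std::collections::HashMap;

#[derive(Debug, Default, Clone)]
struct RegionSnapshotRow {
    volume_references: i64,
    deleting: bool,
}

/// Decrement references for each address in `snapshot_addrs`, skipping rows
/// that are already deleting or have no references left. An empty address
/// list intentionally updates nothing, mirroring the deleted fragment.
fn conditionally_decrease_references(
    table: &mut HashMap<String, RegionSnapshotRow>,
    snapshot_addrs: &[String],
) -> Vec<String> {
    let mut updated = Vec::new();
    for addr in snapshot_addrs {
        if let Some(row) = table.get_mut(addr) {
            if row.volume_references >= 1 && !row.deleting {
                row.volume_references -= 1;
                row.deleting = row.volume_references == 0;
                updated.push(addr.clone());
            }
        }
    }
    updated
}

fn main() {
    let mut table = HashMap::from([(
        String::from("[fd00:1122:3344:104::1]:400"),
        RegionSnapshotRow { volume_references: 1, deleting: false },
    )]);

    // One decrement drops the count to zero and marks the row deleting.
    let updated = conditionally_decrease_references(
        &mut table,
        &[String::from("[fd00:1122:3344:104::1]:400")],
    );
    assert_eq!(updated.len(), 1);
    assert!(table["[fd00:1122:3344:104::1]:400"].deleting);

    // An empty list is a no-op, as the deleted fragment's comment notes.
    assert!(conditionally_decrease_references(&mut table, &[]).is_empty());
}
```

The integer counter records how many uses exist but not which volumes hold them; the new `volume_resource_usage` table keeps one record per use instead.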
- for (i, snapshot_addr) in self.snapshot_addrs.iter().enumerate() { - out.push_bind_param::(snapshot_addr)?; - if i == self.snapshot_addrs.len() - 1 { - out.push_sql(" "); - } else { - out.push_sql(", "); - } - } - - out.push_sql(") AND "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" >= 1 AND "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = false RETURNING *"); - - Ok(()) - } -} - -impl Expression for ConditionallyDecreaseReferences { - type SqlType = sql_types::Array; -} - -impl ValidGrouping - for ConditionallyDecreaseReferences -{ - type IsAggregate = is_aggregate::Never; -} - -impl RunQueryDsl for ConditionallyDecreaseReferences {} - -type SelectableSql = < - >::SelectExpression as diesel::Expression ->::SqlType; - -impl Query for ConditionallyDecreaseReferences { - type SqlType = SelectableSql; -} diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql index 9ee71b403f..f7b5354b42 100644 --- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -101,7 +102,8 @@ WITH $9 AS blocks_per_extent, $10 AS extent_count, NULL AS port, - $11 AS read_only + $11 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -211,7 +213,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -223,7 +226,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -238,7 +242,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -301,7 +306,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -331,7 +337,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index 369410c68c..9083967714 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -99,7 +100,8 @@ WITH $8 AS blocks_per_extent, $9 AS extent_count, NULL AS port, - $10 AS read_only + $10 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -209,7 +211,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -221,7 +224,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + 
candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -236,7 +240,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -299,7 +304,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -329,7 +335,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql index 9251139c4e..589d6976a8 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -112,7 +113,8 @@ WITH $10 AS blocks_per_extent, $11 AS extent_count, NULL AS port, - $12 AS read_only + $12 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -222,7 +224,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -234,7 +237,8 @@ WITH candidate_regions.blocks_per_extent, candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -249,7 +253,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -312,7 +317,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -342,7 +348,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql index c8aa8adf2e..3e1b047773 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -11,7 +11,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting FROM region WHERE @@ -110,7 +111,8 @@ WITH $9 AS blocks_per_extent, $10 AS extent_count, NULL AS port, - $11 AS read_only + $11 AS read_only, + false AS deleting FROM shuffled_candidate_datasets LIMIT @@ -220,7 +222,8 @@ WITH blocks_per_extent, extent_count, port, - read_only + read_only, + deleting ) SELECT candidate_regions.id, @@ -232,7 +235,8 @@ WITH candidate_regions.blocks_per_extent, 
candidate_regions.extent_count, candidate_regions.port, - candidate_regions.read_only + candidate_regions.read_only, + candidate_regions.deleting FROM candidate_regions WHERE @@ -247,7 +251,8 @@ WITH region.blocks_per_extent, region.extent_count, region.port, - region.read_only + region.read_only, + region.deleting ), updated_datasets AS ( @@ -310,7 +315,8 @@ WITH old_regions.blocks_per_extent, old_regions.extent_count, old_regions.port, - old_regions.read_only + old_regions.read_only, + old_regions.deleting FROM old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id ) @@ -340,7 +346,8 @@ UNION inserted_regions.blocks_per_extent, inserted_regions.extent_count, inserted_regions.port, - inserted_regions.read_only + inserted_regions.read_only, + inserted_regions.deleting FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 963f0bdcac..ad39777054 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -778,8 +778,7 @@ impl BackgroundTasksInitializer { "detect if region snapshots need replacement and begin the \ process", period: config.region_snapshot_replacement_start.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new(RegionSnapshotReplacementDetector::disabled( + task_impl: Box::new(RegionSnapshotReplacementDetector::new( datastore.clone(), sagas.clone(), )), @@ -795,13 +794,10 @@ impl BackgroundTasksInitializer { period: config .region_snapshot_replacement_garbage_collection .period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementGarbageCollect::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementGarbageCollect::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_garbage_collection, @@ -813,13 +809,10 @@ impl BackgroundTasksInitializer { "detect what volumes were affected by a region snapshot \ replacement, and run the step saga for them", period: config.region_snapshot_replacement_step.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFindAffected::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementFindAffected::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_step, @@ -831,10 +824,9 @@ impl BackgroundTasksInitializer { "complete a region snapshot replacement if all the steps are \ done", period: config.region_snapshot_replacement_finish.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFinishDetector::disabled(datastore), - ), + task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( + datastore, + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_finish, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 83078cb978..caa2fa7bed 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs 
@@ -19,17 +19,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, - disabled: bool, } impl RegionSnapshotReplacementFinishDetector { - #[allow(dead_code)] pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: false } - } - - pub fn disabled(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: true } + RegionSnapshotReplacementFinishDetector { datastore } } async fn transition_requests_to_done( @@ -159,10 +153,6 @@ impl BackgroundTask for RegionSnapshotReplacementFinishDetector { async move { let mut status = RegionSnapshotReplacementFinishStatus::default(); - if self.disabled { - return json!(status); - } - self.transition_requests_to_done(opctx, &mut status).await; json!(status) diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index eb171fda12..f3b1b68198 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -22,28 +22,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementGarbageCollect { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementGarbageCollect { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementGarbageCollect { datastore, sagas } } async fn send_garbage_collect_request( @@ -152,10 +135,6 @@ impl BackgroundTask for RegionSnapshotReplacementGarbageCollect { let mut status = RegionSnapshotReplacementGarbageCollectStatus::default(); - if self.disabled { - return json!(status); - } - self.clean_up_region_snapshot_replacement_volumes( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 8fd1e55975..bc739ecf27 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -29,20 +29,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementDetector { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementDetector { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: false } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: true } + RegionSnapshotReplacementDetector { datastore, sagas } } async fn send_start_request( @@ -246,10 +237,6 @@ impl BackgroundTask for RegionSnapshotReplacementDetector { async { let mut status = RegionSnapshotReplacementStartStatus::default(); - if self.disabled { - return json!(status); - } - self.create_requests_for_region_snapshots_on_expunged_disks( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index da05500a58..29878364e6 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ 
b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -42,28 +42,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFindAffected { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementFindAffected { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementFindAffected { datastore, sagas } } async fn send_start_request( @@ -233,14 +216,16 @@ impl RegionSnapshotReplacementFindAffected { Ok(Some(region_snapshot)) => region_snapshot, Ok(None) => { + // If the associated region snapshot was deleted, then there + // are no more volumes that reference it. This is not an + // error! Continue processing the other requests. let s = format!( - "region snapshot {} {} {} not found!", + "region snapshot {} {} {} not found", request.old_dataset_id, request.old_region_id, request.old_snapshot_id, ); - error!(&log, "{s}"); - status.errors.push(s); + info!(&log, "{s}"); continue; } @@ -452,10 +437,6 @@ impl BackgroundTask for RegionSnapshotReplacementFindAffected { async move { let mut status = RegionSnapshotReplacementStepStatus::default(); - if self.disabled { - return json!(status); - } - // Importantly, clean old steps up before finding affected volumes! // Otherwise, will continue to find the snapshot in volumes to // delete, and will continue to see conflicts in next function. @@ -500,6 +481,19 @@ mod test { ) -> Uuid { let new_volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + snapshot_addr.clone(), + )) + .await + .unwrap(); + let volume_construction_request = VolumeConstructionRequest::Volume { id: new_volume_id, block_size: 0, diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index d2e3053668..7efe70da77 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -934,6 +934,34 @@ pub(crate) mod test { .is_none() } + async fn no_volume_resource_usage_records_exist( + datastore: &DataStore, + ) -> bool { + use nexus_db_queries::db::schema::volume_resource_usage::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + let rows = datastore + .transaction_retry_wrapper("no_volume_resource_usage_records_exist") + .transaction(&conn, |conn| async move { + conn.batch_execute_async( + nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, + ) + .await + .unwrap(); + + Ok(dsl::volume_resource_usage + .count() + .get_result_async::(&conn) + .await + .unwrap()) + }) + .await + .unwrap(); + + rows == 0 + } + async fn no_virtual_provisioning_resource_records_exist( datastore: &DataStore, ) -> bool { @@ -1030,6 +1058,7 @@ pub(crate) mod test { .await; assert!(no_disk_records_exist(datastore).await); assert!(no_volume_records_exist(datastore).await); + assert!(no_volume_resource_usage_records_exist(datastore).await); assert!( no_virtual_provisioning_resource_records_exist(datastore).await ); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 6d56732388..4e6c3e1e16 100644 --- 
a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -58,7 +58,6 @@ use crate::app::db::datastore::ReplacementTarget; use crate::app::db::datastore::VolumeReplaceResult; use crate::app::db::datastore::VolumeToDelete; use crate::app::db::datastore::VolumeWithTarget; -use crate::app::db::lookup::LookupPath; use crate::app::sagas::common_storage::find_only_new_region; use crate::app::sagas::declare_saga_actions; use crate::app::RegionAllocationStrategy; @@ -242,12 +241,21 @@ async fn rsrss_get_alloc_region_params( ); // Look up the existing snapshot - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + // Find the region to replace let db_region = osagactx .datastore() @@ -480,12 +488,22 @@ async fn rsrss_get_old_snapshot_volume_id( &params.serialized_authn, ); - let (.., db_snapshot) = LookupPath::new(&opctx, &osagactx.datastore()) - .snapshot_id(params.request.old_snapshot_id) - .fetch() + // Look up the existing snapshot + let maybe_db_snapshot = osagactx + .datastore() + .snapshot_get(&opctx, params.request.old_snapshot_id) .await .map_err(ActionError::action_failed)?; + let Some(db_snapshot) = maybe_db_snapshot else { + return Err(ActionError::action_failed(Error::internal_error( + &format!( + "snapshot {} was hard deleted!", + params.request.old_snapshot_id + ), + ))); + }; + Ok(db_snapshot.volume_id) } @@ -510,6 +528,13 @@ async fn rsrss_create_fake_volume( gen: 0, opts: CrucibleOpts { id: new_volume_id, + // Do not put the new region ID here: it will be deleted during + // the associated garbage collection saga if + // `volume_replace_snapshot` does not perform the swap (which + // happens when the snapshot volume is deleted) and we still + // want it to exist when performing other replacement steps. If + // the replacement does occur, then the old address will be swapped + // in here. target: vec![], lossy: false, flush_timeout: None, @@ -673,14 +698,15 @@ async fn rsrss_replace_snapshot_in_volume( } VolumeReplaceResult::ExistingVolumeDeleted => { - // Unwind the saga here to clean up the resources allocated during - // this saga. The associated background task will transition this - // request's state to Completed. - - Err(ActionError::action_failed(format!( - "existing volume {} deleted", - replacement_params.old_volume_id - ))) + // If the snapshot volume was deleted, we still want to proceed with + // replacing the rest of the uses of the region snapshot. Note this + // also covers the case where this saga node runs (performing the + // replacement), the executor crashes before it can record that + // success, and then before this node is rerun the snapshot is + // deleted. If this saga unwound here, that would violate the + // property of idempotency.
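The new success arm above is the idempotency argument in miniature: the node may be re-run after a crash that happened between doing the work and recording the result, so it must converge to the same outcome whether the volume still holds the old target, already holds the new one, or has been deleted entirely. A minimal sketch of that shape (illustrative types, not the real saga machinery):

```rust
// Illustrative only: an idempotent "replace target" step where every
// re-execution, including one that finds the volume already deleted,
// succeeds. Only an unexpected target is treated as an error.

#[derive(Debug)]
enum SnapshotVolume {
    Present { target: String },
    Deleted,
}

fn ensure_replaced(
    volume: &mut SnapshotVolume,
    old: &str,
    new: &str,
) -> Result<(), String> {
    match volume {
        SnapshotVolume::Present { target } if target.as_str() == old => {
            // First run: perform the swap.
            *target = new.to_string();
            Ok(())
        }
        SnapshotVolume::Present { target } if target.as_str() == new => {
            // Re-run after the swap was already recorded: nothing to do.
            Ok(())
        }
        SnapshotVolume::Present { target } => {
            Err(format!("unexpected target {target}"))
        }
        // Volume deleted concurrently: the remaining replacement steps must
        // still run, so unwinding here would break idempotency.
        SnapshotVolume::Deleted => Ok(()),
    }
}

fn main() {
    let mut vol = SnapshotVolume::Present { target: "old:400".into() };
    assert!(ensure_replaced(&mut vol, "old:400", "new:111").is_ok());
    // Running the same step again is a no-op success.
    assert!(ensure_replaced(&mut vol, "old:400", "new:111").is_ok());

    let mut gone = SnapshotVolume::Deleted;
    assert!(ensure_replaced(&mut gone, "old:400", "new:111").is_ok());
}
```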
+ + Ok(()) } } } @@ -801,6 +827,7 @@ pub(crate) mod test { const SNAPSHOT_NAME: &str = "my-snap"; const PROJECT_NAME: &str = "springfield-squidport"; + /// Create four zpools, a disk, and a snapshot of that disk async fn prepare_for_test( cptestctx: &ControlPlaneTestContext, ) -> PrepareResult { diff --git a/nexus/src/app/sagas/volume_delete.rs b/nexus/src/app/sagas/volume_delete.rs index c5e1b49f61..12d48cb367 100644 --- a/nexus/src/app/sagas/volume_delete.rs +++ b/nexus/src/app/sagas/volume_delete.rs @@ -29,7 +29,7 @@ use super::NexusSaga; use crate::app::sagas::declare_saga_actions; use nexus_db_model::Dataset; use nexus_db_model::Region; -use nexus_db_model::RegionSnapshot; +use nexus_db_model::Volume; use nexus_db_queries::authn; use nexus_db_queries::db::datastore::CrucibleResources; use nexus_types::identity::Asset; @@ -330,8 +330,7 @@ async fn svd_delete_crucible_snapshot_records( Ok(()) } -type FreedCrucibleRegions = - Vec<(Dataset, Region, Option, Uuid)>; +type FreedCrucibleRegions = Vec<(Dataset, Region, Option)>; /// Deleting region snapshots in a previous saga node may have freed up regions /// that were deleted in the DB but couldn't be deleted by the Crucible Agent @@ -394,7 +393,7 @@ type FreedCrucibleRegions = /// The disk's volume has no read only resources, while the snapshot's volume /// does. The disk volume's targets are all regions (backed by downstairs that /// are read/write) while the snapshot volume's targets are all snapshots -/// (backed by volumes that are read-only). The two volumes are linked in the +/// (backed by downstairs that are read-only). The two volumes are linked in the /// sense that the snapshots from the second are contained *within* the regions /// of the first, reflecting the resource nesting from ZFS. This is also /// reflected in the REST endpoint that the Crucible agent uses: @@ -436,7 +435,7 @@ async fn svd_find_freed_crucible_regions( // Don't serialize the whole Volume, as the data field contains key material! Ok(freed_datasets_regions_and_volumes .into_iter() - .map(|x| (x.0, x.1, x.2, x.3.id())) + .map(|x| (x.0, x.1, x.2.map(|v: Volume| v.id()))) .collect()) } @@ -451,23 +450,7 @@ async fn svd_delete_freed_crucible_regions( let freed_datasets_regions_and_volumes = sagactx.lookup::("freed_crucible_regions")?; - for (dataset, region, region_snapshot, volume_id) in - freed_datasets_regions_and_volumes - { - if region_snapshot.is_some() { - // We cannot delete this region yet, the snapshot has not been - // deleted. This can occur when multiple volume delete sagas run - // concurrently: one will decrement the crucible resources (but - // hasn't made the appropriate DELETE calls to remove the running - // snapshots and snapshots yet), and the other will be here trying - // to delete the region. This race results in the crucible agent - // returning "must delete snapshots first" and causing saga unwinds. - // - // Another volume delete (probably the one racing with this one!) - // will pick up this region and remove it. 
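The loop in the next hunk now iterates `(Dataset, Region, Option<volume id>)` rather than carrying a region snapshot, and only hard-deletes the volume record when one still exists. A small std-only sketch of that cleanup shape (hypothetical types, not the Omicron datastore API):

```rust
// Illustrative only: freed read-only regions may no longer have an
// associated volume row, so the volume hard-delete becomes conditional
// while the region deletion itself is unconditional.

#[derive(Debug)]
struct Dataset { id: u64 }
#[derive(Debug)]
struct Region { id: u64 }

type FreedCrucibleRegions = Vec<(Dataset, Region, Option<u64>)>;

fn delete_freed_regions(freed: FreedCrucibleRegions) -> Vec<String> {
    let mut actions = Vec::new();
    for (dataset, region, volume_id) in freed {
        // Always ask the corresponding agent to delete the region itself...
        actions.push(format!(
            "DELETE region {} on dataset {}",
            region.id, dataset.id
        ));
        // ...but only hard-delete the volume DB record if one remains.
        if let Some(volume_id) = volume_id {
            actions.push(format!("volume_hard_delete {volume_id}"));
        }
    }
    actions
}

fn main() {
    let freed: FreedCrucibleRegions = vec![
        (Dataset { id: 1 }, Region { id: 10 }, Some(100)),
        (Dataset { id: 2 }, Region { id: 20 }, None),
    ];
    let actions = delete_freed_regions(freed);
    assert_eq!(actions.len(), 3); // two region deletes, one volume delete
    println!("{actions:#?}");
}
```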
- continue; - } - + for (dataset, region, volume_id) in freed_datasets_regions_and_volumes { // Send DELETE calls to the corresponding Crucible agents osagactx .nexus() @@ -496,14 +479,16 @@ async fn svd_delete_freed_crucible_regions( })?; // Remove volume DB record - osagactx.datastore().volume_hard_delete(volume_id).await.map_err( - |e| { - ActionError::action_failed(format!( - "failed to volume_hard_delete {}: {:?}", - volume_id, e, - )) - }, - )?; + if let Some(volume_id) = volume_id { + osagactx.datastore().volume_hard_delete(volume_id).await.map_err( + |e| { + ActionError::action_failed(format!( + "failed to volume_hard_delete {}: {:?}", + volume_id, e, + )) + }, + )?; + } } Ok(()) diff --git a/nexus/test-utils/src/background.rs b/nexus/test-utils/src/background.rs index 859796f556..b0474037a8 100644 --- a/nexus/test-utils/src/background.rs +++ b/nexus/test-utils/src/background.rs @@ -332,7 +332,7 @@ pub async fn run_replacement_tasks_to_completion( } }, &Duration::from_secs(1), - &Duration::from_secs(10), + &Duration::from_secs(20), ) .await .unwrap(); diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 4463267b42..939fd61249 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -2442,83 +2442,6 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( assert_eq!(datasets_and_regions.len(), REGION_REDUNDANCY_THRESHOLD); } -// Confirm that a region set can start at N, a region can be deleted, and the -// allocation CTE can bring the redundancy back to N. -#[nexus_test] -async fn test_region_allocation_after_delete( - cptestctx: &ControlPlaneTestContext, -) { - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = - OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - - // Create three 10 GiB zpools, each with one dataset. 
- let _disk_test = DiskTest::new(&cptestctx).await; - - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - - // Create a disk - let client = &cptestctx.external_client; - let _project_id = create_project_and_pool(client).await; - - let disk = create_disk(&client, PROJECT_NAME, DISK_NAME).await; - - // Assert disk has three allocated regions - let disk_id = disk.identity.id; - let (.., db_disk) = LookupPath::new(&opctx, &datastore) - .disk_id(disk_id) - .fetch() - .await - .unwrap_or_else(|_| panic!("test disk {:?} should exist", disk_id)); - - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); - - // Delete one of the regions - let region_to_delete: &nexus_db_model::Region = &allocated_regions[0].1; - datastore - .regions_hard_delete(&opctx.log, vec![region_to_delete.id()]) - .await - .unwrap(); - - // Assert disk's volume has one less allocated region - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD - 1); - - let region_total_size: ByteCount = ByteCount::try_from( - region_to_delete.block_size().to_bytes() - * region_to_delete.blocks_per_extent() - * region_to_delete.extent_count(), - ) - .unwrap(); - - // Rerun disk region allocation - datastore - .disk_region_allocate( - &opctx, - db_disk.volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from( - region_to_delete.block_size().to_bytes() as u32, - ) - .unwrap(), - }, - region_total_size, - &RegionAllocationStrategy::Random { seed: None }, - ) - .await - .unwrap(); - - // Assert redundancy was restored - let allocated_regions = - datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); - assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); -} - #[nexus_test] async fn test_no_halt_disk_delete_one_region_on_expunged_agent( cptestctx: &ControlPlaneTestContext, diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 8765218d33..887afff20f 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -5,31 +5,58 @@ //! Tests that Nexus properly manages and cleans up Crucible resources //! 
associated with Volumes +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; +use diesel::ExpressionMethods; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; +use nexus_config::RegionAllocationStrategy; +use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_model::RegionSnapshotReplacementState; +use nexus_db_model::Volume; +use nexus_db_model::VolumeResourceUsage; +use nexus_db_model::VolumeResourceUsageRecord; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::CrucibleResources; +use nexus_db_queries::db::datastore::ExistingTarget; +use nexus_db_queries::db::datastore::RegionAllocationFor; +use nexus_db_queries::db::datastore::RegionAllocationParameters; +use nexus_db_queries::db::datastore::ReplacementTarget; +use nexus_db_queries::db::datastore::VolumeReplaceResult; +use nexus_db_queries::db::datastore::VolumeToDelete; +use nexus_db_queries::db::datastore::VolumeWithTarget; +use nexus_db_queries::db::datastore::SQL_BATCH_SIZE; use nexus_db_queries::db::lookup::LookupPath; +use nexus_db_queries::db::pagination::paginated; +use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; +use nexus_test_utils::background::run_replacement_tasks_to_completion; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::http_testing::RequestBuilder; use nexus_test_utils::resource_helpers::create_default_ip_pool; +use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::views; use nexus_types::identity::Asset; +use nexus_types::identity::Resource; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Name; use omicron_common::api::internal; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; @@ -37,6 +64,8 @@ use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; +use std::collections::HashSet; +use std::net::SocketAddrV6; use std::sync::Arc; use uuid::Uuid; @@ -2182,38 +2211,38 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:101:7]:19016"), + String::from("[fd00:1122:3344:101::7]:19016"), ), ( zpool1.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:102:7]:19016"), + String::from("[fd00:1122:3344:102::7]:19016"), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103:7]:19016"), + String::from("[fd00:1122:3344:103::7]:19016"), ), // second snapshot-create ( zpool0.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - 
String::from("[fd00:1122:3344:101:7]:19016"), // duplicate! + String::from("[fd00:1122:3344:101::7]:19016"), // duplicate! ), ( zpool3.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:104:7]:19016"), + String::from("[fd00:1122:3344:104::7]:19016"), ), ( zpool2.datasets[0].id, Uuid::new_v4(), Uuid::new_v4(), - String::from("[fd00:1122:3344:103:7]:19017"), + String::from("[fd00:1122:3344:103::7]:19017"), ), ]; @@ -2286,36 +2315,47 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate the volume's region_snapshots got incremented by - // volume_create + // Also validate the volume's region_snapshots had volume resource usage + // records created by volume_create for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } - // Soft delete the volume, and validate that only three region_snapshot - // records are returned. + // Soft delete the volume, and validate that the volume's region_snapshots + // had their volume resource usage records deleted let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -2396,30 +2436,42 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate only the volume's region_snapshots got incremented by - // volume_create. + // Also validate only the new volume's region_snapshots had usage records + // created. 
for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } + for i in 3..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } // Soft delete the volume, and validate that only three region_snapshot @@ -2427,18 +2479,23 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); - // Make sure every region_snapshot is now 0, and deleting + // Make sure every region_snapshot has no usage records now for i in 0..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -3503,3 +3560,2761 @@ async fn test_cte_returns_regions(cptestctx: &ControlPlaneTestContext) { .collect::>(), ); } + +struct TestReadOnlyRegionReferenceUsage { + datastore: Arc, + + region: db::model::Region, + region_address: SocketAddrV6, + + first_volume_id: Uuid, + second_volume_id: Uuid, + + last_resources_to_delete: Option, +} + +impl TestReadOnlyRegionReferenceUsage { + pub async fn new(cptestctx: &ControlPlaneTestContext) -> Self { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let first_volume_id = Uuid::new_v4(); + let second_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: first_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 1, + ) + .await + .unwrap(); + + assert_eq!(datasets_and_regions.len(), 1); + + let (_, region) = &datasets_and_regions[0]; + + assert_eq!(region.volume_id(), first_volume_id); + 
assert!(region.read_only()); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. + datastore.region_set_port(region.id(), 12345).await.unwrap(); + + let region_address = + datastore.region_addr(region.id()).await.unwrap().unwrap(); + + let region = datastore.get_region(region.id()).await.unwrap(); + + TestReadOnlyRegionReferenceUsage { + datastore: datastore.clone(), + + region, + region_address, + + first_volume_id, + second_volume_id, + + last_resources_to_delete: None, + } + } + + pub async fn create_first_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_only_first_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.first_volume_id); + } + + pub async fn validate_only_second_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.second_volume_id); + } + + pub async fn delete_first_volume(&mut self) { + let resources_to_delete = self + .datastore + .soft_delete_volume(self.first_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn delete_second_volume(&mut self) { + let resources_to_delete = self + .datastore + .soft_delete_volume(self.second_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn validate_no_usage_records(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + } + + pub async fn validate_region_returned_for_cleanup(&self) { + assert!(self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + pub async fn validate_region_not_returned_for_cleanup(&self) { + assert!(!self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + // read-only regions should never be returned by find_deleted_volume_regions + pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { + let deleted_volume_regions = + self.datastore.find_deleted_volume_regions().await.unwrap(); + + assert!(!deleted_volume_regions + .into_iter() + .any(|(_, r, _)| r.id() == self.region.id())); + } + + pub async 
fn create_first_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_both_volumes_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == self.first_volume_id)); + assert!(usage.iter().any(|r| r.volume_id == self.second_volume_id)); + } +} + +/// Assert that creating a volume with a read-only region in a subvolume creates +/// an appropriate usage record, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_usage_sanity( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume().await; + + // Validate only one volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // Now, soft-delete the volume, and make sure that the associated volume + // resource usage record is gone too. 
+ + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating a volume with a read-only region in the ROP creates an +/// appropriate reference, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // It should be _not_ returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Now, soft-delete the volume, and make sure that read-only volume + // reference is gone too. + + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region creates the +/// appropriate references, and that deleting only one of those volumes does not +/// mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume().await; + harness.create_second_volume().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the first volume, and make sure that only one read-only + // volume reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. + + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. 
+ + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region in the ROP +/// creates the appropriate references, and that deleting only one of those +/// volumes does not mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume_region_in_rop().await; + harness.create_second_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the volume, and make sure that only one read-only volume + // reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. + + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + // It should not be returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that a read-only region is properly reference counted and not +/// prematurely deleted: +/// +/// 1) create a disk, then a snapshot of that disk, then a disk from that +/// snapshot +/// +/// 2) issue a region snapshot replacement request for one of the region +/// snapshots in that snapshot, then run that process to completion +/// +/// 3) delete the snapshot +/// +/// 4) expect that the reference to the read-only region in the disk created +/// from the snapshot means that read-only region will not be cleaned up by +/// Nexus +/// +/// 5) clean up all the objects, and expect the crucible resources are properly +/// cleaned up +#[nexus_test] +async fn test_read_only_region_reference_counting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
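+    //
+    // First, look up the disk's volume, pick one of its three allocated
+    // regions, and file a replacement request for the corresponding region
+    // snapshot.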
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // The snapshot's allocated regions should have the one read-only region + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + let db_read_only_dataset = + datastore.dataset_get(read_only_region.dataset_id()).await.unwrap(); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Deleting the snapshot should _not_ cause the region to get deleted from + // CRDB + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + let post_delete_region = + datastore.get_region(read_only_region.id()).await.unwrap(); + assert_eq!(post_delete_region, *read_only_region); + + // or cause Nexus to send delete commands to the appropriate Crucible + // agent. 
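+    //
+    // Ask the simulated Crucible agent directly: the read-only region should
+    // still be in the `Created` state.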
+ + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Created + ); + + // Expect that there is one read-only region reference now, and that's from + // disk-from-snap + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk, and expect that does not alter the volume usage records + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk from snapshot, verify everything is cleaned up + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Destroyed + ); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +/// Assert that a snapshot of a volume with a read-only region is properly +/// reference counted. +#[nexus_test] +async fn test_read_only_region_reference_counting_layers( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // Grab the read-only region in the snapshot volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. 
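+    //
+    // (volume_usage_records_for_resource returns every usage record that
+    // points at the given read-only resource)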
+ + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Take a snapshot of the disk-from-snapshot disk + + let double_snapshot = create_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + "double-snapshot", + ) + .await; + + // Assert correct volume usage records + + let (.., db_double_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(double_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "double_snapshot {:?} should exist", + double_snapshot.identity.id + ) + }); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + // Delete resources, assert volume resource usage records along the way + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("double-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_volume_replace_snapshot_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + 
DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + // There won't be any regions for the snapshot volume, only region snapshots + + assert!(allocated_regions.is_empty()); + + // Get another region to use with volume_replace_snapshot + + datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: db_snapshot.volume_id, + snapshot_id: db_snapshot.id(), + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + allocated_regions.len() + 1, + ) + .await + .unwrap(); + + // Get the newly allocated region + + let mut new_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(new_allocated_regions.len(), 1); + + let (_, new_region) = + new_allocated_regions.pop().expect("we just checked the length!"); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. 
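+    // (the port value is arbitrary; nothing connects to it in this test)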
+ datastore.region_set_port(new_region.id(), 12345).await.unwrap(); + + // Create a blank region to use as the "volume to delete" + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert the correct volume resource usage record before the replacement: + // nothing should reference the newly allocated region yet + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + // Perform replacement of the first region in the snapshot + + let existing = datastore + .region_snapshot_get( + disk_allocated_regions[0].1.dataset_id(), + disk_allocated_regions[0].1.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let replacement = datastore + .region_addr(new_region.id()) + .await + .expect("new region has address!") + .unwrap(); + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(existing.snapshot_addr.parse().unwrap()), + ReplacementTarget(replacement), + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! + } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the volume resource usage record after volume_replace_snapshot: + // the new region should have a usage for the snapshot's volume + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_snapshot.volume_id); + + // Now, reverse the replacement + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(replacement), // swapped! + ReplacementTarget(existing.snapshot_addr.parse().unwrap()), // swapped! + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! 
+ } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the new region's volume resource usage record now references the + // volume to delete + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); +} + +/// Test that the `volume_remove_rop` function correctly updates volume resource +/// usage records +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: + // both the snapshot volume and disk_from_snapshot volume should have usage + // records for the three region snapshots. 
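+    //
+    // Look up each region snapshot by (dataset, region, snapshot) and check
+    // its usage records.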
+ + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: + // the snapshot volume should still have usage records for the three region + // snapshots, and now so should the volume to delete + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id)); + } +} + +/// Test that the `volume_remove_rop` function only updates volume resource +/// usage records for the volume being operated on +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting_no_modify_others( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. 
+ + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let another_disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "another-disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_another_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(another_disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "another_disk_from_snapshot {:?} should exist", + another_disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: the + // snapshot volume, disk_from_snapshot volume, and + // another_disk_from_snapshot volume should have usage records for the three + // region snapshots. + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: the + // snapshot volume and another_disk_from_snapshot volume should still have + // usage records for the three region snapshots, and now so should the + // volume to delete. 
+ + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id)); + } +} + +async fn delete_all_volume_resource_usage_records(datastore: &DataStore) { + use db::schema::volume_resource_usage::dsl; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + diesel::delete(dsl::volume_resource_usage) + .filter(dsl::usage_id.ne(Uuid::new_v4())) + .execute_async(&*conn) + .await + .unwrap(); +} + +async fn perform_migration(datastore: &DataStore) { + const MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL: &str = include_str!( + "../../../schema/crdb/crucible-ref-count-records/up08.sql" + ); + + assert!(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL + .contains("INSERT INTO volume_resource_usage")); + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + // To make sure that the migration is idempotent, perform it twice + diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL) + .execute_async(&*conn) + .await + .unwrap(); + + diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL) + .execute_async(&*conn) + .await + .unwrap(); +} + +async fn get_volume_resource_usage_records( + datastore: &DataStore, +) -> HashSet { + use db::schema::volume_resource_usage::dsl; + + let mut records: Vec = Vec::new(); + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + while let Some(p) = paginator.next() { + let page = paginated( + dsl::volume_resource_usage, + dsl::usage_id, + &p.current_pagparams(), + ) + .get_results_async::(&*conn) + .await + .unwrap(); + + paginator = p.found_batch(&page, &|r| r.usage_id); + + for record in page { + records.push(record); + } + } + + records + .into_iter() + .map(|mut record: VolumeResourceUsageRecord| { + // Zero out usage_id for comparison + record.usage_id = Uuid::nil(); + record + }) + .collect() +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk + + create_disk(&client, PROJECT_NAME, "disk").await; + + // Test migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Create a snapshot + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Test migration + 
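+    // Same check as above: capture the records Nexus created, wipe the table,
+    // re-run the migration SQL, and expect identical records.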
+ let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Create a disk from that snapshot + + create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Test migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the snapshot + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the disk from snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_soft_delete_volume( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot from that disk, then an image based on + // that snapshot + + create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let params = params::ImageCreate { + identity: IdentityMetadataCreateParams { + name: "windows99".parse().unwrap(), + description: String::from("as soon as we get CSM support!"), + }, + source: params::ImageSource::Snapshot { id: snapshot.identity.id }, + os: "windows98".to_string(), + version: "se".to_string(), + }; + + let images_url = format!("/v1/images?project={}", PROJECT_NAME); + NexusRequest::objects_post(client, &images_url, ¶ms) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap(); + + // Soft-delete the snapshot's volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let resources = + datastore.soft_delete_volume(db_snapshot.volume_id).await.unwrap(); + + // Assert that the 
region snapshots did not have deleted set to true + + assert!(datastore + .snapshots_to_delete(&resources) + .await + .unwrap() + .is_empty()); + + // This means that the snapshot volume is soft-deleted, make sure the + // migration does not make usage records for it! + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( + cptestctx: &ControlPlaneTestContext, +) { + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let mut iter = disk_test.zpools(); + let zpool0 = iter.next().expect("Expected four zpools"); + let zpool1 = iter.next().expect("Expected four zpools"); + let zpool2 = iter.next().expect("Expected four zpools"); + let zpool3 = iter.next().expect("Expected four zpools"); + + // (dataset_id, region_id, snapshot_id, snapshot_addr) + let region_snapshots = vec![ + ( + zpool0.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101::7]:19016"), + ), + ( + zpool1.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:102::7]:19016"), + ), + ( + zpool2.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103::7]:19016"), + ), + ( + zpool3.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:104::7]:19016"), + ), + ]; + + for i in 0..4 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + // Create two volumes, one with the first three region snapshots, one with + // the last three region snapshots + + let first_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[0].3.clone(), + region_snapshots[1].3.clone(), + region_snapshots[2].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + let second_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[1].3.clone(), + 
region_snapshots[2].3.clone(),
+                                region_snapshots[3].3.clone(),
+                            ],
+                            lossy: false,
+                            flush_timeout: None,
+                            key: None,
+                            cert_pem: None,
+                            key_pem: None,
+                            root_cert_pem: None,
+                            control: None,
+                            read_only: true,
+                        },
+                    },
+                )),
+            })
+            .unwrap(),
+        ))
+        .await
+        .unwrap();
+
+    // Deleting the first volume should only return region_snapshot[0] for
+    // deletion.
+
+    let resources =
+        datastore.soft_delete_volume(first_volume_id).await.unwrap();
+
+    let snapshots_to_delete =
+        datastore.snapshots_to_delete(&resources).await.unwrap();
+
+    assert_eq!(snapshots_to_delete.len(), 1);
+
+    let region_snapshot_to_delete = &snapshots_to_delete[0].1;
+
+    assert_eq!(region_snapshot_to_delete.dataset_id, region_snapshots[0].0);
+    assert_eq!(region_snapshot_to_delete.region_id, region_snapshots[0].1);
+    assert_eq!(region_snapshot_to_delete.snapshot_id, region_snapshots[0].2);
+    assert_eq!(region_snapshot_to_delete.snapshot_addr, region_snapshots[0].3);
+    assert_eq!(region_snapshot_to_delete.volume_references, 0);
+    assert_eq!(region_snapshot_to_delete.deleting, true);
+
+    // Test the migration does not incorrectly think a region snapshot with
+    // deleting = true is used by any volume
+
+    let records_before = get_volume_resource_usage_records(&datastore).await;
+
+    delete_all_volume_resource_usage_records(&datastore).await;
+    perform_migration(&datastore).await;
+
+    let records_after = get_volume_resource_usage_records(&datastore).await;
+
+    assert_eq!(records_before, records_after);
+}
+
+#[nexus_test]
+async fn test_double_layer_with_read_only_region_delete(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    // 1) Create disk, snapshot, then two disks from that snapshot
+    // 2) Replace one of the region snapshots in that snapshot volume with a
+    //    read-only region
+    // 3) Delete in the following order: disk, snapshot, then two disks from the
+    //    snapshot - after each delete, verify that crucible resources were not
+    //    prematurely deleted
+    // 4) At the end, assert that all Crucible resources were cleaned up
+
+    let client = &cptestctx.external_client;
+    let internal_client = &cptestctx.internal_client;
+    let apictx = &cptestctx.server.server_context();
+    let nexus = &apictx.nexus;
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone());
+
+    // Four zpools are required for region replacement or region snapshot
+    // replacement
+    let disk_test = DiskTestBuilder::new(&cptestctx)
+        .on_specific_sled(cptestctx.first_sled())
+        .with_zpool_count(4)
+        .build()
+        .await;
+
+    create_project_and_pool(client).await;
+
+    let disk = create_disk(&client, PROJECT_NAME, "disk").await;
+
+    let snapshot =
+        create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await;
+
+    let _disk_from_snapshot = create_disk_from_snapshot(
+        &client,
+        PROJECT_NAME,
+        "disk-from-snapshot",
+        snapshot.identity.id,
+    )
+    .await;
+
+    let _another_disk_from_snapshot = create_disk_from_snapshot(
+        &client,
+        PROJECT_NAME,
+        "another-disk-from-snapshot",
+        snapshot.identity.id,
+    )
+    .await;
+
+    // Perform region snapshot replacement for one of the snapshot's targets,
+    // causing a read-only region to be created.
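+    //
+    // As in the tests above, pick one of the disk's allocated regions and
+    // file a replacement request for the corresponding region snapshot.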
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Finally, delete another-disk-from-snapshot + + NexusRequest::object_delete( + client, + &get_disk_url("another-disk-from-snapshot"), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete another-disk-from-snapshot"); + + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_double_layer_snapshot_with_read_only_region_delete_2( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create disk, then a snapshot + // 2) Replace two of the region snapshots in that snapshot volume with + // read-only regions + // 3) Create a disk from the snapshot + // 4) Replace the last of the region snapshots in that snapshot volume with + // a read-only region + // 5) Delete in the following order: disk, snapshot, then the disk from the + // snapshot - after each delete, verify that crucible resources were not + // prematurely deleted + // 6) At the end, assert that all Crucible resources were cleaned up + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Perform region snapshot replacement for two of the snapshot's targets, + // causing two read-only regions to be created. 
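+    //
+    // The first request targets allocated_regions[0]; a second request for
+    // allocated_regions[1] is filed below once this one reaches Complete.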
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + wait_for_condition( + || { + let datastore = datastore.clone(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + async move { + let request = datastore + .get_region_snapshot_replacement_request_by_id( + &opctx, request_id, + ) + .await + .unwrap(); + + let state = request.replacement_state; + + if state == RegionSnapshotReplacementState::Complete { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + } + }, + &std::time::Duration::from_millis(500), + &std::time::Duration::from_secs(60), + ) + .await + .expect("request transitioned to expected state"); + + let (dataset, region) = &allocated_regions[1]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Create a disk from the snapshot + + let _disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Replace the last of the region snapshot targets + + let (dataset, region) = &allocated_regions[2]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete the disk and snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + // Delete disk-from-snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Assert everything was cleaned up + + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_no_zombie_region_snapshots(cptestctx: &ControlPlaneTestContext) { + // 1) Create disk, then a snapshot + // 2) Delete the disk + // 3) Create a volume that uses the snapshot volume as a read-only parent + // 4) Test that a race of the following steps does not cause a region + // snapshot to have volume usage records even though it is marked for + // deletion: + // + // a) delete the snapshot 
volume + // b) delete the volume created in step 3 + // c) create another volume that uses the snapshot volume as a read-only + // parent + + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create disk, then a snapshot + + let _disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + // Delete the disk + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // Create a volume that uses the snapshot volume as a read-only parent + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let snapshot_volume: Volume = datastore + .volume_get(db_snapshot.volume_id) + .await + .expect("volume_get without error") + .expect("volume exists"); + + let snapshot_vcr: VolumeConstructionRequest = + serde_json::from_str(snapshot_volume.data()).unwrap(); + + let step_3_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + step_3_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_3_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(snapshot_vcr.clone())), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Soft-delete the snapshot volume + + let cr = datastore.soft_delete_volume(db_snapshot.volume_id).await.unwrap(); + + // Assert that no resources are returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Soft-delete the volume created as part of step 3 + + let cr = datastore.soft_delete_volume(step_3_volume_id).await.unwrap(); + + // Assert that region snapshots _are_ returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(!datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Pretend that there's a racing call to volume_create that has a volume + // that uses the snapshot volume as a read-only parent. This call should + // fail! 
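+    //
+    // (the region snapshots it references are already marked deleting, so
+    // volume_create is expected to reject it rather than add usage records)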
+ + let racing_volume_id = Uuid::new_v4(); + let result = datastore + .volume_create(nexus_db_model::Volume::new( + racing_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: racing_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(snapshot_vcr.clone())), + }) + .unwrap(), + )) + .await; + + assert!(result.is_err()); +} + +#[nexus_test] +async fn test_no_zombie_read_only_regions(cptestctx: &ControlPlaneTestContext) { + // 1) Create a volume with three read-only regions + // 2) Create another volume that uses the first step's volume as a read-only + // parent + // 3) Test that a race of the following steps does not cause a region to + // have volume usage records: + // + // a) delete the step 1 volume + // b) delete the step 2 volume + // c) create another volume that uses the step 1 volume as a read-only + // parent + + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + // Create a volume with three read-only regions + + let step_1_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: step_1_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 3, + ) + .await + .unwrap(); + + // We're not sending allocation requests to any simulated crucible agent, so + // fill in a random port here. 
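+    // (the ports only need to be distinct so that the region addresses below
+    // are unique)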
+ for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + datastore.region_set_port(region.id(), 20000 + i as u16).await.unwrap(); + } + + let region_addrs: Vec = vec![ + datastore + .region_addr(datasets_and_regions[0].1.id()) + .await + .unwrap() + .unwrap(), + datastore + .region_addr(datasets_and_regions[1].1.id()) + .await + .unwrap() + .unwrap(), + datastore + .region_addr(datasets_and_regions[2].1.id()) + .await + .unwrap() + .unwrap(), + ]; + + // Assert they're all unique + + assert_ne!(region_addrs[0], region_addrs[1]); + assert_ne!(region_addrs[0], region_addrs[2]); + assert_ne!(region_addrs[1], region_addrs[2]); + + // Mimic what a snapshot volume has: three read-only regions in a volume's + // subvolume, not the read-only parent + + datastore + .volume_create(nexus_db_model::Volume::new( + step_1_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_1_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_addrs[0].to_string(), + region_addrs[1].to_string(), + region_addrs[2].to_string(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Create another volume that uses the first step's volume as a read-only + // parent + + let step_1_volume: Volume = datastore + .volume_get(step_1_volume_id) + .await + .expect("volume_get without error") + .expect("volume exists"); + + let step_1_vcr: VolumeConstructionRequest = + serde_json::from_str(step_1_volume.data()).unwrap(); + + let step_2_volume_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + step_2_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: step_2_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(step_1_vcr.clone())), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Soft-delete the step 1 volume + + let cr = datastore.soft_delete_volume(step_1_volume_id).await.unwrap(); + + // Assert that no resources are returned for clean-up + + assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Soft-delete the step 2 volume + + let cr = datastore.soft_delete_volume(step_2_volume_id).await.unwrap(); + + // Assert that the read-only regions _are_ returned for clean-up + + assert!(!datastore.regions_to_delete(&cr).await.unwrap().is_empty()); + assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty()); + + // Pretend that there's a racing call to volume_create that has a volume + // that uses the step 1 volume as a read-only parent. This call should + // fail! 
+ + let racing_volume_id = Uuid::new_v4(); + let result = datastore + .volume_create(nexus_db_model::Volume::new( + racing_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: racing_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(step_1_vcr)), + }) + .unwrap(), + )) + .await; + + assert!(result.is_err()); +} + +#[nexus_test] +async fn test_no_zombie_read_write_regions( + cptestctx: &ControlPlaneTestContext, +) { + // 1) Create a volume with three read-write regions + // 2) Create another volume that uses the first step's volume as a read-only + // parent + // 3) Test that a race of the following steps does not cause a region to + // have volume usage records: + // + // a) delete the step 1 volume + // b) delete the step 2 volume + // c) create another volume that uses the step 1 volume as a read-only + // parent + + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + // Create a volume with three read-only regions + + let step_1_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: step_1_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 3, + ) + .await + .unwrap(); + + // We're not sending allocation requests to any simulated crucible agent, so + // fill in a random port here. 
+    for (i, (_, region)) in datasets_and_regions.iter().enumerate() {
+        datastore.region_set_port(region.id(), 20000 + i as u16).await.unwrap();
+    }
+
+    let region_addrs: Vec<SocketAddrV6> = vec![
+        datastore
+            .region_addr(datasets_and_regions[0].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+        datastore
+            .region_addr(datasets_and_regions[1].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+        datastore
+            .region_addr(datasets_and_regions[2].1.id())
+            .await
+            .unwrap()
+            .unwrap(),
+    ];
+
+    // Assert they're all unique
+
+    assert_ne!(region_addrs[0], region_addrs[1]);
+    assert_ne!(region_addrs[0], region_addrs[2]);
+    assert_ne!(region_addrs[1], region_addrs[2]);
+
+    // Mimic what a snapshot volume has: three read-only regions in a volume's
+    // subvolume, not the read-only parent
+
+    datastore
+        .volume_create(nexus_db_model::Volume::new(
+            step_1_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: step_1_volume_id,
+                block_size: 512,
+                sub_volumes: vec![VolumeConstructionRequest::Region {
+                    block_size: 512,
+                    blocks_per_extent: 1,
+                    extent_count: 1,
+                    gen: 1,
+                    opts: CrucibleOpts {
+                        id: Uuid::new_v4(),
+                        target: vec![
+                            region_addrs[0].to_string(),
+                            region_addrs[1].to_string(),
+                            region_addrs[2].to_string(),
+                        ],
+                        lossy: false,
+                        flush_timeout: None,
+                        key: None,
+                        cert_pem: None,
+                        key_pem: None,
+                        root_cert_pem: None,
+                        control: None,
+                        read_only: true,
+                    },
+                }],
+                read_only_parent: None,
+            })
+            .unwrap(),
+        ))
+        .await
+        .unwrap();
+
+    // Create another volume that uses the first step's volume as a read-only
+    // parent
+
+    let step_1_volume: Volume = datastore
+        .volume_get(step_1_volume_id)
+        .await
+        .expect("volume_get without error")
+        .expect("volume exists");
+
+    let step_1_vcr: VolumeConstructionRequest =
+        serde_json::from_str(step_1_volume.data()).unwrap();
+
+    let step_2_volume_id = Uuid::new_v4();
+
+    datastore
+        .volume_create(nexus_db_model::Volume::new(
+            step_2_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: step_2_volume_id,
+                block_size: 512,
+                sub_volumes: vec![],
+                read_only_parent: Some(Box::new(step_1_vcr.clone())),
+            })
+            .unwrap(),
+        ))
+        .await
+        .unwrap();
+
+    // Soft-delete the step 1 volume
+
+    let cr = datastore.soft_delete_volume(step_1_volume_id).await.unwrap();
+
+    // Assert that no resources are returned for clean-up
+
+    assert!(datastore.regions_to_delete(&cr).await.unwrap().is_empty());
+    assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty());
+
+    // Soft-delete the step 2 volume
+
+    let cr = datastore.soft_delete_volume(step_2_volume_id).await.unwrap();
+
+    // Assert that the read-only regions _are_ returned for clean-up
+
+    assert!(!datastore.regions_to_delete(&cr).await.unwrap().is_empty());
+    assert!(datastore.snapshots_to_delete(&cr).await.unwrap().is_empty());
+
+    // Pretend that there's a racing call to volume_create that has a volume
+    // that uses the step 1 volume as a read-only parent. This call should
+    // fail!
+
+    let racing_volume_id = Uuid::new_v4();
+    let result = datastore
+        .volume_create(nexus_db_model::Volume::new(
+            racing_volume_id,
+            serde_json::to_string(&VolumeConstructionRequest::Volume {
+                id: racing_volume_id,
+                block_size: 512,
+                sub_volumes: vec![],
+                read_only_parent: Some(Box::new(step_1_vcr)),
+            })
+            .unwrap(),
+        ))
+        .await;
+
+    assert!(result.is_err());
+}
diff --git a/schema/crdb/crucible-ref-count-records/up01.sql b/schema/crdb/crucible-ref-count-records/up01.sql
new file mode 100644
index 0000000000..b28e61135b
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up01.sql
@@ -0,0 +1 @@
+CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip);
diff --git a/schema/crdb/crucible-ref-count-records/up02.sql b/schema/crdb/crucible-ref-count-records/up02.sql
new file mode 100644
index 0000000000..31fa35c3b8
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up02.sql
@@ -0,0 +1,3 @@
+CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot (
+    deleting
+);
diff --git a/schema/crdb/crucible-ref-count-records/up03.sql b/schema/crdb/crucible-ref-count-records/up03.sql
new file mode 100644
index 0000000000..03940aabb0
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up03.sql
@@ -0,0 +1,4 @@
+CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM (
+    'read_only_region',
+    'region_snapshot'
+);
diff --git a/schema/crdb/crucible-ref-count-records/up04.sql b/schema/crdb/crucible-ref-count-records/up04.sql
new file mode 100644
index 0000000000..c020499418
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up04.sql
@@ -0,0 +1,37 @@
+CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage (
+    usage_id UUID NOT NULL,
+
+    volume_id UUID NOT NULL,
+
+    usage_type omicron.public.volume_resource_usage_type NOT NULL,
+
+    region_id UUID,
+
+    region_snapshot_dataset_id UUID,
+    region_snapshot_region_id UUID,
+    region_snapshot_snapshot_id UUID,
+
+    PRIMARY KEY (usage_id),
+
+    CONSTRAINT exactly_one_usage_source CHECK (
+     (
+       (usage_type = 'read_only_region') AND
+       (region_id IS NOT NULL) AND
+       (
+         region_snapshot_dataset_id IS NULL AND
+         region_snapshot_region_id IS NULL AND
+         region_snapshot_snapshot_id IS NULL
+       )
+     )
+    OR
+     (
+       (usage_type = 'region_snapshot') AND
+       (region_id IS NULL) AND
+       (
+         region_snapshot_dataset_id IS NOT NULL AND
+         region_snapshot_region_id IS NOT NULL AND
+         region_snapshot_snapshot_id IS NOT NULL
+       )
+     )
+    )
+);
diff --git a/schema/crdb/crucible-ref-count-records/up05.sql b/schema/crdb/crucible-ref-count-records/up05.sql
new file mode 100644
index 0000000000..a88a3fc6fd
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up05.sql
@@ -0,0 +1,3 @@
+CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage (
+    region_id
+);
diff --git a/schema/crdb/crucible-ref-count-records/up06.sql b/schema/crdb/crucible-ref-count-records/up06.sql
new file mode 100644
index 0000000000..3262b194be
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up06.sql
@@ -0,0 +1,3 @@
+CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage (
+    region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id
+);
diff --git a/schema/crdb/crucible-ref-count-records/up07.sql b/schema/crdb/crucible-ref-count-records/up07.sql
new file mode 100644
index 0000000000..91c788be1b
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up07.sql
@@ -0,0 +1,8 @@
+CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage (
+    volume_id,
+    usage_type,
+    region_id,
+    region_snapshot_dataset_id,
+    region_snapshot_region_id,
+    region_snapshot_snapshot_id
+);
diff --git a/schema/crdb/crucible-ref-count-records/up08.sql b/schema/crdb/crucible-ref-count-records/up08.sql
new file mode 100644
index 0000000000..49374301ec
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up08.sql
@@ -0,0 +1,31 @@
+/* Populate the volume resource usage records from the existing region
+   snapshot records (the anti-join below keeps this INSERT idempotent) */
+INSERT INTO volume_resource_usage (
+    SELECT
+        gen_random_uuid() AS usage_id,
+        volume.id AS volume_id,
+        'region_snapshot' AS usage_type,
+        NULL AS region_id,
+        region_snapshot.dataset_id AS region_snapshot_dataset_id,
+        region_snapshot.region_id AS region_snapshot_region_id,
+        region_snapshot.snapshot_id AS region_snapshot_snapshot_id
+    FROM
+        volume
+    JOIN
+        region_snapshot
+    ON
+        volume.data like ('%' || region_snapshot.snapshot_addr || '%')
+    LEFT JOIN
+        volume_resource_usage
+    ON
+        volume_resource_usage.volume_id = volume.id AND
+        volume_resource_usage.usage_type = 'region_snapshot' AND
+        volume_resource_usage.region_id IS NULL AND
+        volume_resource_usage.region_snapshot_region_id = region_snapshot.region_id AND
+        volume_resource_usage.region_snapshot_dataset_id = region_snapshot.dataset_id AND
+        volume_resource_usage.region_snapshot_snapshot_id = region_snapshot.snapshot_id
+    WHERE
+        volume.time_deleted is NULL AND
+        region_snapshot.deleting = false AND
+        volume_resource_usage.usage_id IS NULL
+);
diff --git a/schema/crdb/crucible-ref-count-records/up09.sql b/schema/crdb/crucible-ref-count-records/up09.sql
new file mode 100644
index 0000000000..be56ed401c
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up09.sql
@@ -0,0 +1,2 @@
+CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only
+    on omicron.public.region (read_only);
diff --git a/schema/crdb/crucible-ref-count-records/up10.sql b/schema/crdb/crucible-ref-count-records/up10.sql
new file mode 100644
index 0000000000..5d7390e124
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up10.sql
@@ -0,0 +1 @@
+ALTER TABLE omicron.public.region ADD COLUMN IF NOT EXISTS deleting BOOLEAN NOT NULL DEFAULT false;
diff --git a/schema/crdb/crucible-ref-count-records/up11.sql b/schema/crdb/crucible-ref-count-records/up11.sql
new file mode 100644
index 0000000000..ef7845c8d0
--- /dev/null
+++ b/schema/crdb/crucible-ref-count-records/up11.sql
@@ -0,0 +1 @@
+ALTER TABLE omicron.public.region ALTER COLUMN deleting DROP DEFAULT;
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index b095c4c89a..f689c7e9f7 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -619,6 +619,8 @@ CREATE INDEX IF NOT EXISTS lookup_dataset_by_zpool on omicron.public.dataset (
     id
 ) WHERE pool_id IS NOT NULL AND time_deleted IS NULL;
 
+CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip);
+
 /*
  * A region of space allocated to Crucible Downstairs, within a dataset.
  */
@@ -641,7 +643,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.region (
 
     port INT4,
 
-    read_only BOOL NOT NULL
+    read_only BOOL NOT NULL,
+
+    deleting BOOL NOT NULL
 );
 
 /*
@@ -664,6 +668,9 @@ CREATE INDEX IF NOT EXISTS lookup_regions_missing_ports
     on omicron.public.region (id)
     WHERE port IS NULL;
 
+CREATE INDEX IF NOT EXISTS lookup_regions_by_read_only
+    on omicron.public.region (read_only);
+
 /*
  * A snapshot of a region, within a dataset.
  */
@@ -698,6 +705,10 @@ CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_region_id on omicron.public.region_snapshot (
     region_id
 );
 
+CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot (
+    deleting
+);
+
 /*
  * Index on volume_references and snapshot_addr for crucible
  * resource accounting lookup
@@ -4589,6 +4600,78 @@ CREATE INDEX IF NOT EXISTS lookup_bgp_config_by_bgp_announce_set_id ON omicron.public.bgp_config (
 ) WHERE time_deleted IS NULL;
 
+CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM (
+    'read_only_region',
+    'region_snapshot'
+);
+
+/*
+ * This table records when a Volume makes use of a read-only resource. When
+ * there are no more entries for a particular read-only resource, then that
+ * resource can be garbage collected.
+ */
+CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage (
+    usage_id UUID NOT NULL,
+
+    volume_id UUID NOT NULL,
+
+    usage_type omicron.public.volume_resource_usage_type NOT NULL,
+
+    /*
+     * This column contains a non-NULL value when the usage type is read_only
+     * region
+     */
+    region_id UUID,
+
+    /*
+     * These columns contain non-NULL values when the usage type is region
+     * snapshot
+     */
+    region_snapshot_dataset_id UUID,
+    region_snapshot_region_id UUID,
+    region_snapshot_snapshot_id UUID,
+
+    PRIMARY KEY (usage_id),
+
+    CONSTRAINT exactly_one_usage_source CHECK (
+     (
+       (usage_type = 'read_only_region') AND
+       (region_id IS NOT NULL) AND
+       (
+         region_snapshot_dataset_id IS NULL AND
+         region_snapshot_region_id IS NULL AND
+         region_snapshot_snapshot_id IS NULL
+       )
+     )
+    OR
+     (
+       (usage_type = 'region_snapshot') AND
+       (region_id IS NULL) AND
+       (
+         region_snapshot_dataset_id IS NOT NULL AND
+         region_snapshot_region_id IS NOT NULL AND
+         region_snapshot_snapshot_id IS NOT NULL
+       )
+     )
+    )
+);
+
+CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage (
+    region_id
+);
+
+CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage (
+    region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage (
+    volume_id,
+    usage_type,
+    region_id,
+    region_snapshot_dataset_id,
+    region_snapshot_region_id,
+    region_snapshot_snapshot_id
+);
 
 /*
  * Keep this at the end of file so that the database does not contain a version
@@ -4601,7 +4684,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    (TRUE, NOW(), NOW(), '113.0.0', NULL)
+    (TRUE, NOW(), NOW(), '114.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs
index 589ba87700..8369499658 100644
--- a/sled-agent/src/sim/storage.rs
+++ b/sled-agent/src/sim/storage.rs
@@ -124,7 +124,7 @@ impl CrucibleDataInner {
             key_pem: None,
             root_pem: None,
             source: None,
-            read_only: false,
+            read_only: params.source.is_some(),
         };
 
         let old = self.regions.insert(id, region.clone());
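The comment on omicron.public.volume_resource_usage above states the garbage-collection rule this change is built around: a read-only resource may only be cleaned up once no volume holds a usage record for it. As a minimal sketch of that check for a single read-only region (not the datastore's actual query, which presumably runs inside a transaction together with the new region.deleting flag), using only the table and indexes defined above:

    SELECT NOT EXISTS (
        SELECT 1
        FROM omicron.public.volume_resource_usage
        WHERE usage_type = 'read_only_region'
          AND region_id = $1   -- the read-only region being considered for deletion
    ) AS can_delete;

Because one_record_per_volume_resource_usage is unique over (volume_id, usage_type, region_id, ...), each referencing volume contributes at most one row here, so this EXISTS test going false genuinely means no volume references the region anymore.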