From ae4282a3aa5adaa5c831af38af82ca84cf9076b6 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Fri, 4 Oct 2024 16:29:13 +0000
Subject: [PATCH] Implement record based Crucible reference counting

Crucible volumes are created by layering read-write regions over a hierarchy
of read-only resources. Originally only a region snapshot could be used as a
read-only resource for a volume. With the introduction of read-only regions
(created during the region snapshot replacement process) this is no longer
true!

Read-only resources can be used by many volumes, and because of this they need
a reference count so they can be deleted once they're no longer referenced.
The region_snapshot table uses a `volume_references` column, which counts how
many uses there are. The region table does not have this column, and moreover
a simple integer works for reference counting but does not tell you _which_
volume each use comes from. This can be determined (see omdb's validate volume
references command), but that information is tossed out, even though Nexus
knows which volumes use which resources. Instead, record the read-only
resources that a volume uses in a new table.

As part of the schema change that adds the new `volume_resource_usage` table,
a migration is included that creates the appropriate records for all existing
region snapshots.

In testing, a few bugs were found, the worst being that read-only regions did
not have their read_only column set to true. This would have been a problem if
any read-only regions had been created, but they are currently only created
during region snapshot replacement, which was disabled. To detect whether any
such regions were created, find all regions that were allocated for a snapshot
volume:

    SELECT id FROM region WHERE volume_id IN (SELECT volume_id FROM snapshot);

A similar bug was found in the simulated Crucible agent.

This commit also reverts #6728, re-enabling region snapshot replacement: it
was disabled due to the lack of read-only region reference counting, which
this change adds.
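For reference, the new table records one row per (volume, read-only resource)
use. The sketch below is illustrative only: column and type names follow the
Diesel schema added in this patch, but the constraints shown here are
assumptions, and the authoritative DDL (plus the backfill of existing region
snapshot references) lives in the crucible-ref-count-records migration files
and dbinit.sql:

    -- Illustrative sketch; see schema/crdb/crucible-ref-count-records/*.sql
    -- and schema/crdb/dbinit.sql for the real definitions.
    CREATE TYPE volume_resource_usage_type AS ENUM (
        'read_only_region',
        'region_snapshot'
    );

    CREATE TABLE volume_resource_usage (
        usage_id UUID PRIMARY KEY,
        volume_id UUID NOT NULL,
        usage_type volume_resource_usage_type NOT NULL,

        -- populated for 'read_only_region' records
        region_id UUID,

        -- populated for 'region_snapshot' records
        region_snapshot_dataset_id UUID,
        region_snapshot_region_id UUID,
        region_snapshot_snapshot_id UUID
    );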
--- nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 16 + nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/volume_resource_usage.rs | 132 ++ nexus/db-queries/src/db/datastore/mod.rs | 4 +- nexus/db-queries/src/db/datastore/region.rs | 2 +- nexus/db-queries/src/db/datastore/volume.rs | 1576 ++++++++++--- nexus/db-queries/src/db/queries/mod.rs | 1 - nexus/db-queries/src/db/queries/volume.rs | 114 - nexus/src/app/background/init.rs | 32 +- .../region_snapshot_replacement_finish.rs | 12 +- ...on_snapshot_replacement_garbage_collect.rs | 23 +- .../region_snapshot_replacement_start.rs | 15 +- .../tasks/region_snapshot_replacement_step.rs | 36 +- nexus/src/app/sagas/disk_create.rs | 17 + .../region_snapshot_replacement_start.rs | 1 + .../integration_tests/volume_management.rs | 2091 ++++++++++++++++- .../crdb/crucible-ref-count-records/up01.sql | 1 + .../crdb/crucible-ref-count-records/up02.sql | 3 + .../crdb/crucible-ref-count-records/up03.sql | 4 + .../crdb/crucible-ref-count-records/up04.sql | 15 + .../crdb/crucible-ref-count-records/up05.sql | 3 + .../crdb/crucible-ref-count-records/up06.sql | 3 + .../crdb/crucible-ref-count-records/up07.sql | 8 + .../crdb/crucible-ref-count-records/up08.sql | 29 + schema/crdb/dbinit.sql | 45 +- sled-agent/src/sim/storage.rs | 2 +- 27 files changed, 3678 insertions(+), 512 deletions(-) create mode 100644 nexus/db-model/src/volume_resource_usage.rs delete mode 100644 nexus/db-queries/src/db/queries/volume.rs create mode 100644 schema/crdb/crucible-ref-count-records/up01.sql create mode 100644 schema/crdb/crucible-ref-count-records/up02.sql create mode 100644 schema/crdb/crucible-ref-count-records/up03.sql create mode 100644 schema/crdb/crucible-ref-count-records/up04.sql create mode 100644 schema/crdb/crucible-ref-count-records/up05.sql create mode 100644 schema/crdb/crucible-ref-count-records/up06.sql create mode 100644 schema/crdb/crucible-ref-count-records/up07.sql create mode 100644 schema/crdb/crucible-ref-count-records/up08.sql diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 250fbcb369..7473d5f6b1 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -107,6 +107,7 @@ mod vmm; mod vni; mod volume; mod volume_repair; +mod volume_resource_usage; mod vpc; mod vpc_firewall_rule; mod vpc_route; @@ -215,6 +216,7 @@ pub use vmm_state::*; pub use vni::*; pub use volume::*; pub use volume_repair::*; +pub use volume_resource_usage::*; pub use vpc::*; pub use vpc_firewall_rule::*; pub use vpc_route::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index d9e2c43e75..3eeee099eb 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1967,3 +1967,19 @@ joinable!(instance_ssh_key -> instance (instance_id)); allow_tables_to_appear_in_same_query!(sled, sled_instance); joinable!(network_interface -> probe (parent_id)); + +table! { + volume_resource_usage (usage_id) { + usage_id -> Uuid, + + volume_id -> Uuid, + + usage_type -> crate::VolumeResourceUsageTypeEnum, + + region_id -> Nullable, + + region_snapshot_dataset_id -> Nullable, + region_snapshot_region_id -> Nullable, + region_snapshot_snapshot_id -> Nullable, + } +} diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 12e03e6d4e..c9398114c1 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. 
Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(107, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(108, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(108, "crucible-ref-count-records"), KnownVersion::new(107, "add-instance-boot-disk"), KnownVersion::new(106, "dataset-kinds-update"), KnownVersion::new(105, "inventory-nvme-firmware"), diff --git a/nexus/db-model/src/volume_resource_usage.rs b/nexus/db-model/src/volume_resource_usage.rs new file mode 100644 index 0000000000..0095baf471 --- /dev/null +++ b/nexus/db-model/src/volume_resource_usage.rs @@ -0,0 +1,132 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use crate::schema::volume_resource_usage; +use uuid::Uuid; + +impl_enum_type!( + #[derive(SqlType, Debug, QueryId)] + #[diesel( + postgres_type(name = "volume_resource_usage_type", schema = "public") + )] + pub struct VolumeResourceUsageTypeEnum; + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq, Eq, Hash)] + #[diesel(sql_type = VolumeResourceUsageTypeEnum)] + pub enum VolumeResourceUsageType; + + ReadOnlyRegion => b"read_only_region" + RegionSnapshot => b"region_snapshot" +); + +/// Crucible volumes are created by layering read-write regions over a hierarchy +/// of read-only resources. Originally only a region snapshot could be used as a +/// read-only resource for a volume. With the introduction of read-only regions +/// (created during the region snapshot replacement process) this is no longer +/// true. +/// +/// Read-only resources can be used by many volumes, and because of this they +/// need to have a reference count so they can be deleted when they're not +/// referenced anymore. The region_snapshot table uses a `volume_references` +/// column, which counts how many uses there are. The region table does not have +/// this column, and more over a simple integer works for reference counting but +/// does not tell you _what_ volume that use is from. This can be determined +/// (see omdb's validate volume references command) but it's information that is +/// tossed out, as Nexus knows what volumes use what resources! Instead, record +/// what read-only resources a volume uses here. +/// +/// Note: users should not use this object directly, and instead use the +/// [`VolumeResourceUsage`] enum, which is type-safe and will convert to and +/// from a [`VolumeResourceUsageRecord`] when interacting with the DB. 
+#[derive( + Queryable, Insertable, Debug, Clone, Selectable, PartialEq, Eq, Hash, +)] +#[diesel(table_name = volume_resource_usage)] +pub struct VolumeResourceUsageRecord { + pub usage_id: Uuid, + + pub volume_id: Uuid, + + pub usage_type: VolumeResourceUsageType, + + pub region_id: Option, + + pub region_snapshot_dataset_id: Option, + pub region_snapshot_region_id: Option, + pub region_snapshot_snapshot_id: Option, +} + +#[derive(Debug, Clone)] +pub enum VolumeResourceUsage { + ReadOnlyRegion { region_id: Uuid }, + + RegionSnapshot { dataset_id: Uuid, region_id: Uuid, snapshot_id: Uuid }, +} + +impl VolumeResourceUsageRecord { + pub fn new(volume_id: Uuid, usage: VolumeResourceUsage) -> Self { + match usage { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + VolumeResourceUsageRecord { + usage_id: Uuid::new_v4(), + volume_id, + usage_type: VolumeResourceUsageType::ReadOnlyRegion, + + region_id: Some(region_id), + + region_snapshot_dataset_id: None, + region_snapshot_region_id: None, + region_snapshot_snapshot_id: None, + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => VolumeResourceUsageRecord { + usage_id: Uuid::new_v4(), + volume_id, + usage_type: VolumeResourceUsageType::RegionSnapshot, + + region_id: None, + + region_snapshot_dataset_id: Some(dataset_id), + region_snapshot_region_id: Some(region_id), + region_snapshot_snapshot_id: Some(snapshot_id), + }, + } + } +} + +impl From for VolumeResourceUsage { + fn from(record: VolumeResourceUsageRecord) -> VolumeResourceUsage { + match record.usage_type { + VolumeResourceUsageType::ReadOnlyRegion => { + VolumeResourceUsage::ReadOnlyRegion { + region_id: record + .region_id + .expect("valid read-only region usage record"), + } + } + + VolumeResourceUsageType::RegionSnapshot => { + VolumeResourceUsage::RegionSnapshot { + dataset_id: record + .region_snapshot_dataset_id + .expect("valid region snapshot usage record"), + + region_id: record + .region_snapshot_region_id + .expect("valid region snapshot usage record"), + + snapshot_id: record + .region_snapshot_snapshot_id + .expect("valid region snapshot usage record"), + } + } + } + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index ec317c184f..7aaff8ccb4 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -793,7 +793,7 @@ mod test { } #[derive(Debug)] - struct TestDatasets { + pub(crate) struct TestDatasets { // eligible and ineligible aren't currently used, but are probably handy // for the future. 
#[allow(dead_code)] @@ -810,7 +810,7 @@ mod test { type SledToDatasetMap = HashMap>; impl TestDatasets { - async fn create( + pub(crate) async fn create( opctx: &OpContext, datastore: Arc, num_eligible_sleds: usize, diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index df733af5b5..f555e0af6b 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -264,7 +264,7 @@ impl DataStore { block_size, blocks_per_extent, extent_count, - read_only: false, + read_only: maybe_snapshot_id.is_some(), }, allocation_strategy, num_regions_required, diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 3bd0ef41ed..d7d6b10081 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -7,6 +7,7 @@ use super::DataStore; use crate::db; use crate::db::datastore::OpContext; +use crate::db::datastore::RunnableQuery; use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; @@ -22,9 +23,11 @@ use crate::db::model::UpstairsRepairNotification; use crate::db::model::UpstairsRepairNotificationType; use crate::db::model::UpstairsRepairProgress; use crate::db::model::Volume; +use crate::db::model::VolumeResourceUsage; +use crate::db::model::VolumeResourceUsageRecord; +use crate::db::model::VolumeResourceUsageType; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; -use crate::db::queries::volume::*; use crate::db::DbConnection; use crate::transaction_retry::OptionalError; use anyhow::bail; @@ -53,6 +56,7 @@ use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; use std::collections::VecDeque; +use std::net::AddrParseError; use std::net::SocketAddr; use std::net::SocketAddrV6; use uuid::Uuid; @@ -91,19 +95,187 @@ enum VolumeGetError { InvalidVolume(String), } +#[derive(Debug, thiserror::Error)] +enum VolumeCreationError { + #[error("Error from Volume creation: {0}")] + Public(Error), + + #[error("Serde error during Volume creation: {0}")] + SerdeError(#[from] serde_json::Error), + + #[error("Address parsing error during Volume creation: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), +} + impl DataStore { - pub async fn volume_create(&self, volume: Volume) -> CreateResult { + async fn volume_create_txn( + conn: &async_bb8_diesel::Connection, + err: OptionalError, + volume: Volume, + crucible_targets: CrucibleTargets, + ) -> Result { use db::schema::volume::dsl; - #[derive(Debug, thiserror::Error)] - enum VolumeCreationError { - #[error("Error from Volume creation: {0}")] - Public(Error), + let maybe_volume: Option = dsl::volume + .filter(dsl::id.eq(volume.id())) + .select(Volume::as_select()) + .first_async(conn) + .await + .optional()?; - #[error("Serde error during Volume creation: {0}")] - SerdeError(#[from] serde_json::Error), + // If the volume existed already, return it and do not increase usage + // counts. 
+ if let Some(volume) = maybe_volume { + return Ok(volume); + } + + let volume: Volume = diesel::insert_into(dsl::volume) + .values(volume.clone()) + .returning(Volume::as_returning()) + .get_result_async(conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeCreationError::Public(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Volume, + volume.id().to_string().as_str(), + ), + )) + }) + })?; + + // Increase the usage count for the read-only Crucible resources + use db::schema::volume_resource_usage::dsl as ru_dsl; + + for read_only_target in crucible_targets.read_only_targets { + let sub_err = OptionalError::new(); + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &sub_err, + &read_only_target, + ) + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(VolumeCreationError::AddressParseError(sub_err)) + } else { + e + } + })?; + + match maybe_usage { + Some(usage) => { + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(VolumeResourceUsageRecord::new( + volume.id(), + usage, + )) + .execute_async(conn) + .await?; + } + + None => { + // Something went wrong, bail out - we can't create this + // Volume if we can't record its resource usage correctly + return Err(err.bail( + VolumeCreationError::CouldNotFindResource( + read_only_target, + ), + )); + } + } + } + + Ok(volume) + } + + async fn target_to_region( + conn: &async_bb8_diesel::Connection, + err: &OptionalError, + target: &str, + read_only: bool, + ) -> Result, diesel::result::Error> { + let address: SocketAddrV6 = target.parse().map_err(|e| err.bail(e))?; + let ip: db::model::Ipv6Addr = address.ip().into(); + + // Match region by dataset id and target port + + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl as region_dsl; + + dataset_dsl::dataset + .inner_join( + region_dsl::region + .on(region_dsl::dataset_id.eq(dataset_dsl::id)), + ) + .filter(dataset_dsl::ip.eq(ip)) + .filter( + region_dsl::port + .eq(Some::(address.port().into())), + ) + .filter(region_dsl::read_only.eq(read_only)) + .select(Region::as_select()) + .get_result_async::(conn) + .await + .optional() + } + + async fn read_only_target_to_volume_resource_usage( + conn: &async_bb8_diesel::Connection, + err: &OptionalError, + read_only_target: &str, + ) -> Result, diesel::result::Error> { + // Easy case: it's a region snapshot, and we can match by the snapshot + // address directly + + let maybe_region_snapshot = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::snapshot_addr.eq(read_only_target.to_string())) + .filter(dsl::deleting.eq(false)) + .select(RegionSnapshot::as_select()) + .get_result_async::(conn) + .await + .optional()? + }; + + if let Some(region_snapshot) = maybe_region_snapshot { + return Ok(Some(VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + })); } + // Less easy case: it's a read-only region, and we have to match by + // dataset ip and region port + + let maybe_region = Self::target_to_region( + conn, + err, + read_only_target, + true, // read-only + ) + .await?; + + if let Some(region) = maybe_region { + return Ok(Some(VolumeResourceUsage::ReadOnlyRegion { + region_id: region.id(), + })); + } + + // If the resource was hard-deleted, or in the off chance that the + // region didn't have an assigned port, return None here. 
+ Ok(None) + } + + pub async fn volume_create(&self, volume: Volume) -> CreateResult { // Grab all the targets that the volume construction request references. // Do this outside the transaction, as the data inside volume doesn't // change and this would simply add to the transaction time. @@ -132,67 +304,13 @@ impl DataStore { let crucible_targets = crucible_targets.clone(); let volume = volume.clone(); async move { - let maybe_volume: Option = dsl::volume - .filter(dsl::id.eq(volume.id())) - .select(Volume::as_select()) - .first_async(&conn) - .await - .optional()?; - - // If the volume existed already, return it and do not increase - // usage counts. - if let Some(volume) = maybe_volume { - return Ok(volume); - } - - // TODO do we need on_conflict do_nothing here? if the transaction - // model is read-committed, the SELECT above could return nothing, - // and the INSERT here could still result in a conflict. - // - // See also https://github.com/oxidecomputer/omicron/issues/1168 - let volume: Volume = diesel::insert_into(dsl::volume) - .values(volume.clone()) - .on_conflict(dsl::id) - .do_nothing() - .returning(Volume::as_returning()) - .get_result_async(&conn) - .await - .map_err(|e| { - err.bail_retryable_or_else(e, |e| { - VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Volume, - volume.id().to_string().as_str(), - ), - ), - ) - }) - })?; - - // Increase the usage count for Crucible resources according to the - // contents of the volume. - - // Increase the number of uses for each referenced region snapshot. - use db::schema::region_snapshot::dsl as rs_dsl; - for read_only_target in &crucible_targets.read_only_targets - { - diesel::update(rs_dsl::region_snapshot) - .filter( - rs_dsl::snapshot_addr - .eq(read_only_target.clone()), - ) - .filter(rs_dsl::deleting.eq(false)) - .set( - rs_dsl::volume_references - .eq(rs_dsl::volume_references + 1), - ) - .execute_async(&conn) - .await?; - } - - Ok(volume) + Self::volume_create_txn( + &conn, + err, + volume, + crucible_targets, + ) + .await } }) .await @@ -200,10 +318,22 @@ impl DataStore { if let Some(err) = err.take() { match err { VolumeCreationError::Public(err) => err, + VolumeCreationError::SerdeError(err) => { Error::internal_error(&format!( - "Transaction error: {}", - err + "SerdeError error: {err}" + )) + } + + VolumeCreationError::CouldNotFindResource(s) => { + Error::internal_error(&format!( + "CouldNotFindResource error: {s}" + )) + } + + VolumeCreationError::AddressParseError(err) => { + Error::internal_error(&format!( + "AddressParseError error: {err}" )) } } @@ -213,18 +343,27 @@ impl DataStore { }) } - /// Return a `Option` based on id, even if it's soft deleted. - pub async fn volume_get( - &self, + async fn volume_get_impl( + conn: &async_bb8_diesel::Connection, volume_id: Uuid, - ) -> LookupResult> { + ) -> Result, diesel::result::Error> { use db::schema::volume::dsl; dsl::volume .filter(dsl::id.eq(volume_id)) .select(Volume::as_select()) - .first_async::(&*self.pool_connection_unauthorized().await?) + .first_async::(conn) .await .optional() + } + + /// Return a `Option` based on id, even if it's soft deleted. 
+ pub async fn volume_get( + &self, + volume_id: Uuid, + ) -> LookupResult> { + let conn = self.pool_connection_unauthorized().await?; + Self::volume_get_impl(&conn, volume_id) + .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -241,6 +380,105 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + fn volume_usage_records_for_resource_query( + resource: VolumeResourceUsage, + ) -> impl RunnableQuery { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + dsl::volume_resource_usage + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .into_boxed() + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => dsl::volume_resource_usage + .filter( + dsl::usage_type.eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .into_boxed(), + } + } + + /// For a given VolumeResourceUsage, return all found usage records for it. + pub async fn volume_usage_records_for_resource( + &self, + resource: VolumeResourceUsage, + ) -> ListResultVec { + let conn = self.pool_connection_unauthorized().await?; + + Self::volume_usage_records_for_resource_query(resource) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// When moving a resource from one volume to another, call this to update + /// the corresponding volume resource usage record + pub async fn swap_volume_usage_records_for_resources( + conn: &async_bb8_diesel::Connection, + resource: VolumeResourceUsage, + from_volume_id: Uuid, + to_volume_id: Uuid, + ) -> Result<(), diesel::result::Error> { + use db::schema::volume_resource_usage::dsl; + + match resource { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = diesel::update(dsl::volume_resource_usage) + .filter( + dsl::usage_type + .eq(VolumeResourceUsageType::RegionSnapshot), + ) + .filter(dsl::region_snapshot_dataset_id.eq(dataset_id)) + .filter(dsl::region_snapshot_region_id.eq(region_id)) + .filter(dsl::region_snapshot_snapshot_id.eq(snapshot_id)) + .filter(dsl::volume_id.eq(from_volume_id)) + .set(dsl::volume_id.eq(to_volume_id)) + .execute_async(conn) + .await?; + + if updated_rows == 0 { + return Err(diesel::result::Error::NotFound); + } + } + } + + Ok(()) + } + async fn volume_checkout_allowed( reason: &VolumeCheckoutReason, vcr: &VolumeConstructionRequest, @@ -781,17 +1019,33 @@ impl DataStore { } /// Find regions for deleted volumes that do not have associated region - /// snapshots. 
+ /// snapshots and are not being used by any other non-deleted volumes pub async fn find_deleted_volume_regions( &self, ) -> ListResultVec<(Dataset, Region, Option, Volume)> { + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("find_deleted_volume_regions") + .transaction(&conn, |conn| async move { + Self::find_deleted_volume_regions_txn(&conn).await + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + async fn find_deleted_volume_regions_txn( + conn: &async_bb8_diesel::Connection, + ) -> Result< + Vec<(Dataset, Region, Option, Volume)>, + diesel::result::Error, + > { use db::schema::dataset::dsl as dataset_dsl; use db::schema::region::dsl as region_dsl; use db::schema::region_snapshot::dsl; use db::schema::volume::dsl as volume_dsl; + use db::schema::volume_resource_usage::dsl as ru_dsl; // Find all regions and datasets - region_dsl::region + let tuples_superset = region_dsl::region .inner_join( volume_dsl::volume.on(region_dsl::volume_id.eq(volume_dsl::id)), ) @@ -824,9 +1078,29 @@ impl DataStore { Option::::as_select(), Volume::as_select(), )) - .load_async(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .load_async(conn) + .await?; + + // Filter out read-only regions that are still being used by volumes + let mut tuples = Vec::with_capacity(tuples_superset.len()); + + for (dataset, region, region_snapshot, volume) in tuples_superset { + let region_usage_left = ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(ru_dsl::region_id.eq(region.id())) + .count() + .get_result_async::(conn) + .await?; + + if region_usage_left == 0 { + tuples.push((dataset, region, region_snapshot, volume)); + } + } + + Ok(tuples) } pub async fn read_only_resources_associated_with_volume( @@ -857,8 +1131,15 @@ enum SoftDeleteTransactionError { #[error("Serde error decreasing Crucible resources: {0}")] SerdeError(#[from] serde_json::Error), - #[error("Updated {0} database rows, expected 1")] - UnexpectedDatabaseUpdate(usize), + #[error("Updated {0} database rows in {1}, expected 1")] + UnexpectedDatabaseUpdate(usize, String), + + // XXX is this an error? delete volume anyway, else we're stuck? + #[error("Could not match resource to {0}")] + CouldNotFindResource(String), + + #[error("Address parsing error during Volume soft-delete: {0}")] + AddressParseError(#[from] AddrParseError), } impl DataStore { @@ -923,8 +1204,8 @@ impl DataStore { .map_err(|e| err.bail(e.into()))?; // Grab all the targets that the volume construction request references. - // Do this _inside_ the transaction, as the read-only parent data inside - // volume can change as a result of region snapshot replacement. + // Do this _inside_ the transaction, as the data inside volumes can + // change as a result of region / region snapshot replacement. let crucible_targets = { let mut crucible_targets = CrucibleTargets::default(); read_only_resources_associated_with_volume( @@ -934,52 +1215,237 @@ impl DataStore { crucible_targets }; - // Decrease the number of references for each region snapshot that a - // volume references, returning the updated rows. 
- let updated_region_snapshots = ConditionallyDecreaseReferences::new( - crucible_targets.read_only_targets, - ) - .get_results_async::(conn) - .await?; + // Decrease the number of references for each resource that a volume + // references, collecting the regions and region snapshots that were + // freed up for deletion. - // Return all regions for the volume to be cleaned up. - let regions: Vec = { - use db::schema::region::dsl as region_dsl; - use db::schema::region_snapshot::dsl as region_snapshot_dsl; + let num_read_write_subvolumes = count_read_write_sub_volumes(&vcr); - region_dsl::region - .left_join( - region_snapshot_dsl::region_snapshot - .on(region_snapshot_dsl::region_id.eq(region_dsl::id)), - ) - .filter( - region_snapshot_dsl::volume_references - .eq(0) - .or(region_snapshot_dsl::volume_references.is_null()), + let mut regions: Vec = + Vec::with_capacity(3 * num_read_write_subvolumes); + + let mut region_snapshots: Vec = + Vec::with_capacity(crucible_targets.read_only_targets.len()); + + // First, grab read-write regions - they're not shared, but they are + // not candidates for deletion if there are region snapshots + { + let mut read_write_targets = + Vec::with_capacity(3 * num_read_write_subvolumes); + read_write_resources_associated_with_volume( + &vcr, + &mut read_write_targets, + ); + + for target in read_write_targets { + let sub_err = OptionalError::new(); + + let maybe_region = Self::target_to_region( + conn, &sub_err, &target, false, // read-write ) - .filter(region_dsl::volume_id.eq(volume_id)) - .select(Region::as_select()) - .get_results_async(conn) - .await? - .into_iter() - .map(|region| region.id()) - .collect() - }; + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(SoftDeleteTransactionError::AddressParseError( + sub_err, + )) + } else { + e + } + })?; - // Return the region snapshots that had their volume_references updated - // to 0 (and had the deleting flag set) to be cleaned up. - let region_snapshots = updated_region_snapshots - .into_iter() - .filter(|region_snapshot| { - region_snapshot.volume_references == 0 - && region_snapshot.deleting - }) - .map(|region_snapshot| RegionSnapshotV3 { - dataset: region_snapshot.dataset_id, - region: region_snapshot.region_id, - snapshot: region_snapshot.snapshot_id, - }) - .collect(); + let Some(region) = maybe_region else { + return Err(err.bail( + SoftDeleteTransactionError::CouldNotFindResource( + format!("could not find resource for {target}"), + ), + )); + }; + + // Filter out regions that have any region-snapshots + let region_snapshot_count: i64 = { + use db::schema::region_snapshot::dsl; + dsl::region_snapshot + .filter(dsl::region_id.eq(region.id())) + .count() + .get_result_async::(conn) + .await? 
+ }; + + if region_snapshot_count == 0 { + regions.push(region.id()); + } + } + } + + for read_only_target in &crucible_targets.read_only_targets { + use db::schema::volume_resource_usage::dsl as ru_dsl; + + let sub_err = OptionalError::new(); + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + conn, + &sub_err, + read_only_target, + ) + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(SoftDeleteTransactionError::AddressParseError( + sub_err, + )) + } else { + e + } + })?; + + let Some(usage) = maybe_usage else { + return Err(err.bail( + SoftDeleteTransactionError::CouldNotFindResource(format!( + "could not find resource for {read_only_target}" + )), + )); + }; + + // For each read-only resource, remove the associated volume + // resource usage record for this volume. Only return a resource for + // deletion if no more associated volume usage records are found. + match usage { + VolumeResourceUsage::ReadOnlyRegion { region_id } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::ReadOnlyRegion, + ), + ) + .filter(ru_dsl::region_id.eq(Some(region_id))) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage (region)".into(), + ) + )); + } + + let region_usage_left = ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type + .eq(VolumeResourceUsageType::ReadOnlyRegion), + ) + .filter(ru_dsl::region_id.eq(region_id)) + .count() + .get_result_async::(conn) + .await?; + + if region_usage_left == 0 { + regions.push(region_id); + } + } + + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + } => { + let updated_rows = + diesel::delete(ru_dsl::volume_resource_usage) + .filter(ru_dsl::volume_id.eq(volume_id)) + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + updated_rows, + "volume_resource_usage \ + (region_snapshot)".into(), + ) + )); + } + + let region_snapshot_usage_left = + ru_dsl::volume_resource_usage + .filter( + ru_dsl::usage_type.eq( + VolumeResourceUsageType::RegionSnapshot, + ), + ) + .filter( + ru_dsl::region_snapshot_dataset_id + .eq(Some(dataset_id)), + ) + .filter( + ru_dsl::region_snapshot_region_id + .eq(Some(region_id)), + ) + .filter( + ru_dsl::region_snapshot_snapshot_id + .eq(Some(snapshot_id)), + ) + .count() + .get_result_async::(conn) + .await?; + + if region_snapshot_usage_left == 0 { + // Don't forget to set `deleting`! 
see: omicron#4095 + { + use db::schema::region_snapshot::dsl; + let updated_rows = + diesel::update(dsl::region_snapshot) + .filter(dsl::dataset_id.eq(dataset_id)) + .filter(dsl::region_id.eq(region_id)) + .filter(dsl::snapshot_id.eq(snapshot_id)) + .filter( + dsl::snapshot_addr + .eq(read_only_target.clone()), + ) + .filter(dsl::deleting.eq(false)) + .set(dsl::deleting.eq(true)) + .execute_async(conn) + .await?; + + if updated_rows != 1 { + return Err(err.bail( + SoftDeleteTransactionError::UnexpectedDatabaseUpdate( + updated_rows, + "setting deleting".into(), + ) + )); + } + } + + region_snapshots.push(RegionSnapshotV3 { + dataset: dataset_id, + region: region_id, + snapshot: snapshot_id, + }); + } + } + } + } let resources_to_delete = CrucibleResources::V3(CrucibleResourcesV3 { regions, @@ -1005,6 +1471,7 @@ impl DataStore { return Err(err.bail( SoftDeleteTransactionError::UnexpectedDatabaseUpdate( updated_rows, + "volume".into(), ), )); } @@ -1053,11 +1520,20 @@ impl DataStore { ) -> Result { #[derive(Debug, thiserror::Error)] enum RemoveReadOnlyParentError { - #[error("Serde error removing read only parent: {0}")] + #[error("Error removing read-only parent: {0}")] + Public(Error), + + #[error("Serde error removing read-only parent: {0}")] SerdeError(#[from] serde_json::Error), #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), + + #[error("Address parsing error during ROP removal: {0}")] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), } // In this single transaction: @@ -1080,8 +1556,8 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - // Grab the volume in question. If the volume record was already - // deleted then we can just return. + // Grab the volume in question. If the volume record was + // already deleted then we can just return. let volume = { use db::schema::volume::dsl; @@ -1100,8 +1576,8 @@ impl DataStore { }; if volume.time_deleted.is_some() { - // this volume is deleted, so let whatever is deleting - // it clean it up. + // this volume is deleted, so let whatever is + // deleting it clean it up. return Ok(false); } else { // A volume record exists, and was not deleted, we @@ -1110,8 +1586,8 @@ impl DataStore { } }; - // If a read_only_parent exists, remove it from volume_id, and - // attach it to temp_volume_id. + // If a read_only_parent exists, remove it from volume_id, + // and attach it to temp_volume_id. let vcr: VolumeConstructionRequest = serde_json::from_str( volume.data() @@ -1179,6 +1655,7 @@ impl DataStore { sub_volumes: vec![], read_only_parent, }; + let rop_volume_data = serde_json::to_string( &rop_vcr @@ -1188,6 +1665,7 @@ impl DataStore { e, )) })?; + // Update the temp_volume_id with the volume // data that contains the read_only_parent. 
let num_updated = @@ -1197,20 +1675,73 @@ impl DataStore { .set(volume_dsl::data.eq(rop_volume_data)) .execute_async(&conn) .await?; + if num_updated != 1 { return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); } + + // Update the volume resource usage record for + // every read-only resource in the ROP + let crucible_targets = { + let mut crucible_targets = CrucibleTargets::default(); + read_only_resources_associated_with_volume( + &rop_vcr, + &mut crucible_targets, + ); + crucible_targets + }; + + for read_only_target in crucible_targets.read_only_targets { + let sub_err = OptionalError::new(); + + let maybe_usage = Self::read_only_target_to_volume_resource_usage( + &conn, + &sub_err, + &read_only_target, + ) + .await + .map_err(|e| { + if let Some(sub_err) = sub_err.take() { + err.bail(RemoveReadOnlyParentError::AddressParseError(sub_err)) + } else { + e + } + })?; + + let Some(usage) = maybe_usage else { + return Err(err.bail( + RemoveReadOnlyParentError::CouldNotFindResource(format!( + "could not find resource for {read_only_target}" + )), + )); + }; + + Self::swap_volume_usage_records_for_resources( + &conn, + usage, + volume_id, + temp_volume_id, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + RemoveReadOnlyParentError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + } + Ok(true) } } - VolumeConstructionRequest::File { id: _, block_size: _, path: _ } - | VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _ } - | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { + + VolumeConstructionRequest::File { .. } + | VolumeConstructionRequest::Region { .. } + | VolumeConstructionRequest::Url { .. 
} => { // Volume has a format that does not contain ROPs Ok(false) } @@ -1608,7 +2139,7 @@ pub struct CrucibleResourcesV2 { pub snapshots_to_delete: Vec, } -#[derive(Debug, Default, Serialize, Deserialize)] +#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] pub struct RegionSnapshotV3 { dataset: Uuid, region: Uuid, @@ -1895,6 +2426,8 @@ pub struct VolumeReplacementParams { #[derive(Debug, Clone, Copy)] pub struct VolumeWithTarget(pub Uuid); +// Note: it would be easier to pass around strings, but comparison could fail +// due to formatting issues, so pass around SocketAddrV6 here #[derive(Debug, Clone, Copy)] pub struct ExistingTarget(pub SocketAddrV6); @@ -2242,6 +2775,18 @@ impl DataStore { #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), + + #[error( + "Address parsing error during Volume snapshot \ + replacement: {0}" + )] + AddressParseError(#[from] AddrParseError), + + #[error("Could not match read-only resource to {0}")] + CouldNotFindResource(String), + + #[error("Multiple volume resource usage records for {0}")] + MultipleResourceUsageRecords(String), } let err = OptionalError::new(); @@ -2249,8 +2794,10 @@ impl DataStore { self.transaction_retry_wrapper("volume_replace_snapshot") .transaction(&conn, |conn| { let err = err.clone(); + async move { use db::schema::volume::dsl as volume_dsl; + use db::schema::volume_resource_usage::dsl as ru_dsl; // Grab the old volume first let maybe_old_volume = { @@ -2409,24 +2956,188 @@ impl DataStore { read_only_parent: None, }; - let volume_data = serde_json::to_string(&vcr) + let volume_data = serde_json::to_string(&vcr) + .map_err(|e| { + err.bail(VolumeReplaceSnapshotError::SerdeError(e)) + })?; + + // Update the volume to delete data + let num_updated = + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_to_delete_id.0)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(volume_data)) + .execute_async(&conn) + .await?; + + if num_updated != 1 { + return Err(err.bail( + VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( + num_updated, 1, + ) + )); + } + + // Update the appropriate volume resource usage records - it + // could either be a read-only region or a region snapshot, + // so determine what it is first + + let sub_err = OptionalError::new(); + let maybe_existing_usage = Self::read_only_target_to_volume_resource_usage( + &conn, + &sub_err, + &existing.0.to_string(), + ) + .await + .map_err(|e| if let Some(sub_err) = sub_err.take() { + err.bail(VolumeReplaceSnapshotError::AddressParseError( + sub_err + )) + } else { + e + } + )?; + + let Some(existing_usage) = maybe_existing_usage else { + return Err(err.bail( + VolumeReplaceSnapshotError::CouldNotFindResource( + format!( + "could not find resource for {}", + existing.0, + ) + )) + ); + }; + + // The "existing" target moved into the volume to delete + + Self::swap_volume_usage_records_for_resources( + &conn, + existing_usage, + volume_id.0, + volume_to_delete_id.0, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + + let sub_err = OptionalError::new(); + let maybe_replacement_usage = + Self::read_only_target_to_volume_resource_usage( + &conn, + &sub_err, + &replacement.0.to_string(), + ) + .await + .map_err(|e| if let Some(sub_err) = sub_err.take() { + err.bail(VolumeReplaceSnapshotError::AddressParseError( + sub_err + )) + } else { + e + } + )?; 
+ + let Some(replacement_usage) = maybe_replacement_usage else { + return Err(err.bail( + VolumeReplaceSnapshotError::CouldNotFindResource( + format!( + "could not find resource for {}", + existing.0, + ) + )) + ); + }; + + // This function may be called with a replacement volume + // that is completely blank, to be filled in later by this + // function. `volume_create` will have been called but will + // not have added any volume resource usage records, because + // it was blank! + // + // The indention leaving this transaction is that the + // correct volume resource usage records exist, so if this + // is the case, create a new record. + // + // If the replacement volume usage records exist, then + // perform a swap instead. + + let existing_replacement_volume_usage_records = + Self::volume_usage_records_for_resource_query( + replacement_usage.clone(), + ) + .load_async(&conn) + .await .map_err(|e| { - err.bail(VolumeReplaceSnapshotError::SerdeError(e)) - })?; + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })? + // XXX be smart enough to .filter the above query + .into_iter() + .filter(|record| record.volume_id == volume_to_delete_id.0) + .count(); + + // The "replacement" target moved into the volume + + if existing_replacement_volume_usage_records == 0 { + // No matching record + let new_record = VolumeResourceUsageRecord::new( + volume_id.0, + replacement_usage, + ); - // Update the volume to delete data - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_to_delete_id.0)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(volume_data)) + diesel::insert_into(ru_dsl::volume_resource_usage) + .values(new_record) .execute_async(&conn) - .await?; - - if num_updated != 1 { + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + } else if existing_replacement_volume_usage_records == 1 { + // One matching record: perform swap + Self::swap_volume_usage_records_for_resources( + &conn, + replacement_usage, + volume_to_delete_id.0, + volume_id.0, + ) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeReplaceSnapshotError::Public( + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + ) + }) + })?; + } else { + // More than one matching record! 
return Err(err.bail( - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate( - num_updated, 1, + VolumeReplaceSnapshotError::MultipleResourceUsageRecords( + format!("{replacement_usage:?}") ) )); } @@ -2440,19 +3151,13 @@ impl DataStore { match err { VolumeReplaceSnapshotError::Public(e) => e, - VolumeReplaceSnapshotError::SerdeError(_) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::SnapshotReplacementError(_) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) => { - Error::internal_error(&err.to_string()) - } - - VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) => { + VolumeReplaceSnapshotError::SerdeError(_) | + VolumeReplaceSnapshotError::SnapshotReplacementError(_) | + VolumeReplaceSnapshotError::UnexpectedReplacedTargets(_, _) | + VolumeReplaceSnapshotError::UnexpectedDatabaseUpdate(_, _) | + VolumeReplaceSnapshotError::AddressParseError(_) | + VolumeReplaceSnapshotError::CouldNotFindResource(_) | + VolumeReplaceSnapshotError::MultipleResourceUsageRecords(_) => { Error::internal_error(&err.to_string()) } } @@ -2463,7 +3168,7 @@ impl DataStore { } } -/// Return the targets from a VolumeConstructionRequest. +/// Return the read-only targets from a VolumeConstructionRequest. /// /// The targets of a volume construction request map to resources. pub fn read_only_resources_associated_with_volume( @@ -2508,6 +3213,65 @@ pub fn read_only_resources_associated_with_volume( } } +/// Return the read-write targets from a VolumeConstructionRequest. +/// +/// The targets of a volume construction request map to resources. +pub fn read_write_resources_associated_with_volume( + vcr: &VolumeConstructionRequest, + targets: &mut Vec, +) { + let mut parts: VecDeque<&VolumeConstructionRequest> = VecDeque::new(); + parts.push_back(&vcr); + + while let Some(vcr_part) = parts.pop_front() { + match vcr_part { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + for sub_volume in sub_volumes { + parts.push_back(sub_volume); + } + + // No need to look under read-only parent + } + + VolumeConstructionRequest::Url { .. } => { + // no action required + } + + VolumeConstructionRequest::Region { opts, .. } => { + if !opts.read_only { + for target in &opts.target { + targets.push(target.clone()); + } + } + } + + VolumeConstructionRequest::File { .. } => { + // no action required + } + } + } +} + +/// Return the number of read-write subvolumes in a VolumeConstructionRequest. +pub fn count_read_write_sub_volumes(vcr: &VolumeConstructionRequest) -> usize { + match vcr { + VolumeConstructionRequest::Volume { sub_volumes, .. } => { + sub_volumes.len() + } + + VolumeConstructionRequest::Url { .. } => 0, + + VolumeConstructionRequest::Region { .. } => { + // We don't support a pure Region VCR at the volume + // level in the database, so this choice should + // never be encountered. + panic!("Region not supported as a top level volume"); + } + + VolumeConstructionRequest::File { .. 
} => 0, + } +} + /// Returns true if the sub-volumes of a Volume are all read-only pub fn volume_is_read_only( vcr: &VolumeConstructionRequest, @@ -2780,8 +3544,14 @@ impl DataStore { mod tests { use super::*; + use crate::db::datastore::test::TestDatasets; use crate::db::datastore::test_utils::datastore_test; + use crate::db::datastore::REGION_REDUNDANCY_THRESHOLD; + use nexus_config::RegionAllocationStrategy; + use nexus_db_model::SqlU16; use nexus_test_utils::db::test_setup_database; + use nexus_types::external_api::params::DiskSource; + use omicron_common::api::external::ByteCount; use omicron_test_utils::dev; use sled_agent_client::types::CrucibleOpts; @@ -2896,47 +3666,81 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_region"); let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; - let (_opctx, db_datastore) = datastore_test(&logctx, &db).await; + let (opctx, db_datastore) = datastore_test(&logctx, &db).await; + let conn = db_datastore.pool_connection_unauthorized().await.unwrap(); - // Insert four Region records (three, plus one additionally allocated) + let _test_datasets = TestDatasets::create( + &opctx, + db_datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); - let new_volume_id = Uuid::new_v4(); - - let mut region_and_volume_ids = [ - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), volume_id), - (Uuid::new_v4(), new_volume_id), - ]; + let volume_to_delete_id = Uuid::new_v4(); - { - let conn = db_datastore.pool_connection_for_tests().await.unwrap(); + let datasets_and_regions = db_datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); - for i in 0..4 { - let (_, volume_id) = region_and_volume_ids[i]; + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); - let region = Region::new( - Uuid::new_v4(), // dataset id - volume_id, - 512_i64.try_into().unwrap(), - 10, - 10, - 10001, - false, - ); + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); - region_and_volume_ids[i].0 = region.id(); + let address: SocketAddrV6 = + db_datastore.region_addr(region.id()).await.unwrap().unwrap(); - use nexus_db_model::schema::region::dsl; - diesel::insert_into(dsl::region) - .values(region.clone()) - .execute_async(&*conn) - .await - .unwrap(); - } + region_addresses.push(address.to_string()); } + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + region.extent_count(), + 111, + false, // read-write + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = db_datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + let _volume = 
db_datastore .volume_create(nexus_db_model::Volume::new( volume_id, @@ -2951,9 +3755,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // target to replace - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), // target to replace + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -2974,26 +3778,19 @@ mod tests { // Replace one - let target = region_and_volume_ids[0]; - let replacement = region_and_volume_ids[3]; - let volume_replace_region_result = db_datastore .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, ) .await @@ -3020,9 +3817,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd55:1122:3344:101::1]:11111"), // replaced - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + replacement_region_addr.to_string(), // replaced + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3043,19 +3840,15 @@ mod tests { .volume_replace_region( /* target */ db::datastore::VolumeReplacementParams { - volume_id: target.1, - region_id: replacement.0, - region_addr: "[fd55:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id, + region_id: replacement_region.id(), + region_addr: replacement_region_addr, }, /* replacement */ db::datastore::VolumeReplacementParams { - volume_id: replacement.1, - region_id: target.0, - region_addr: "[fd00:1122:3344:101::1]:11111" - .parse() - .unwrap(), + volume_id: volume_to_delete_id, + region_id: datasets_and_regions[0].1.id(), + region_addr: region_addresses[0].parse().unwrap(), }, ) .await @@ -3082,9 +3875,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), // back to what it was - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), // back to what it was + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3109,13 +3902,126 @@ mod tests { let logctx = dev::test_setup_log("test_volume_replace_snapshot"); let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; - let (_opctx, db_datastore) = datastore_test(&logctx, &db).await; + let (opctx, db_datastore) = datastore_test(&logctx, &db).await; + let conn = db_datastore.pool_connection_for_tests().await.unwrap(); - // Insert two volumes: one with the target to replace, and one temporary - // "volume to delete" that's blank. 
+ let _test_datasets = TestDatasets::create( + &opctx, + db_datastore.clone(), + REGION_REDUNDANCY_THRESHOLD, + ) + .await; let volume_id = Uuid::new_v4(); let volume_to_delete_id = Uuid::new_v4(); + + let datasets_and_regions = db_datastore + .disk_region_allocate( + &opctx, + volume_id, + &DiskSource::Blank { block_size: 512.try_into().unwrap() }, + ByteCount::from_gibibytes_u32(1), + &&RegionAllocationStrategy::RandomWithDistinctSleds { + seed: None, + }, + ) + .await + .unwrap(); + + let mut region_addresses: Vec = + Vec::with_capacity(datasets_and_regions.len()); + + for (i, (_, region)) in datasets_and_regions.iter().enumerate() { + // `disk_region_allocate` won't put any ports in, so add fake ones + // here + use nexus_db_model::schema::region::dsl; + diesel::update(dsl::region) + .filter(dsl::id.eq(region.id())) + .set(dsl::port.eq(Some::((100 + i as u16).into()))) + .execute_async(&*conn) + .await + .unwrap(); + + let address: SocketAddrV6 = + db_datastore.region_addr(region.id()).await.unwrap().unwrap(); + + region_addresses.push(address.to_string()); + } + + // Manually create a replacement region at the first dataset + let replacement_region = { + let (dataset, region) = &datasets_and_regions[0]; + let region = Region::new( + dataset.id(), + volume_to_delete_id, + region.block_size().try_into().unwrap(), + region.blocks_per_extent(), + region.extent_count(), + 111, + true, // read-only + ); + + use nexus_db_model::schema::region::dsl; + diesel::insert_into(dsl::region) + .values(region.clone()) + .execute_async(&*conn) + .await + .unwrap(); + + region + }; + + let replacement_region_addr: SocketAddrV6 = db_datastore + .region_addr(replacement_region.id()) + .await + .unwrap() + .unwrap(); + + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + let region_snapshots = [ + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + ), + RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + ), + ]; + + db_datastore + .region_snapshot_create(region_snapshots[0].clone()) + .await + .unwrap(); + db_datastore + .region_snapshot_create(region_snapshots[1].clone()) + .await + .unwrap(); + db_datastore + .region_snapshot_create(region_snapshots[2].clone()) + .await + .unwrap(); + + // Insert two volumes: one with the target to replace, and one temporary + // "volume to delete" that's blank. Validate the pre-replacement volume + // resource usage records. 
+ let rop_id = Uuid::new_v4(); db_datastore @@ -3132,9 +4038,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3156,9 +4062,9 @@ mod tests { id: rop_id, target: vec![ // target to replace - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3177,6 +4083,22 @@ mod tests { .await .unwrap(); + for region_snapshot in ®ion_snapshots { + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + db_datastore .volume_create(nexus_db_model::Volume::new( volume_to_delete_id, @@ -3191,21 +4113,33 @@ mod tests { .await .unwrap(); + // `volume_create` above was called with a blank volume, so no usage + // record will have been created for the read-only region + + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + // Do the replacement let volume_replace_snapshot_result = db_datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd00:1122:3344:104::1]:400".parse().unwrap()), - ReplacementTarget( - "[fd55:1122:3344:101::1]:111".parse().unwrap(), - ), + ExistingTarget(address_1.parse().unwrap()), + ReplacementTarget(replacement_region_addr), VolumeToDelete(volume_to_delete_id), ) .await .unwrap(); - assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done,); + assert_eq!(volume_replace_snapshot_result, VolumeReplaceResult::Done); // Ensure the shape of the resulting VCRs @@ -3227,9 +4161,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3251,9 +4185,9 @@ mod tests { id: rop_id, target: vec![ // target replaced - String::from("[fd55:1122:3344:101::1]:111"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + replacement_region_addr.to_string(), + address_2.clone(), + address_3.clone(), ], lossy: false, flush_timeout: None, @@ -3293,7 +4227,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replaced target stashed here - String::from("[fd00:1122:3344:104::1]:400"), + address_1.clone(), ], lossy: false, flush_timeout: None, @@ -3309,15 +4243,54 @@ mod tests { }, ); + // Validate the post-replacement volume resource usage records + + for (i, region_snapshot) in region_snapshots.iter().enumerate() { + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: 
region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + + match i { + 0 => { + assert_eq!(usage[0].volume_id, volume_to_delete_id); + } + + 1 | 2 => { + assert_eq!(usage[0].volume_id, volume_id); + } + + _ => panic!("out of range"), + } + } + + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + // Now undo the replacement. Note volume ID is not swapped. let volume_replace_snapshot_result = db_datastore .volume_replace_snapshot( VolumeWithTarget(volume_id), - ExistingTarget("[fd55:1122:3344:101::1]:111".parse().unwrap()), - ReplacementTarget( - "[fd00:1122:3344:104::1]:400".parse().unwrap(), - ), + ExistingTarget(replacement_region_addr), + ReplacementTarget(address_1.parse().unwrap()), VolumeToDelete(volume_to_delete_id), ) .await @@ -3344,9 +4317,9 @@ mod tests { opts: CrucibleOpts { id: volume_id, target: vec![ - String::from("[fd00:1122:3344:101::1]:11111"), - String::from("[fd00:1122:3344:102::1]:22222"), - String::from("[fd00:1122:3344:103::1]:33333"), + region_addresses[0].clone(), + region_addresses[1].clone(), + region_addresses[2].clone(), ], lossy: false, flush_timeout: None, @@ -3368,9 +4341,7 @@ mod tests { id: rop_id, target: vec![ // back to what it was - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1, address_2, address_3, ], lossy: false, flush_timeout: None, @@ -3410,7 +4381,7 @@ mod tests { id: volume_to_delete_id, target: vec![ // replacement stashed here - String::from("[fd55:1122:3344:101::1]:111"), + replacement_region_addr.to_string(), ], lossy: false, flush_timeout: None, @@ -3426,6 +4397,36 @@ mod tests { }, ); + // Validate the post-post-replacement volume resource usage records + + for region_snapshot in ®ion_snapshots { + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); + } + + let usage = db_datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: replacement_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); + db.cleanup().await.unwrap(); logctx.cleanup_successful(); } @@ -3440,6 +4441,41 @@ mod tests { let volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + let address_1 = String::from("[fd00:1122:3344:104::1]:400"); + let address_2 = String::from("[fd00:1122:3344:105::1]:401"); + let address_3 = String::from("[fd00:1122:3344:106::1]:402"); + + db_datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_1.clone(), + )) + .await + .unwrap(); + db_datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_2.clone(), + )) + .await + .unwrap(); + db_datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + address_3.clone(), + )) + .await + .unwrap(); + // case 
where the needle is found db_datastore @@ -3458,9 +4494,9 @@ mod tests { opts: CrucibleOpts { id: Uuid::new_v4(), target: vec![ - String::from("[fd00:1122:3344:104::1]:400"), - String::from("[fd00:1122:3344:105::1]:401"), - String::from("[fd00:1122:3344:106::1]:402"), + address_1.clone(), + address_2, + address_3, ], lossy: false, flush_timeout: None, @@ -3482,7 +4518,7 @@ mod tests { let volumes = db_datastore .find_volumes_referencing_socket_addr( &opctx, - "[fd00:1122:3344:104::1]:400".parse().unwrap(), + address_1.parse().unwrap(), ) .await .unwrap(); diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index 02800e3a3c..5f34c7cfb3 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -14,7 +14,6 @@ pub mod network_interface; pub mod oximeter; pub mod region_allocation; pub mod virtual_provisioning_collection_update; -pub mod volume; pub mod vpc; pub mod vpc_subnet; diff --git a/nexus/db-queries/src/db/queries/volume.rs b/nexus/db-queries/src/db/queries/volume.rs deleted file mode 100644 index e7fe832a82..0000000000 --- a/nexus/db-queries/src/db/queries/volume.rs +++ /dev/null @@ -1,114 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Helper queries for working with volumes. - -use crate::db; -use crate::db::pool::DbConnection; -use diesel::expression::is_aggregate; -use diesel::expression::ValidGrouping; -use diesel::pg::Pg; -use diesel::query_builder::AstPass; -use diesel::query_builder::Query; -use diesel::query_builder::QueryFragment; -use diesel::query_builder::QueryId; -use diesel::sql_types; -use diesel::Column; -use diesel::Expression; -use diesel::QueryResult; -use diesel::RunQueryDsl; - -/// Produces a query fragment that will conditionally reduce the volume -/// references for region_snapshot rows whose snapshot_addr column is part of a -/// list. 
-/// -/// The output should look like: -/// -/// ```sql -/// update region_snapshot set -/// volume_references = volume_references - 1, -/// deleting = case when volume_references = 1 -/// then true -/// else false -/// end -/// where -/// snapshot_addr in ('a1', 'a2', 'a3') and -/// volume_references >= 1 and -/// deleting = false -/// returning * -/// ``` -#[must_use = "Queries must be executed"] -pub struct ConditionallyDecreaseReferences { - snapshot_addrs: Vec, -} - -impl ConditionallyDecreaseReferences { - pub fn new(snapshot_addrs: Vec) -> Self { - Self { snapshot_addrs } - } -} - -impl QueryId for ConditionallyDecreaseReferences { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} - -impl QueryFragment for ConditionallyDecreaseReferences { - fn walk_ast<'a>(&'a self, mut out: AstPass<'_, 'a, Pg>) -> QueryResult<()> { - use db::schema::region_snapshot::dsl; - - out.push_sql("UPDATE "); - dsl::region_snapshot.walk_ast(out.reborrow())?; - out.push_sql(" SET "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" - 1, "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = CASE WHEN "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" = 1 THEN TRUE ELSE FALSE END WHERE "); - out.push_identifier(dsl::snapshot_addr::NAME)?; - out.push_sql(" IN ("); - - // If self.snapshot_addrs is empty, this query fragment will - // intentionally not update any region_snapshot rows. - for (i, snapshot_addr) in self.snapshot_addrs.iter().enumerate() { - out.push_bind_param::(snapshot_addr)?; - if i == self.snapshot_addrs.len() - 1 { - out.push_sql(" "); - } else { - out.push_sql(", "); - } - } - - out.push_sql(") AND "); - out.push_identifier(dsl::volume_references::NAME)?; - out.push_sql(" >= 1 AND "); - out.push_identifier(dsl::deleting::NAME)?; - out.push_sql(" = false RETURNING *"); - - Ok(()) - } -} - -impl Expression for ConditionallyDecreaseReferences { - type SqlType = sql_types::Array; -} - -impl ValidGrouping - for ConditionallyDecreaseReferences -{ - type IsAggregate = is_aggregate::Never; -} - -impl RunQueryDsl for ConditionallyDecreaseReferences {} - -type SelectableSql = < - >::SelectExpression as diesel::Expression ->::SqlType; - -impl Query for ConditionallyDecreaseReferences { - type SqlType = SelectableSql; -} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 69221779ee..8ae3be84af 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -778,8 +778,7 @@ impl BackgroundTasksInitializer { "detect if region snapshots need replacement and begin the \ process", period: config.region_snapshot_replacement_start.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new(RegionSnapshotReplacementDetector::disabled( + task_impl: Box::new(RegionSnapshotReplacementDetector::new( datastore.clone(), sagas.clone(), )), @@ -795,13 +794,10 @@ impl BackgroundTasksInitializer { period: config .region_snapshot_replacement_garbage_collection .period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementGarbageCollect::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementGarbageCollect::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: 
task_region_snapshot_replacement_garbage_collection, @@ -813,13 +809,10 @@ impl BackgroundTasksInitializer { "detect what volumes were affected by a region snapshot \ replacement, and run the step saga for them", period: config.region_snapshot_replacement_step.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFindAffected::disabled( - datastore.clone(), - sagas.clone(), - ), - ), + task_impl: Box::new(RegionSnapshotReplacementFindAffected::new( + datastore.clone(), + sagas.clone(), + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_step, @@ -831,10 +824,9 @@ impl BackgroundTasksInitializer { "complete a region snapshot replacement if all the steps are \ done", period: config.region_snapshot_replacement_finish.period_secs, - // XXX temporarily disabled, see oxidecomputer/omicron#6353 - task_impl: Box::new( - RegionSnapshotReplacementFinishDetector::disabled(datastore), - ), + task_impl: Box::new(RegionSnapshotReplacementFinishDetector::new( + datastore, + )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], activator: task_region_snapshot_replacement_finish, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs index 83078cb978..caa2fa7bed 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_finish.rs @@ -19,17 +19,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFinishDetector { datastore: Arc, - disabled: bool, } impl RegionSnapshotReplacementFinishDetector { - #[allow(dead_code)] pub fn new(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: false } - } - - pub fn disabled(datastore: Arc) -> Self { - RegionSnapshotReplacementFinishDetector { datastore, disabled: true } + RegionSnapshotReplacementFinishDetector { datastore } } async fn transition_requests_to_done( @@ -159,10 +153,6 @@ impl BackgroundTask for RegionSnapshotReplacementFinishDetector { async move { let mut status = RegionSnapshotReplacementFinishStatus::default(); - if self.disabled { - return json!(status); - } - self.transition_requests_to_done(opctx, &mut status).await; json!(status) diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs index eb171fda12..f3b1b68198 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_garbage_collect.rs @@ -22,28 +22,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementGarbageCollect { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementGarbageCollect { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementGarbageCollect { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementGarbageCollect { datastore, sagas } } async fn send_garbage_collect_request( @@ -152,10 +135,6 @@ impl BackgroundTask for RegionSnapshotReplacementGarbageCollect { let mut status = RegionSnapshotReplacementGarbageCollectStatus::default(); - if self.disabled { - return 
json!(status); - } - self.clean_up_region_snapshot_replacement_volumes( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs index 8fd1e55975..bc739ecf27 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_start.rs @@ -29,20 +29,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementDetector { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementDetector { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: false } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementDetector { datastore, sagas, disabled: true } + RegionSnapshotReplacementDetector { datastore, sagas } } async fn send_start_request( @@ -246,10 +237,6 @@ impl BackgroundTask for RegionSnapshotReplacementDetector { async { let mut status = RegionSnapshotReplacementStartStatus::default(); - if self.disabled { - return json!(status); - } - self.create_requests_for_region_snapshots_on_expunged_disks( opctx, &mut status, diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs index da05500a58..c402c8fc92 100644 --- a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -42,28 +42,11 @@ use std::sync::Arc; pub struct RegionSnapshotReplacementFindAffected { datastore: Arc, sagas: Arc, - disabled: bool, } impl RegionSnapshotReplacementFindAffected { - #[allow(dead_code)] pub fn new(datastore: Arc, sagas: Arc) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: false, - } - } - - pub fn disabled( - datastore: Arc, - sagas: Arc, - ) -> Self { - RegionSnapshotReplacementFindAffected { - datastore, - sagas, - disabled: true, - } + RegionSnapshotReplacementFindAffected { datastore, sagas } } async fn send_start_request( @@ -452,10 +435,6 @@ impl BackgroundTask for RegionSnapshotReplacementFindAffected { async move { let mut status = RegionSnapshotReplacementStepStatus::default(); - if self.disabled { - return json!(status); - } - // Importantly, clean old steps up before finding affected volumes! // Otherwise, will continue to find the snapshot in volumes to // delete, and will continue to see conflicts in next function. 
@@ -500,6 +479,19 @@ mod test { ) -> Uuid { let new_volume_id = Uuid::new_v4(); + // need to add region snapshot objects to satisfy volume create + // transaction's search for resources + + datastore + .region_snapshot_create(RegionSnapshot::new( + Uuid::new_v4(), + Uuid::new_v4(), + Uuid::new_v4(), + snapshot_addr.clone(), + )) + .await + .unwrap(); + let volume_construction_request = VolumeConstructionRequest::Volume { id: new_volume_id, block_size: 0, diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index d2e3053668..8261785ada 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -934,6 +934,22 @@ pub(crate) mod test { .is_none() } + async fn no_volume_resource_usage_records_exist( + datastore: &DataStore, + ) -> bool { + use nexus_db_queries::db::schema::volume_resource_usage::dsl; + + let rows = dsl::volume_resource_usage + .count() + .get_result_async::( + &*datastore.pool_connection_for_tests().await.unwrap(), + ) + .await + .unwrap(); + + rows == 0 + } + async fn no_virtual_provisioning_resource_records_exist( datastore: &DataStore, ) -> bool { @@ -1030,6 +1046,7 @@ pub(crate) mod test { .await; assert!(no_disk_records_exist(datastore).await); assert!(no_volume_records_exist(datastore).await); + assert!(no_volume_resource_usage_records_exist(datastore).await); assert!( no_virtual_provisioning_resource_records_exist(datastore).await ); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_start.rs b/nexus/src/app/sagas/region_snapshot_replacement_start.rs index 6d56732388..d615b55612 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_start.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_start.rs @@ -801,6 +801,7 @@ pub(crate) mod test { const SNAPSHOT_NAME: &str = "my-snap"; const PROJECT_NAME: &str = "springfield-squidport"; + /// Create four zpools, a disk, and a snapshot of that disk async fn prepare_for_test( cptestctx: &ControlPlaneTestContext, ) -> PrepareResult { diff --git a/nexus/tests/integration_tests/volume_management.rs b/nexus/tests/integration_tests/volume_management.rs index 8765218d33..5a03a6b95a 100644 --- a/nexus/tests/integration_tests/volume_management.rs +++ b/nexus/tests/integration_tests/volume_management.rs @@ -5,24 +5,46 @@ //! Tests that Nexus properly manages and cleans up Crucible resources //! 
associated with Volumes +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; +use diesel::ExpressionMethods; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; +use nexus_config::RegionAllocationStrategy; +use nexus_db_model::RegionSnapshotReplacement; +use nexus_db_model::VolumeResourceUsage; +use nexus_db_model::VolumeResourceUsageRecord; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::CrucibleResources; +use nexus_db_queries::db::datastore::ExistingTarget; +use nexus_db_queries::db::datastore::RegionAllocationFor; +use nexus_db_queries::db::datastore::RegionAllocationParameters; +use nexus_db_queries::db::datastore::ReplacementTarget; +use nexus_db_queries::db::datastore::VolumeReplaceResult; +use nexus_db_queries::db::datastore::VolumeToDelete; +use nexus_db_queries::db::datastore::VolumeWithTarget; +use nexus_db_queries::db::datastore::SQL_BATCH_SIZE; use nexus_db_queries::db::lookup::LookupPath; +use nexus_db_queries::db::pagination::paginated; +use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; +use nexus_test_utils::background::run_replacement_tasks_to_completion; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::http_testing::RequestBuilder; use nexus_test_utils::resource_helpers::create_default_ip_pool; +use nexus_test_utils::resource_helpers::create_disk; +use nexus_test_utils::resource_helpers::create_disk_from_snapshot; use nexus_test_utils::resource_helpers::create_project; +use nexus_test_utils::resource_helpers::create_snapshot; use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::params; use nexus_types::external_api::views; use nexus_types::identity::Asset; +use nexus_types::identity::Resource; use omicron_common::api::external::ByteCount; use omicron_common::api::external::Disk; use omicron_common::api::external::IdentityMetadataCreateParams; @@ -30,6 +52,7 @@ use omicron_common::api::external::Name; use omicron_common::api::internal; use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::DownstairsRegionKind; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::UpstairsKind; use omicron_uuid_kinds::UpstairsRepairKind; @@ -37,6 +60,8 @@ use omicron_uuid_kinds::UpstairsSessionKind; use rand::prelude::SliceRandom; use rand::{rngs::StdRng, SeedableRng}; use sled_agent_client::types::{CrucibleOpts, VolumeConstructionRequest}; +use std::collections::HashSet; +use std::net::SocketAddrV6; use std::sync::Arc; use uuid::Uuid; @@ -2286,36 +2311,47 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate the volume's region_snapshots got incremented by - // volume_create + // Also validate the volume's region_snapshots had volume resource usage + // records created by volume_create for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + 
assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } - // Soft delete the volume, and validate that only three region_snapshot - // records are returned. + // Soft delete the volume, and validate that the volume's region_snapshots + // had their volume resource usage records deleted let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -2396,30 +2432,42 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { .unwrap(); assert_eq!(crucible_targets.read_only_targets.len(), 3); - // Also validate only the volume's region_snapshots got incremented by - // volume_create. + // Also validate only the new volume's region_snapshots had usage records + // created. for i in 0..3 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } + for i in 3..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 1); - assert_eq!(region_snapshot.deleting, false); + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_id); } // Soft delete the volume, and validate that only three region_snapshot @@ -2427,18 +2475,23 @@ async fn test_keep_your_targets_straight(cptestctx: &ControlPlaneTestContext) { let cr = datastore.soft_delete_volume(volume_id).await.unwrap(); - // Make sure every region_snapshot is now 0, and deleting + // Make sure every region_snapshot has no usage records now for i in 0..6 { let (dataset_id, region_id, snapshot_id, _) = region_snapshots[i]; - let region_snapshot = datastore - .region_snapshot_get(dataset_id, region_id, snapshot_id) + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id, + region_id, + snapshot_id, + }, + ) .await - .unwrap() .unwrap(); - assert_eq!(region_snapshot.volume_references, 0); - assert_eq!(region_snapshot.deleting, true); + assert!(usage.is_empty()); } let datasets_and_regions = datastore.regions_to_delete(&cr).await.unwrap(); @@ -3503,3 +3556,1977 @@ async fn test_cte_returns_regions(cptestctx: &ControlPlaneTestContext) { .collect::>(), ); } + +struct TestReadOnlyRegionReferenceUsage { + datastore: Arc, + + region: db::model::Region, + region_address: SocketAddrV6, + + first_volume_id: Uuid, + second_volume_id: Uuid, + + 
last_resources_to_delete: Option, +} + +impl TestReadOnlyRegionReferenceUsage { + pub async fn new(cptestctx: &ControlPlaneTestContext) -> Self { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let first_volume_id = Uuid::new_v4(); + let second_volume_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + + let datasets_and_regions = datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: first_volume_id, + snapshot_id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + 1, + ) + .await + .unwrap(); + + assert_eq!(datasets_and_regions.len(), 1); + + let (_, region) = &datasets_and_regions[0]; + + assert_eq!(region.volume_id(), first_volume_id); + assert!(region.read_only()); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. + datastore.region_set_port(region.id(), 12345).await.unwrap(); + + let region_address = + datastore.region_addr(region.id()).await.unwrap().unwrap(); + + let region = datastore.get_region(region.id()).await.unwrap(); + + TestReadOnlyRegionReferenceUsage { + datastore: datastore.clone(), + + region, + region_address, + + first_volume_id, + second_volume_id, + + last_resources_to_delete: None, + } + } + + pub async fn create_first_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_only_first_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.first_volume_id); + } + + pub async fn validate_only_second_volume_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, self.second_volume_id); + } + + pub async fn delete_first_volume(&mut self) { + let resources_to_delete = self + .datastore + .soft_delete_volume(self.first_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn delete_second_volume(&mut self) { + let 
resources_to_delete = self + .datastore + .soft_delete_volume(self.second_volume_id) + .await + .unwrap(); + + self.last_resources_to_delete = Some(resources_to_delete); + } + + pub async fn validate_no_usage_records(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + } + + pub async fn validate_region_returned_for_cleanup(&self) { + assert!(self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + pub async fn validate_region_not_returned_for_cleanup(&self) { + assert!(!self + .datastore + .regions_to_delete(&self.last_resources_to_delete.as_ref().unwrap()) + .await + .unwrap() + .into_iter() + .any(|(_, r)| r.id() == self.region.id())); + } + + pub async fn region_returned_by_find_deleted_volume_regions(&self) { + let deleted_volume_regions = + self.datastore.find_deleted_volume_regions().await.unwrap(); + + assert!(deleted_volume_regions + .into_iter() + .any(|(_, r, _, _)| r.id() == self.region.id())); + } + + pub async fn region_not_returned_by_find_deleted_volume_regions(&self) { + let deleted_volume_regions = + self.datastore.find_deleted_volume_regions().await.unwrap(); + + assert!(!deleted_volume_regions + .into_iter() + .any(|(_, r, _, _)| r.id() == self.region.id())); + } + + pub async fn create_first_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn create_second_volume_region_in_rop(&self) { + self.datastore + .volume_create(nexus_db_model::Volume::new( + self.second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: self.second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: self.region.blocks_per_extent(), + extent_count: self.region.extent_count() as u32, + gen: 1, + opts: CrucibleOpts { + id: 
Uuid::new_v4(), + target: vec![self.region_address.to_string()], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + } + + pub async fn validate_both_volumes_referenced(&self) { + let usage = self + .datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: self.region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == self.first_volume_id)); + assert!(usage.iter().any(|r| r.volume_id == self.second_volume_id)); + } +} + +/// Assert that creating a volume with a read-only region in a subvolume creates +/// an appropriate usage record, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_usage_sanity( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume().await; + + // Validate only one volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // Now, soft-delete the volume, and make sure that the associated volume + // resource usage record is gone too. + + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should be returned by find_deleted_volume_regions. + + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating a volume with a read-only region in the ROP creates an +/// appropriate reference, and that deleting that volume removes it +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create one volume referencing the harness' read-only region + + harness.create_first_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage record was created + + harness.validate_only_first_volume_referenced().await; + + // It should be _not_ returned by find_deleted_volume_regions. + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Now, soft-delete the volume, and make sure that read-only volume + // reference is gone too. + + harness.delete_first_volume().await; + + harness.validate_no_usage_records().await; + + // If the read-only volume references for a read-only region are gone, then + // it should be returned for cleanup. + + harness.validate_region_returned_for_cleanup().await; + + // It should be returned by find_deleted_volume_regions. 
+ + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region creates the +/// appropriate references, and that deleting only one of those volumes does not +/// mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume().await; + harness.create_second_volume().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the first volume, and make sure that only one read-only + // volume reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. + + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that creating multiple volumes with a read-only region in the ROP +/// creates the appropriate references, and that deleting only one of those +/// volumes does not mean the read-only region gets cleaned up +#[nexus_test] +async fn test_read_only_region_reference_sanity_rop_multi( + cptestctx: &ControlPlaneTestContext, +) { + let mut harness = TestReadOnlyRegionReferenceUsage::new(cptestctx).await; + + // Create two volumes this time + + harness.create_first_volume_region_in_rop().await; + harness.create_second_volume_region_in_rop().await; + + // Validate that the appropriate volume resource usage records were created + + harness.validate_both_volumes_referenced().await; + + // Now, soft-delete the volume, and make sure that only one read-only volume + // reference is gone. + + harness.delete_first_volume().await; + + harness.validate_only_second_volume_referenced().await; + + // If any read-only volume reference remains, then the region should not be + // returned for deletion, and it still should not be returned by + // `find_deleted_volume_regions`. 
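Editor's aside: the cleanup rule these harness methods exercise is "a read-only region becomes eligible for deletion only once its usage records are gone". A minimal sketch of that condition, using only calls that appear in this test module (it is not part of the patch), is shown below; the original test code continues right after it.

```rust
// Editor's sketch, not code from this patch: the condition that
// validate_region_returned_for_cleanup and
// region_returned_by_find_deleted_volume_regions are built around.
async fn read_only_region_has_no_users(
    datastore: &DataStore,
    region_id: Uuid,
) -> bool {
    datastore
        .volume_usage_records_for_resource(
            VolumeResourceUsage::ReadOnlyRegion { region_id },
        )
        .await
        .unwrap()
        .is_empty()
}
```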
+ + harness.validate_region_not_returned_for_cleanup().await; + + harness.region_not_returned_by_find_deleted_volume_regions().await; + + // Deleting the second volume should free up the read-only region for + // deletion + + harness.delete_second_volume().await; + + harness.validate_no_usage_records().await; + + harness.validate_region_returned_for_cleanup().await; + + harness.region_returned_by_find_deleted_volume_regions().await; +} + +/// Assert that a read-only region is properly reference counted and not +/// prematurely deleted: +/// +/// 1) create a disk, then a snapshot of that disk, then a disk from that +/// snapshot +/// +/// 2) issue a region snapshot replacement request for one of the region +/// snapshots in that snapshot, then run that process to completion +/// +/// 3) delete the snapshot +/// +/// 4) expect that the reference to the read-only region in the disk created +/// from the snapshot means that read-only region will not be cleaned up by +/// Nexus +/// +/// 5) clean up all the objects, and expect the crucible resources are properly +/// cleaned up +#[nexus_test] +async fn test_read_only_region_reference_counting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // The snapshot's allocated regions should have the one read-only region + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + let db_read_only_dataset = + datastore.dataset_get(read_only_region.dataset_id()).await.unwrap(); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Deleting the snapshot should _not_ cause the region to get deleted from + // CRDB + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + let post_delete_region = + datastore.get_region(read_only_region.id()).await.unwrap(); + assert_eq!(post_delete_region, *read_only_region); + + // or cause Nexus to send delete commands to the appropriate Crucible + // agent. 
+ + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Created + ); + + // Expect that there is one read-only region reference now, and that's from + // disk-from-snap + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk, and expect that does not alter the volume usage records + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_disk_from_snapshot.volume_id); + + // Delete the disk from snapshot, verify everything is cleaned up + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + assert_eq!( + cptestctx + .sled_agent + .sled_agent + .get_crucible_dataset( + TypedUuid::from_untyped_uuid(db_read_only_dataset.pool_id), + db_read_only_dataset.id(), + ) + .await + .get(crucible_agent_client::types::RegionId( + read_only_region.id().to_string() + )) + .await + .unwrap() + .state, + crucible_agent_client::types::State::Destroyed + ); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +/// Assert that a snapshot of a volume with a read-only region is properly +/// reference counted. +#[nexus_test] +async fn test_read_only_region_reference_counting_layers( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Four zpools are required for region replacement or region snapshot + // replacement + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Perform region snapshot replacement for one of the snapshot's regions, + // causing a read-only region to be created. 
+ + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 3); + + let (dataset, region) = &allocated_regions[0]; + + let request = RegionSnapshotReplacement::new( + dataset.id(), + region.id(), + snapshot.identity.id, + ); + + datastore + .insert_region_snapshot_replacement_request(&opctx, request) + .await + .unwrap(); + + run_replacement_tasks_to_completion(&internal_client).await; + + // Grab the read-only region in the snapshot volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + let (_, read_only_region) = &allocated_regions[0]; + assert!(read_only_region.read_only()); + + // The disk-from-snap VCR should also reference that read-only region + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let read_only_region_address: SocketAddrV6 = + nexus.region_addr(&opctx.log, read_only_region.id()).await.unwrap(); + + assert!(datastore + .find_volumes_referencing_socket_addr( + &opctx, + read_only_region_address.into() + ) + .await + .unwrap() + .iter() + .any(|volume| volume.id() == db_disk_from_snapshot.volume_id)); + + // Expect that there are two read-only region references now: one in the + // snapshot volume, and one in the disk-from-snap volume. 
+ + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + + // Take a snapshot of the disk-from-snapshot disk + + let double_snapshot = create_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + "double-snapshot", + ) + .await; + + // Assert correct volume usage records + + let (.., db_double_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(double_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "double_snapshot {:?} should exist", + double_snapshot.identity.id + ) + }); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + // Delete resources, assert volume resource usage records along the way + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert!(usage.iter().any(|r| r.volume_id == db_double_snapshot.volume_id)); + + NexusRequest::object_delete(client, &get_snapshot_url("double-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + assert!(!disk_test.crucible_resources_deleted().await); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { + region_id: read_only_region.id(), + }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + NexusRequest::object_delete(client, &get_disk_url("disk")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk"); + + // Assert everything was cleaned up + assert!(disk_test.crucible_resources_deleted().await); +} + +#[nexus_test] +async fn test_volume_replace_snapshot_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + 
DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + // There won't be any regions for the snapshot volume, only region snapshots + + assert!(allocated_regions.is_empty()); + + // Get another region to use with volume_replace_snapshot + + datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: db_snapshot.volume_id, + snapshot_id: db_snapshot.id(), + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), + }, + &RegionAllocationStrategy::Random { seed: None }, + allocated_regions.len() + 1, + ) + .await + .unwrap(); + + // Get the newly allocated region + + let mut new_allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(new_allocated_regions.len(), 1); + + let (_, new_region) = + new_allocated_regions.pop().expect("we just checked the length!"); + + // We're not sending the allocation request to any simulated crucible agent, + // so fill in a random port here. 
+ datastore.region_set_port(new_region.id(), 12345).await.unwrap(); + + // Create a blank region to use as the "volume to delete" + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + // Assert the correct volume resource usage record before the replacement: + // nothing should reference the newly allocated region yet + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert!(usage.is_empty()); + + // Perform replacement of the first region in the snapshot + + let existing = datastore + .region_snapshot_get( + disk_allocated_regions[0].1.dataset_id(), + disk_allocated_regions[0].1.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let replacement = datastore + .region_addr(new_region.id()) + .await + .expect("new region has address!") + .unwrap(); + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(existing.snapshot_addr.parse().unwrap()), + ReplacementTarget(replacement), + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! + } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the volume resource usage record after volume_replace_snapshot: + // the new region should have a usage for the snapshot's volume + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, db_snapshot.volume_id); + + // Now, reverse the replacement + + let replacement_result = datastore + .volume_replace_snapshot( + VolumeWithTarget(db_snapshot.volume_id), + ExistingTarget(replacement), // swapped! + ReplacementTarget(existing.snapshot_addr.parse().unwrap()), // swapped! + VolumeToDelete(volume_to_delete_id), + ) + .await + .unwrap(); + + match replacement_result { + VolumeReplaceResult::Done => { + // ok! 
+ } + + _ => { + panic!("replacement result was {replacement_result:?}"); + } + } + + // Assert the new region's volume resource usage record now references the + // volume to delete + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::ReadOnlyRegion { region_id: new_region.id() }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 1); + assert_eq!(usage[0].volume_id, volume_to_delete_id); +} + +/// Test that the `volume_remove_rop` function correctly updates volume resource +/// usage records +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: + // both the snapshot volume and disk_from_snapshot volume should have usage + // records for the three region snapshots. 
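Editor's aside: when more than one volume references a resource, the tests assert membership with a pair of `iter().any(...)` checks, as in the loop that follows. An equivalent, order-independent formulation as a set comparison is sketched here (again an editor's sketch, not part of the patch; `HashSet` and `Uuid` are already imported by this test file).

```rust
// Editor's sketch, not code from this patch: state "exactly these volumes
// reference this resource" as a set equality.
async fn assert_usage_volumes(
    datastore: &DataStore,
    usage: VolumeResourceUsage,
    expected: &[Uuid],
) {
    let actual: HashSet<Uuid> = datastore
        .volume_usage_records_for_resource(usage)
        .await
        .unwrap()
        .into_iter()
        .map(|record| record.volume_id)
        .collect();

    let expected: HashSet<Uuid> = expected.iter().copied().collect();

    assert_eq!(actual, expected);
}
```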
+ + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: + // the snapshot volume should still have usage records for the three region + // snapshots, and now so should the volume to delete + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 2); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id)); + } +} + +/// Test that the `volume_remove_rop` function only updates volume resource +/// usage records for the volume being operated on +#[nexus_test] +async fn test_volume_remove_rop_respects_accounting_no_modify_others( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot of that disk, then a disk from that + // snapshot. 
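+    // (this test also creates a second disk from the same snapshot to check
+    // that its usage records are left untouched)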
+ + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk.identity.id) + .fetch() + .await + .unwrap_or_else(|_| panic!("disk {:?} should exist", disk.identity.id)); + + let disk_allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + + assert_eq!(disk_allocated_regions.len(), 3); + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_disk_from_snapshot) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "disk_from_snapshot {:?} should exist", + disk_from_snapshot.identity.id + ) + }); + + let another_disk_from_snapshot = create_disk_from_snapshot( + &client, + PROJECT_NAME, + "another-disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + let (.., db_another_disk_from_snapshot) = + LookupPath::new(&opctx, &datastore) + .disk_id(another_disk_from_snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!( + "another_disk_from_snapshot {:?} should exist", + another_disk_from_snapshot.identity.id + ) + }); + + // Assert the correct volume resource usage records before the removal: the + // snapshot volume, disk_from_snapshot volume, and + // another_disk_from_snapshot volume should have usage records for the three + // region snapshots. + + for (_, disk_allocated_region) in &disk_allocated_regions { + let region_snapshot = datastore + .region_snapshot_get( + disk_allocated_region.dataset_id(), + disk_allocated_region.id(), + db_snapshot.id(), + ) + .await + .expect("region snapshot exists!") + .unwrap(); + + let usage = datastore + .volume_usage_records_for_resource( + VolumeResourceUsage::RegionSnapshot { + dataset_id: region_snapshot.dataset_id, + region_id: region_snapshot.region_id, + snapshot_id: region_snapshot.snapshot_id, + }, + ) + .await + .unwrap(); + + assert_eq!(usage.len(), 3); + assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_disk_from_snapshot.volume_id)); + assert!(usage + .iter() + .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id)); + } + + // Remove the ROP from disk-from-snapshot + + let volume_to_delete_id = Uuid::new_v4(); + + datastore + .volume_create(nexus_db_model::Volume::new( + volume_to_delete_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: volume_to_delete_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: None, + }) + .unwrap(), + )) + .await + .unwrap(); + + let result = datastore + .volume_remove_rop(db_disk_from_snapshot.volume_id, volume_to_delete_id) + .await + .unwrap(); + + // Assert that there was a removal + + assert!(result); + + // Assert the correct volume resource usage records after the removal: the + // snapshot volume and another_disk_from_snapshot volume should still have + // usage records for the three region snapshots, and now so should the + // volume to delete. 
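+    // (another_disk_from_snapshot was not operated on, so its usage records
+    // must be unchanged by the volume_remove_rop call above)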
+
+    for (_, disk_allocated_region) in &disk_allocated_regions {
+        let region_snapshot = datastore
+            .region_snapshot_get(
+                disk_allocated_region.dataset_id(),
+                disk_allocated_region.id(),
+                db_snapshot.id(),
+            )
+            .await
+            .expect("region snapshot exists!")
+            .unwrap();
+
+        let usage = datastore
+            .volume_usage_records_for_resource(
+                VolumeResourceUsage::RegionSnapshot {
+                    dataset_id: region_snapshot.dataset_id,
+                    region_id: region_snapshot.region_id,
+                    snapshot_id: region_snapshot.snapshot_id,
+                },
+            )
+            .await
+            .unwrap();
+
+        assert_eq!(usage.len(), 3);
+        assert!(usage.iter().any(|r| r.volume_id == db_snapshot.volume_id));
+        assert!(usage.iter().any(|r| r.volume_id == volume_to_delete_id));
+        assert!(usage
+            .iter()
+            .any(|r| r.volume_id == db_another_disk_from_snapshot.volume_id));
+    }
+}
+
+async fn delete_all_volume_resource_usage_records(datastore: &DataStore) {
+    use db::schema::volume_resource_usage::dsl;
+
+    let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+    diesel::delete(dsl::volume_resource_usage)
+        .filter(dsl::usage_id.ne(Uuid::new_v4()))
+        .execute_async(&*conn)
+        .await
+        .unwrap();
+}
+
+async fn perform_migration(datastore: &DataStore) {
+    const MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL: &str = include_str!(
+        "../../../schema/crdb/crucible-ref-count-records/up08.sql"
+    );
+
+    assert!(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL
+        .contains("INSERT INTO volume_resource_usage"));
+
+    let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+    // To make sure that the migration is idempotent, perform it twice
+    diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL)
+        .execute_async(&*conn)
+        .await
+        .unwrap();
+
+    diesel::sql_query(MIGRATION_TO_REF_COUNT_WITH_RECORDS_SQL)
+        .execute_async(&*conn)
+        .await
+        .unwrap();
+}
+
+async fn get_volume_resource_usage_records(
+    datastore: &DataStore,
+) -> HashSet<VolumeResourceUsageRecord> {
+    use db::schema::volume_resource_usage::dsl;
+
+    let mut records: Vec<VolumeResourceUsageRecord> = Vec::new();
+    let mut paginator = Paginator::new(SQL_BATCH_SIZE);
+    let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+    while let Some(p) = paginator.next() {
+        let page = paginated(
+            dsl::volume_resource_usage,
+            dsl::usage_id,
+            &p.current_pagparams(),
+        )
+        .get_results_async::<VolumeResourceUsageRecord>(&*conn)
+        .await
+        .unwrap();
+
+        paginator = p.found_batch(&page, &|r| r.usage_id);
+
+        for record in page {
+            records.push(record);
+        }
+    }
+
+    records
+        .into_iter()
+        .map(|mut record: VolumeResourceUsageRecord| {
+            // Zero out usage_id for comparison
+            record.usage_id = Uuid::nil();
+            record
+        })
+        .collect()
+}
+
+#[nexus_test]
+async fn test_migrate_to_ref_count_with_records(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let client = &cptestctx.external_client;
+    let apictx = &cptestctx.server.server_context();
+    let nexus = &apictx.nexus;
+    let datastore = nexus.datastore();
+
+    DiskTestBuilder::new(&cptestctx)
+        .on_specific_sled(cptestctx.first_sled())
+        .with_zpool_count(4)
+        .build()
+        .await;
+
+    create_project_and_pool(client).await;
+
+    // Create a disk
+
+    create_disk(&client, PROJECT_NAME, "disk").await;
+
+    // Test migration
+
+    let records_before = get_volume_resource_usage_records(&datastore).await;
+
+    delete_all_volume_resource_usage_records(&datastore).await;
+    perform_migration(&datastore).await;
+
+    let records_after = get_volume_resource_usage_records(&datastore).await;
+
+    assert_eq!(records_before, records_after);
+
+    // Create a snapshot
+
+    let snapshot =
+        create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await;
+
+    // Test migration
+
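+    // (capture usage records, clear the table, re-run up08.sql twice, and
+    // confirm the migration reconstructs identical records)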
+ let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Create a disk from that snapshot + + create_disk_from_snapshot( + &client, + PROJECT_NAME, + "disk-from-snapshot", + snapshot.identity.id, + ) + .await; + + // Test migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the snapshot + + NexusRequest::object_delete(client, &get_snapshot_url("snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); + + // Delete the disk from snapshot + + NexusRequest::object_delete(client, &get_disk_url("disk-from-snapshot")) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("failed to delete disk-from-snapshot"); + + // Test the migration + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_soft_delete_volume( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + create_project_and_pool(client).await; + + // Create a disk, then a snapshot from that disk, then an image based on + // that snapshot + + create_disk(&client, PROJECT_NAME, "disk").await; + + let snapshot = + create_snapshot(&client, PROJECT_NAME, "disk", "snapshot").await; + + let params = params::ImageCreate { + identity: IdentityMetadataCreateParams { + name: "windows99".parse().unwrap(), + description: String::from("as soon as we get CSM support!"), + }, + source: params::ImageSource::Snapshot { id: snapshot.identity.id }, + os: "windows98".to_string(), + version: "se".to_string(), + }; + + let images_url = format!("/v1/images?project={}", PROJECT_NAME); + NexusRequest::objects_post(client, &images_url, ¶ms) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap(); + + // Soft-delete the snapshot's volume + + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot.identity.id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("snapshot {:?} should exist", snapshot.identity.id) + }); + + let resources = + datastore.soft_delete_volume(db_snapshot.volume_id).await.unwrap(); + + // Assert that the 
region snapshots did not have deleted set to true + + assert!(datastore + .snapshots_to_delete(&resources) + .await + .unwrap() + .is_empty()); + + // This means that the snapshot volume is soft-deleted, make sure the + // migration does not make usage records for it! + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} + +#[nexus_test] +async fn test_migrate_to_ref_count_with_records_region_snapshot_deleting( + cptestctx: &ControlPlaneTestContext, +) { + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + let datastore = nexus.datastore(); + + let disk_test = DiskTestBuilder::new(&cptestctx) + .on_specific_sled(cptestctx.first_sled()) + .with_zpool_count(4) + .build() + .await; + + let mut iter = disk_test.zpools(); + let zpool0 = iter.next().expect("Expected four zpools"); + let zpool1 = iter.next().expect("Expected four zpools"); + let zpool2 = iter.next().expect("Expected four zpools"); + let zpool3 = iter.next().expect("Expected four zpools"); + + // (dataset_id, region_id, snapshot_id, snapshot_addr) + let region_snapshots = vec![ + ( + zpool0.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:101:7]:19016"), + ), + ( + zpool1.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:102:7]:19016"), + ), + ( + zpool2.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:103:7]:19016"), + ), + ( + zpool3.datasets[0].id, + Uuid::new_v4(), + Uuid::new_v4(), + String::from("[fd00:1122:3344:104:7]:19016"), + ), + ]; + + for i in 0..4 { + let (dataset_id, region_id, snapshot_id, snapshot_addr) = + ®ion_snapshots[i]; + + datastore + .region_snapshot_create(nexus_db_model::RegionSnapshot { + dataset_id: *dataset_id, + region_id: *region_id, + snapshot_id: *snapshot_id, + snapshot_addr: snapshot_addr.clone(), + volume_references: 0, + deleting: false, + }) + .await + .unwrap(); + } + + // Create two volumes, one with the first three region snapshots, one with + // the last three region snapshots + + let first_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + first_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: first_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[0].3.clone(), + region_snapshots[1].3.clone(), + region_snapshots[2].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + let second_volume_id = Uuid::new_v4(); + datastore + .volume_create(nexus_db_model::Volume::new( + second_volume_id, + serde_json::to_string(&VolumeConstructionRequest::Volume { + id: second_volume_id, + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 512, + blocks_per_extent: 1, + extent_count: 1, + gen: 1, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![ + region_snapshots[1].3.clone(), + 
region_snapshots[2].3.clone(), + region_snapshots[3].3.clone(), + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }) + .unwrap(), + )) + .await + .unwrap(); + + // Deleting the first volume should only return region_snapshot[0] for + // deletion. + + let resources = + datastore.soft_delete_volume(first_volume_id).await.unwrap(); + + let snapshots_to_delete = + datastore.snapshots_to_delete(&resources).await.unwrap(); + + assert_eq!(snapshots_to_delete.len(), 1); + + let region_snapshot_to_delete = &snapshots_to_delete[0].1; + + assert_eq!(region_snapshot_to_delete.dataset_id, region_snapshots[0].0); + assert_eq!(region_snapshot_to_delete.region_id, region_snapshots[0].1); + assert_eq!(region_snapshot_to_delete.snapshot_id, region_snapshots[0].2); + assert_eq!(region_snapshot_to_delete.snapshot_addr, region_snapshots[0].3); + assert_eq!(region_snapshot_to_delete.volume_references, 0); + assert_eq!(region_snapshot_to_delete.deleting, true); + + // Test the migration does not incorrectly think a region snapshot with + // deleting = true is used by any volume + + let records_before = get_volume_resource_usage_records(&datastore).await; + + delete_all_volume_resource_usage_records(&datastore).await; + perform_migration(&datastore).await; + + let records_after = get_volume_resource_usage_records(&datastore).await; + + assert_eq!(records_before, records_after); +} diff --git a/schema/crdb/crucible-ref-count-records/up01.sql b/schema/crdb/crucible-ref-count-records/up01.sql new file mode 100644 index 0000000000..b28e61135b --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up01.sql @@ -0,0 +1 @@ +CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip); diff --git a/schema/crdb/crucible-ref-count-records/up02.sql b/schema/crdb/crucible-ref-count-records/up02.sql new file mode 100644 index 0000000000..31fa35c3b8 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up02.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot ( + deleting +); diff --git a/schema/crdb/crucible-ref-count-records/up03.sql b/schema/crdb/crucible-ref-count-records/up03.sql new file mode 100644 index 0000000000..03940aabb0 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up03.sql @@ -0,0 +1,4 @@ +CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM ( + 'read_only_region', + 'region_snapshot' +); diff --git a/schema/crdb/crucible-ref-count-records/up04.sql b/schema/crdb/crucible-ref-count-records/up04.sql new file mode 100644 index 0000000000..7d3bba3119 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up04.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( + usage_id UUID NOT NULL, + + volume_id UUID NOT NULL, + + usage_type omicron.public.volume_resource_usage_type NOT NULL, + + region_id UUID, + + region_snapshot_dataset_id UUID, + region_snapshot_region_id UUID, + region_snapshot_snapshot_id UUID, + + PRIMARY KEY (usage_id) +); diff --git a/schema/crdb/crucible-ref-count-records/up05.sql b/schema/crdb/crucible-ref-count-records/up05.sql new file mode 100644 index 0000000000..a88a3fc6fd --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up05.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage ( + region_id +); diff --git 
a/schema/crdb/crucible-ref-count-records/up06.sql b/schema/crdb/crucible-ref-count-records/up06.sql new file mode 100644 index 0000000000..3262b194be --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up06.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage ( + region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id +); diff --git a/schema/crdb/crucible-ref-count-records/up07.sql b/schema/crdb/crucible-ref-count-records/up07.sql new file mode 100644 index 0000000000..91c788be1b --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up07.sql @@ -0,0 +1,8 @@ +CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage ( + volume_id, + usage_type, + region_id, + region_snapshot_dataset_id, + region_snapshot_region_id, + region_snapshot_snapshot_id +); diff --git a/schema/crdb/crucible-ref-count-records/up08.sql b/schema/crdb/crucible-ref-count-records/up08.sql new file mode 100644 index 0000000000..9bcfebab76 --- /dev/null +++ b/schema/crdb/crucible-ref-count-records/up08.sql @@ -0,0 +1,29 @@ +INSERT INTO volume_resource_usage ( + SELECT + ( + OVERLAY( + OVERLAY( + MD5(volume.id::TEXT || dataset_id::TEXT || region_id::TEXT || snapshot_id::TEXT || snapshot_addr || volume_references::TEXT) + PLACING '4' from 13 + ) + PLACING TO_HEX(volume_references) from 17 + )::uuid + ) as usage_id, + volume.id AS volume_id, + 'region_snapshot' AS usage_type, + NULL AS region_id, + region_snapshot.dataset_id AS region_snapshot_dataset_id, + region_snapshot.region_id AS region_snapshot_region_id, + region_snapshot.snapshot_id AS region_snapshot_snapshot_id + FROM + volume + JOIN + region_snapshot + ON + volume.data like '%' || region_snapshot.snapshot_addr || '%' + WHERE + volume.time_deleted is NULL AND + region_snapshot.deleting = false +) +ON CONFLICT (usage_id) DO NOTHING +; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index a6cd9b38fe..bb2580a444 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -575,6 +575,8 @@ CREATE INDEX IF NOT EXISTS lookup_dataset_by_zpool on omicron.public.dataset ( id ) WHERE pool_id IS NOT NULL AND time_deleted IS NULL; +CREATE INDEX IF NOT EXISTS lookup_dataset_by_ip on omicron.public.dataset (ip); + /* * A region of space allocated to Crucible Downstairs, within a dataset. 
*/ @@ -654,6 +656,10 @@ CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_region_id on omicron.public region_id ); +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_deleting on omicron.public.region_snapshot ( + deleting +); + /* * Index on volume_references and snapshot_addr for crucible * resource accounting lookup @@ -4409,6 +4415,43 @@ CREATE INDEX IF NOT EXISTS lookup_bgp_config_by_bgp_announce_set_id ON omicron.p ) WHERE time_deleted IS NULL; +CREATE TYPE IF NOT EXISTS omicron.public.volume_resource_usage_type AS ENUM ( + 'read_only_region', + 'region_snapshot' +); + +CREATE TABLE IF NOT EXISTS omicron.public.volume_resource_usage ( + usage_id UUID NOT NULL, + + volume_id UUID NOT NULL, + + usage_type omicron.public.volume_resource_usage_type NOT NULL, + + region_id UUID, + + region_snapshot_dataset_id UUID, + region_snapshot_region_id UUID, + region_snapshot_snapshot_id UUID, + + PRIMARY KEY (usage_id) +); + +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_region on omicron.public.volume_resource_usage ( + region_id +); + +CREATE INDEX IF NOT EXISTS lookup_volume_resource_usage_by_snapshot on omicron.public.volume_resource_usage ( + region_snapshot_dataset_id, region_snapshot_region_id, region_snapshot_snapshot_id +); + +CREATE UNIQUE INDEX IF NOT EXISTS one_record_per_volume_resource_usage on omicron.public.volume_resource_usage ( + volume_id, + usage_type, + region_id, + region_snapshot_dataset_id, + region_snapshot_region_id, + region_snapshot_snapshot_id +); /* * Keep this at the end of file so that the database does not contain a version @@ -4421,7 +4464,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '107.0.0', NULL) + (TRUE, NOW(), NOW(), '108.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 589ba87700..8369499658 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -124,7 +124,7 @@ impl CrucibleDataInner { key_pem: None, root_pem: None, source: None, - read_only: false, + read_only: params.source.is_some(), }; let old = self.regions.insert(id, region.clone());