diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 617413f172..d413f9507a 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -14,6 +14,7 @@ use crate::db::pagination::paginated; use crate::db::pagination::Paginator; use crate::db::DbConnection; use crate::db::TransactionError; +use crate::transaction_retry::OptionalError; use anyhow::Context; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; @@ -46,6 +47,7 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintPhysicalDisksConfig; use nexus_types::deployment::BlueprintTarget; +use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZonesConfig; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::external_api::views::SledState; @@ -60,6 +62,8 @@ use omicron_uuid_kinds::SledUuid; use std::collections::BTreeMap; use uuid::Uuid; +mod external_networking; + impl DataStore { /// List blueprints pub async fn blueprints_list( @@ -663,8 +667,12 @@ impl DataStore { .transaction_async(|conn| async move { // Ensure that blueprint we're about to delete is not the // current target. - let current_target = - self.blueprint_current_target_only(&conn).await?; + let current_target = self + .blueprint_current_target_only( + &conn, + SelectFlavor::Standard, + ) + .await?; if current_target.target_id == blueprint_id { return Err(TransactionError::CustomError( Error::conflict(format!( @@ -787,6 +795,147 @@ impl DataStore { Ok(()) } + /// Ensure all external networking IPs and service vNICs described by + /// `blueprint` are allocated (for in-service zones) or deallocated + /// (otherwise), conditional on `blueprint` being the current target + /// blueprint. + /// + /// This method may be safely executed from the blueprint executor RPW; the + /// condition on the current target blueprint ensures a Nexus attempting to + /// realize an out-of-date blueprint can't overwrite changes made by a Nexus + /// that realized the current target. + pub async fn blueprint_ensure_external_networking_resources( + &self, + opctx: &OpContext, + blueprint: &Blueprint, + ) -> Result<(), Error> { + self.blueprint_ensure_external_networking_resources_impl( + opctx, + blueprint, + #[cfg(test)] + None, + #[cfg(test)] + None, + ) + .await + } + + // The third and fourth arguments to this function only exist when run under + // test, and allow the calling test to control the general timing of the + // transaction executed by this method: + // + // 1. Check that `blueprint` is the current target blueprint + // 2. Set `target_check_done` to true (the test can wait on this) + // 3. Run remainder of transaction to allocate/deallocate resources + // 4. Wait until `return_on_completion` is set to true + // 5. Return + // + // If either of these arguments is `None`, the corresponding step (2 or 4) is skipped; see the sketch below.
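The two optional flags above implement a simple rendezvous between the method under test and the test itself. Here is a minimal, self-contained sketch of that gating pattern, assuming plain `tokio` and hypothetical names (`gated_work`, `check_done`, `release`) that are not part of the Omicron code:

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;

// Step numbering mirrors the comment above: raise one gate after the
// initial check, then park until the caller raises the second gate.
async fn gated_work(
    target_check_done: Option<Arc<AtomicBool>>,
    return_on_completion: Option<Arc<AtomicBool>>,
) {
    // 1. ... the guarded check would run here ...

    // 2. Tell the test we're past the check.
    if let Some(gate) = &target_check_done {
        gate.store(true, Ordering::SeqCst);
    }

    // 3. ... the remainder of the work would run here ...

    // 4. Wait until the test is ready for us to return.
    if let Some(gate) = &return_on_completion {
        while !gate.load(Ordering::SeqCst) {
            tokio::time::sleep(Duration::from_millis(50)).await;
        }
    }
}

#[tokio::main]
async fn main() {
    let check_done = Arc::new(AtomicBool::new(false));
    let release = Arc::new(AtomicBool::new(false));
    let task = tokio::spawn(gated_work(
        Some(check_done.clone()),
        Some(release.clone()),
    ));

    // A test would block here until the task is mid-transaction, assert
    // that concurrent operations are blocked, then release the task.
    while !check_done.load(Ordering::SeqCst) {
        tokio::time::sleep(Duration::from_millis(10)).await;
    }
    release.store(true, Ordering::SeqCst);
    task.await.unwrap();
}
```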
+ async fn blueprint_ensure_external_networking_resources_impl( + &self, + opctx: &OpContext, + blueprint: &Blueprint, + #[cfg(test)] target_check_done: Option< + std::sync::Arc<std::sync::atomic::AtomicBool>, + >, + #[cfg(test)] return_on_completion: Option< + std::sync::Arc<std::sync::atomic::AtomicBool>, + >, + ) -> Result<(), Error> { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper( + "blueprint_ensure_external_networking_resources", + ) + .transaction(&conn, |conn| { + let err = err.clone(); + #[cfg(test)] + let target_check_done = target_check_done.clone(); + #[cfg(test)] + let return_on_completion = return_on_completion.clone(); + + async move { + // Bail out if `blueprint` isn't the current target. + let current_target = self + .blueprint_current_target_only( + &conn, + SelectFlavor::ForUpdate, + ) + .await + .map_err(|e| err.bail(e))?; + if current_target.target_id != blueprint.id { + return Err(err.bail(Error::invalid_request(format!( + "blueprint {} is not the current target blueprint ({})", + blueprint.id, current_target.target_id + )))); + } + + // See the comment on this method; this lets us notify our test + // caller that we've performed our target blueprint check. + #[cfg(test)] + { + use std::sync::atomic::Ordering; + if let Some(gate) = target_check_done { + gate.store(true, Ordering::SeqCst); + } + } + + // Deallocate external networking resources for + // non-externally-reachable zones before allocating resources + // for reachable zones. This will allow allocation to succeed if + // we are swapping an external IP between two zones (e.g., + // moving a specific external IP from an old external DNS zone + // to a new one). + self.ensure_zone_external_networking_deallocated_on_connection( + &conn, + &opctx.log, + blueprint + .all_omicron_zones_not_in( + BlueprintZoneFilter::ShouldBeExternallyReachable, + ) + .map(|(_sled_id, zone)| zone), + ) + .await + .map_err(|e| err.bail(e))?; + self.ensure_zone_external_networking_allocated_on_connection( + &conn, + opctx, + blueprint + .all_omicron_zones( + BlueprintZoneFilter::ShouldBeExternallyReachable, + ) + .map(|(_sled_id, zone)| zone), + ) + .await + .map_err(|e| err.bail(e))?; + + // See the comment on this method; this lets us wait until our + // test caller is ready for us to return.
+ #[cfg(test)] + { + use std::sync::atomic::Ordering; + use std::time::Duration; + if let Some(gate) = return_on_completion { + while !gate.load(Ordering::SeqCst) { + tokio::time::sleep(Duration::from_millis(50)).await; + } + } + } + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + /// Set the current target blueprint /// /// In order to become the target blueprint, `target`'s parent blueprint @@ -930,7 +1079,9 @@ impl DataStore { opctx.authorize(authz::Action::Read, &authz::BLUEPRINT_CONFIG).await?; let conn = self.pool_connection_authorized(opctx).await?; - let target = self.blueprint_current_target_only(&conn).await?; + let target = self + .blueprint_current_target_only(&conn, SelectFlavor::Standard) + .await?; // The blueprint for the current target cannot be deleted while it is // the current target, but it's possible someone else (a) made a new @@ -951,7 +1102,7 @@ ) -> Result<BlueprintTarget, Error> { opctx.authorize(authz::Action::Read, &authz::BLUEPRINT_CONFIG).await?; let conn = self.pool_connection_authorized(opctx).await?; - self.blueprint_current_target_only(&conn).await + self.blueprint_current_target_only(&conn, SelectFlavor::Standard).await } // Helper to fetch the current blueprint target (without fetching the entire @@ -961,13 +1112,26 @@ async fn blueprint_current_target_only( &self, conn: &async_bb8_diesel::Connection<DbConnection>, + select_flavor: SelectFlavor, ) -> Result<BlueprintTarget, Error> { use db::schema::bp_target::dsl; - let current_target = dsl::bp_target - .order_by(dsl::version.desc()) - .first_async::<BpTarget>(conn) - .await + let query_result = match select_flavor { + SelectFlavor::ForUpdate => { + dsl::bp_target + .order_by(dsl::version.desc()) + .for_update() + .first_async::<BpTarget>(conn) + .await + } + SelectFlavor::Standard => { + dsl::bp_target + .order_by(dsl::version.desc()) + .first_async::<BpTarget>(conn) + .await + } + }; + let current_target = query_result + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -984,6 +1148,14 @@ } } +#[derive(Debug, Clone, Copy)] +enum SelectFlavor { + /// A normal `SELECT`. + Standard, + /// Acquire a database-level write lock via `SELECT ... FOR UPDATE`. + ForUpdate, +} + // Helper to create an `authz::Blueprint` for a specific blueprint ID fn authz_blueprint_from_id(blueprint_id: Uuid) -> authz::Blueprint { authz::Blueprint::new( @@ -1361,6 +1533,8 @@ mod tests { use omicron_common::address::Ipv6Subnet; use omicron_common::disk::DiskIdentity; use omicron_test_utils::dev; + use omicron_test_utils::dev::poll::wait_for_condition; + use omicron_test_utils::dev::poll::CondCheckError; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; @@ -1371,6 +1545,10 @@ mod tests { use slog::Logger; use std::mem; use std::net::Ipv6Addr; + use std::sync::atomic::AtomicBool; + use std::sync::atomic::Ordering; + use std::sync::Arc; + use std::time::Duration; static EMPTY_PLANNING_INPUT: Lazy<PlanningInput> = Lazy::new(|| PlanningInputBuilder::empty_input()); @@ -2061,6 +2239,199 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn test_ensure_external_networking_bails_on_bad_target() { + // Setup + let logctx = dev::test_setup_log( + "test_ensure_external_networking_bails_on_bad_target", + ); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + // Create an initial blueprint and a child.
+ let blueprint1 = BlueprintBuilder::build_empty_with_sleds( + std::iter::empty(), + "test1", + ); + let blueprint2 = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint1, + &EMPTY_PLANNING_INPUT, + "test2", + ) + .expect("failed to create builder") + .build(); + + // Insert both into the blueprint table. + datastore.blueprint_insert(&opctx, &blueprint1).await.unwrap(); + datastore.blueprint_insert(&opctx, &blueprint2).await.unwrap(); + + let bp1_target = BlueprintTarget { + target_id: blueprint1.id, + enabled: true, + time_made_target: now_db_precision(), + }; + let bp2_target = BlueprintTarget { + target_id: blueprint2.id, + enabled: true, + time_made_target: now_db_precision(), + }; + + // Set bp1_target as the current target. + datastore + .blueprint_target_set_current(&opctx, bp1_target) + .await + .unwrap(); + + // Attempting to ensure the (empty) resources for bp1 should succeed. + datastore + .blueprint_ensure_external_networking_resources(&opctx, &blueprint1) + .await + .expect("ensured networking resources for empty blueprint 1"); + + // Attempting to ensure the (empty) resources for bp2 should fail, + // because it isn't the target blueprint. + let err = datastore + .blueprint_ensure_external_networking_resources(&opctx, &blueprint2) + .await + .expect_err("failed because blueprint 2 isn't the target"); + assert!( + err.to_string().contains("is not the current target blueprint"), + "unexpected error: {err}" + ); + + // Create flags to control method execution. + let target_check_done = Arc::new(AtomicBool::new(false)); + let return_on_completion = Arc::new(AtomicBool::new(false)); + + // Spawn a task to execute our method. + let mut ensure_resources_task = tokio::spawn({ + let datastore = datastore.clone(); + let opctx = + OpContext::for_tests(logctx.log.clone(), datastore.clone()); + let target_check_done = target_check_done.clone(); + let return_on_completion = return_on_completion.clone(); + async move { + datastore + .blueprint_ensure_external_networking_resources_impl( + &opctx, + &blueprint1, + Some(target_check_done), + Some(return_on_completion), + ) + .await + } + }); + + // Wait until `task` has proceeded past the point at which it's checked + // the target blueprint. + wait_for_condition( + || async { + if target_check_done.load(Ordering::SeqCst) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(50), + &Duration::from_secs(10), + ) + .await + .expect("`target_check_done` not set to true"); + + // Spawn another task that tries to read the current target. This should + // block at the database level due to the `SELECT ... FOR UPDATE` inside + // `blueprint_ensure_external_networking_resources`. + let mut current_target_task = tokio::spawn({ + let datastore = datastore.clone(); + let opctx = + OpContext::for_tests(logctx.log.clone(), datastore.clone()); + async move { + datastore + .blueprint_target_get_current(&opctx) + .await + .expect("read current target") + } + }); + + // Spawn another task that tries to set the current target. This should + // block at the database level due to the `SELECT ... FOR UPDATE` inside + // `blueprint_ensure_external_networking_resources`. 
+ let mut update_target_task = tokio::spawn({ + let datastore = datastore.clone(); + let opctx = + OpContext::for_tests(logctx.log.clone(), datastore.clone()); + async move { + datastore.blueprint_target_set_current(&opctx, bp2_target).await + } + }); + + // None of our spawned tasks should be able to make progress: + // `ensure_resources_task` is waiting for us to set + // `return_on_completion` to true, and the other two should be + // queued by Cockroach, because + // `blueprint_ensure_external_networking_resources` should have + // performed a `SELECT ... FOR UPDATE` on the current target, forcing + // the query that wants to change it to wait until the transaction + // completes. + // + // We'll somewhat haphazardly test this by trying to wait for any + // task to finish, and succeeding on a timeout of a few seconds. This + // could spuriously succeed if we're executing on a very overloaded + // system where we hit the timeout even though one of the tasks is + // actually making progress, but hopefully will fail often enough if + // we've gotten this wrong. + tokio::select! { + result = &mut ensure_resources_task => { + panic!( + "unexpected completion of \ + `blueprint_ensure_external_networking_resources`: \ + {result:?}", + ); + } + result = &mut update_target_task => { + panic!( + "unexpected completion of \ + `blueprint_target_set_current`: {result:?}", + ); + } + result = &mut current_target_task => { + panic!( + "unexpected completion of \ + `blueprint_target_get_current`: {result:?}", + ); + } + _ = tokio::time::sleep(Duration::from_secs(5)) => (), + } + + // Release `ensure_resources_task` to finish. + return_on_completion.store(true, Ordering::SeqCst); + + tokio::time::timeout(Duration::from_secs(10), ensure_resources_task) + .await + .expect( + "time out waiting for \ + `blueprint_ensure_external_networking_resources`", + ) + .expect("panic in `blueprint_ensure_external_networking_resources`") + .expect("ensured networking resources for empty blueprint 1"); + + // Our other tasks should now also complete. + tokio::time::timeout(Duration::from_secs(10), update_target_task) + .await + .expect("time out waiting for `blueprint_target_set_current`") + .expect("panic in `blueprint_target_set_current`") + .expect("updated target to blueprint 2"); + tokio::time::timeout(Duration::from_secs(10), current_target_task) + .await + .expect("time out waiting for `blueprint_target_get_current`") + .expect("panic in `blueprint_target_get_current`"); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint .all_omicron_zones(BlueprintZoneFilter::All) diff --git a/nexus/reconfigurator/execution/src/external_networking.rs b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs similarity index 60% rename from nexus/reconfigurator/execution/src/external_networking.rs rename to nexus/db-queries/src/db/datastore/deployment/external_networking.rs index 3e98aa4ff0..b6ced8e2c5 100644 --- a/nexus/reconfigurator/execution/src/external_networking.rs +++ b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs @@ -5,17 +5,18 @@ //! Manages allocation and deallocation of external networking resources //!
required for blueprint realization -use anyhow::bail; -use anyhow::Context; +use crate::context::OpContext; +use crate::db::fixed_data::vpc_subnet::DNS_VPC_SUBNET; +use crate::db::fixed_data::vpc_subnet::NEXUS_VPC_SUBNET; +use crate::db::fixed_data::vpc_subnet::NTP_VPC_SUBNET; +use crate::db::DataStore; +use crate::db::DbConnection; use nexus_db_model::IncompleteNetworkInterface; -use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::fixed_data::vpc_subnet::DNS_VPC_SUBNET; -use nexus_db_queries::db::fixed_data::vpc_subnet::NEXUS_VPC_SUBNET; -use nexus_db_queries::db::fixed_data::vpc_subnet::NTP_VPC_SUBNET; -use nexus_db_queries::db::DataStore; +use nexus_db_model::IpPool; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::OmicronZoneExternalIp; +use omicron_common::api::external::Error; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::NetworkInterfaceKind; @@ -28,415 +29,394 @@ use slog::warn; use slog::Logger; use slog_error_chain::InlineErrorChain; -pub(crate) async fn ensure_zone_external_networking_allocated( - opctx: &OpContext, - datastore: &DataStore, - zones_to_allocate: impl Iterator<Item = &BlueprintZoneConfig>, -) -> anyhow::Result<()> { - for z in zones_to_allocate { - let Some((external_ip, nic)) = z.zone_type.external_networking() else { - continue; - }; +impl DataStore { + pub(super) async fn ensure_zone_external_networking_allocated_on_connection( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + opctx: &OpContext, + zones_to_allocate: impl Iterator<Item = &BlueprintZoneConfig>, + ) -> Result<(), Error> { + // Looking up the service pool ID requires an opctx; we'll do this once + // up front and reuse the pool ID (which never changes) in the loop + // below.
+ let (_, pool) = self.ip_pools_service_lookup(opctx).await?; + + for z in zones_to_allocate { + let Some((external_ip, nic)) = z.zone_type.external_networking() + else { + continue; + }; - let log = opctx.log.new(slog::o!( - "action" => "allocate-external-networking", - "zone_kind" => z.zone_type.kind().report_str(), - "zone_id" => z.id.to_string(), - "ip" => format!("{external_ip:?}"), - "nic" => format!("{nic:?}"), - )); - - let kind = z.zone_type.kind(); - ensure_external_service_ip( - opctx, - datastore, - kind, - z.id, - external_ip, - &log, - ) - .await?; - ensure_service_nic(opctx, datastore, kind, z.id, nic, &log).await?; + let log = opctx.log.new(slog::o!( + "action" => "allocate-external-networking", + "zone_kind" => z.zone_type.kind().report_str(), + "zone_id" => z.id.to_string(), + "ip" => format!("{external_ip:?}"), + "nic" => format!("{nic:?}"), + )); + + let kind = z.zone_type.kind(); + self.ensure_external_service_ip( + conn, + &pool, + kind, + z.id, + external_ip, + &log, + ) + .await?; + self.ensure_service_nic(conn, kind, z.id, nic, &log).await?; + } + + Ok(()) } - Ok(()) -} + pub(super) async fn ensure_zone_external_networking_deallocated_on_connection( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + log: &Logger, + zones_to_deallocate: impl Iterator<Item = &BlueprintZoneConfig>, + ) -> Result<(), Error> { + for z in zones_to_deallocate { + let Some((external_ip, nic)) = z.zone_type.external_networking() + else { + continue; + }; -pub(crate) async fn ensure_zone_external_networking_deallocated( - opctx: &OpContext, - datastore: &DataStore, - zones_to_deallocate: impl Iterator<Item = &BlueprintZoneConfig>, -) -> anyhow::Result<()> { - for z in zones_to_deallocate { - let Some((external_ip, nic)) = z.zone_type.external_networking() else { - continue; - }; + let kind = z.zone_type.kind(); + let log = log.new(slog::o!( + "action" => "deallocate-external-networking", + "zone_kind" => kind.report_str(), + "zone_id" => z.id.to_string(), + "ip" => format!("{external_ip:?}"), + "nic" => format!("{nic:?}"), + )); - let kind = z.zone_type.kind(); - let log = opctx.log.new(slog::o!( - "action" => "deallocate-external-networking", - "zone_kind" => kind.report_str(), - "zone_id" => z.id.to_string(), - "ip" => format!("{external_ip:?}"), - "nic" => format!("{nic:?}"), - )); - - let deleted_ip = datastore - .deallocate_external_ip(opctx, external_ip.id().into_untyped_uuid()) - .await - .with_context(|| { - format!( - "failed to delete external IP {external_ip:?} \ - for {} zone {}", - kind.report_str(), - z.id + let deleted_ip = self + .deallocate_external_ip_on_connection( + conn, + external_ip.id().into_untyped_uuid(), ) - })?; - if deleted_ip { - info!(log, "successfully deleted Omicron zone external IP"); - } else { - debug!(log, "Omicron zone external IP already deleted"); - } + .await?; + if deleted_ip { + info!(log, "successfully deleted Omicron zone external IP"); + } else { + debug!(log, "Omicron zone external IP already deleted"); + } - let deleted_nic = datastore - .service_delete_network_interface( - opctx, - z.id.into_untyped_uuid(), - nic.id, - ) - .await - .with_context(|| { - format!( - "failed to delete service VNIC {nic:?} for {} zone {}", - kind.report_str(), - z.id + let deleted_nic = self + .service_delete_network_interface_on_connection( + conn, + z.id.into_untyped_uuid(), + nic.id, ) - })?; - if deleted_nic { - info!(log,
"successfully deleted Omicron zone vNIC"); + } else { + debug!(log, "Omicron zone vNIC already deleted"); + } } - } - Ok(()) -} - -// Helper function to determine whether a given external IP address is -// already allocated to a specific service zone. -async fn is_external_ip_already_allocated( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - zone_id: OmicronZoneUuid, - external_ip: OmicronZoneExternalIp, - log: &Logger, -) -> anyhow::Result { - // localhost is used by many components in the test suite. We can't use - // the normal path because normally a given external IP must only be - // used once. Just treat localhost in the test suite as though it's - // already allocated. We do the same in is_nic_already_allocated(). - if cfg!(test) && external_ip.ip().is_loopback() { - return Ok(true); + Ok(()) } - let allocated_ips = datastore - .external_ip_list_service(opctx, zone_id.into_untyped_uuid()) - .await - .with_context(|| { - format!( - "failed to look up external IPs for {} {zone_id}", - zone_kind.report_str() + // Helper function to determine whether a given external IP address is + // already allocated to a specific service zone. + async fn is_external_ip_already_allocated( + &self, + conn: &async_bb8_diesel::Connection, + zone_id: OmicronZoneUuid, + external_ip: OmicronZoneExternalIp, + log: &Logger, + ) -> Result { + // localhost is used by many components in the test suite. We can't use + // the normal path because normally a given external IP must only be + // used once. Just treat localhost in the test suite as though it's + // already allocated. We do the same in is_nic_already_allocated(). + if cfg!(any(test, feature = "testing")) + && external_ip.ip().is_loopback() + { + return Ok(true); + } + + let allocated_ips = self + .external_ip_list_service_on_connection( + conn, + zone_id.into_untyped_uuid(), ) - })?; + .await?; - // We expect to find either 0 or exactly 1 IP for any given zone. If 0, - // we know the IP isn't allocated; if 1, we'll check that it matches - // below. - let existing_ip = match allocated_ips.as_slice() { - [] => { - info!(log, "external IP allocation required for zone"); + // We expect to find either 0 or exactly 1 IP for any given zone. If 0, + // we know the IP isn't allocated; if 1, we'll check that it matches + // below. + let existing_ip = match allocated_ips.as_slice() { + [] => { + info!(log, "external IP allocation required for zone"); - return Ok(false); - } - [ip] => ip, - _ => { + return Ok(false); + } + [ip] => ip, + _ => { + warn!( + log, "zone has multiple IPs allocated"; + "allocated_ips" => ?allocated_ips, + ); + return Err(Error::invalid_request(format!( + "zone {zone_id} already has {} IPs allocated (expected 1)", + allocated_ips.len() + ))); + } + }; + + // We expect this to always succeed; a failure here means we've stored + // an Omicron zone IP in the database that can't be converted back to an + // Omicron zone IP! 
+ let existing_ip = match OmicronZoneExternalIp::try_from(existing_ip) { + Ok(existing_ip) => existing_ip, + Err(err) => { + error!(log, "invalid IP in database for zone"; &err); + return Err(Error::invalid_request(format!( + "zone {zone_id} has invalid IP database record: {}", + InlineErrorChain::new(&err) + ))); + } + }; + + if existing_ip == external_ip { + info!(log, "found already-allocated external IP"); + Ok(true) + } else { warn!( - log, "zone has multiple IPs allocated"; - "allocated_ips" => ?allocated_ips, - ); - bail!( - "zone {zone_id} already has {} IPs allocated (expected 1)", - allocated_ips.len() - ); - } - }; - - // We expect this to always succeed; a failure here means we've stored - // an Omicron zone IP in the database that can't be converted back to an - // Omicron zone IP! - let existing_ip = match OmicronZoneExternalIp::try_from(existing_ip) { - Ok(existing_ip) => existing_ip, - Err(err) => { - error!(log, "invalid IP in database for zone"; &err); - bail!( - "zone {zone_id} has invalid IP database record: {}", - InlineErrorChain::new(&err) + log, "zone has unexpected IP allocated"; + "allocated_ip" => ?existing_ip, ); + return Err(Error::invalid_request(format!( + "zone {zone_id} has a different IP allocated ({existing_ip:?})", + ))); } - }; - - if existing_ip == external_ip { - info!(log, "found already-allocated external IP"); - Ok(true) - } else { - warn!( - log, "zone has unexpected IP allocated"; - "allocated_ip" => ?existing_ip, - ); - bail!("zone {zone_id} has a different IP allocated ({existing_ip:?})",); } -} // Helper function to determine whether a given NIC is already allocated to -// a specific service zone. -async fn is_nic_already_allocated( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - zone_id: OmicronZoneUuid, - nic: &NetworkInterface, - log: &Logger, -) -> anyhow::Result<bool> { - // See the comment in is_external_ip_already_allocated(). - if cfg!(test) && nic.ip.is_loopback() { - return Ok(true); - } + // Helper function to determine whether a given NIC is already allocated to + // a specific service zone. + async fn is_nic_already_allocated( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + zone_id: OmicronZoneUuid, + nic: &NetworkInterface, + log: &Logger, + ) -> Result<bool, Error> { + // See the comment in is_external_ip_already_allocated(). + if cfg!(any(test, feature = "testing")) && nic.ip.is_loopback() { + return Ok(true); + } - let allocated_nics = datastore - .service_list_network_interfaces(opctx, zone_id.into_untyped_uuid()) - .await - .with_context(|| { - format!( - "failed to look up NICs for {} {zone_id}", - zone_kind.report_str() + let allocated_nics = self + .service_list_network_interfaces_on_connection( + conn, + zone_id.into_untyped_uuid(), ) - })?; - - if !allocated_nics.is_empty() { - // All the service zones that want NICs only expect to have a single - // one. Bail out here if this zone already has one or more allocated - // NICs but not the one we think it needs. - // - // This doesn't check the allocated NIC's subnet against our NICs, - // because that would require an extra DB lookup. We'll assume if - // these main properties are correct, the subnet is too.
- for allocated_nic in &allocated_nics { - if allocated_nic.ip.ip() == nic.ip - && *allocated_nic.mac == nic.mac - && *allocated_nic.slot == nic.slot - && allocated_nic.primary == nic.primary - { - info!(log, "found already-allocated NIC"); - return Ok(true); + .await?; + + if !allocated_nics.is_empty() { + // All the service zones that want NICs only expect to have a single + // one. Bail out here if this zone already has one or more allocated + // NICs but not the one we think it needs. + // + // This doesn't check the allocated NIC's subnet against our NICs, + // because that would require an extra DB lookup. We'll assume if + // these main properties are correct, the subnet is too. + for allocated_nic in &allocated_nics { + if allocated_nic.ip.ip() == nic.ip + && *allocated_nic.mac == nic.mac + && *allocated_nic.slot == nic.slot + && allocated_nic.primary == nic.primary + { + info!(log, "found already-allocated NIC"); + return Ok(true); + } } - } - - warn!( - log, "zone has unexpected NICs allocated"; - "allocated_nics" => ?allocated_nics, - ); - bail!( - "zone {zone_id} already has {} non-matching NIC(s) allocated", - allocated_nics.len() - ); - } + warn!( + log, "zone has unexpected NICs allocated"; + "allocated_nics" => ?allocated_nics, + ); - info!(log, "NIC allocation required for zone"); + return Err(Error::invalid_request(format!( + "zone {zone_id} already has {} non-matching NIC(s) allocated", + allocated_nics.len() + ))); + } - Ok(false) -} + info!(log, "NIC allocation required for zone"); -async fn ensure_external_service_ip( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - zone_id: OmicronZoneUuid, - external_ip: OmicronZoneExternalIp, - log: &Logger, -) -> anyhow::Result<()> { - // Only attempt to allocate `external_ip` if it isn't already assigned - // to this zone. - // - // Checking for the existing of the external IP and then creating it - // if not found inserts a classic TOCTOU race: what if another Nexus - // is running concurrently, we both check and see that the IP is not - // allocated, then both attempt to create it? We believe this is - // okay: the loser of the race (i.e., the one whose create tries to - // commit second) will fail to allocate the IP, which will bubble - // out and prevent realization of the current blueprint. That's - // exactly what we want if two Nexuses try to realize the same - // blueprint at the same time. - if is_external_ip_already_allocated( - opctx, - datastore, - zone_kind, - zone_id, - external_ip, - log, - ) - .await? - { - return Ok(()); + Ok(false) } - datastore - .external_ip_allocate_omicron_zone( - opctx, + + async fn ensure_external_service_ip( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + pool: &IpPool, + zone_kind: ZoneKind, + zone_id: OmicronZoneUuid, + external_ip: OmicronZoneExternalIp, + log: &Logger, + ) -> Result<(), Error> { + // Only attempt to allocate `external_ip` if it isn't already assigned + // to this zone. + // + // Checking for the existence of the external IP and then creating it + // if not found inserts a classic TOCTOU race: what if another Nexus + // is running concurrently, we both check and see that the IP is not + // allocated, then both attempt to create it? We believe this is + // okay: the loser of the race (i.e., the one whose create tries to + // commit second) will fail to allocate the IP, which will bubble + // out and prevent realization of the current blueprint. That's + // exactly what we want if two Nexuses try to realize the same + // blueprint at the same time.
+ if self + .is_external_ip_already_allocated(conn, zone_id, external_ip, log) + .await? + { + return Ok(()); + } + self.external_ip_allocate_omicron_zone_on_connection( + conn, + pool, zone_id, zone_kind, external_ip, ) - .await - .with_context(|| { - format!( - "failed to allocate IP to {} {zone_id}: {external_ip:?}", - zone_kind.report_str() - ) - })?; + .await?; - info!(log, "successfully allocated external IP"); + info!(log, "successfully allocated external IP"); - Ok(()) -} + Ok(()) + } -// All service zones with external connectivity get service vNICs. -async fn ensure_service_nic( - opctx: &OpContext, - datastore: &DataStore, - zone_kind: ZoneKind, - service_id: OmicronZoneUuid, - nic: &NetworkInterface, - log: &Logger, -) -> anyhow::Result<()> { - // We don't pass `nic.kind` into the database below, but instead - // explicitly call `service_create_network_interface`. Ensure this is - // indeed a service NIC. - match &nic.kind { - NetworkInterfaceKind::Instance { .. } => { - bail!("invalid NIC kind (expected service, got instance)") - } - NetworkInterfaceKind::Probe { .. } => { - bail!("invalid NIC kind (expected service, got probe)") + // All service zones with external connectivity get service vNICs. + async fn ensure_service_nic( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + zone_kind: ZoneKind, + service_id: OmicronZoneUuid, + nic: &NetworkInterface, + log: &Logger, + ) -> Result<(), Error> { + // We don't pass `nic.kind` into the database below, but instead + // explicitly call `service_create_network_interface`. Ensure this is + // indeed a service NIC. + match &nic.kind { + NetworkInterfaceKind::Instance { .. } => { + return Err(Error::invalid_request( + "invalid NIC kind (expected service, got instance)", + )); + } + NetworkInterfaceKind::Probe { .. } => { + return Err(Error::invalid_request( + "invalid NIC kind (expected service, got probe)", + )); + } + NetworkInterfaceKind::Service { .. } => (), } - NetworkInterfaceKind::Service { .. } => (), - } - let nic_subnet = match zone_kind { - ZoneKind::BoundaryNtp => &*NTP_VPC_SUBNET, - ZoneKind::ExternalDns => &*DNS_VPC_SUBNET, - ZoneKind::Nexus => &*NEXUS_VPC_SUBNET, - ZoneKind::Clickhouse - | ZoneKind::ClickhouseKeeper - | ZoneKind::CockroachDb - | ZoneKind::Crucible - | ZoneKind::CruciblePantry - | ZoneKind::InternalDns - | ZoneKind::InternalNtp - | ZoneKind::Oximeter => { - bail!("no VPC subnet available for {} zone", zone_kind.report_str()) + let nic_subnet = match zone_kind { + ZoneKind::BoundaryNtp => &*NTP_VPC_SUBNET, + ZoneKind::ExternalDns => &*DNS_VPC_SUBNET, + ZoneKind::Nexus => &*NEXUS_VPC_SUBNET, + ZoneKind::Clickhouse + | ZoneKind::ClickhouseKeeper + | ZoneKind::CockroachDb + | ZoneKind::Crucible + | ZoneKind::CruciblePantry + | ZoneKind::InternalDns + | ZoneKind::InternalNtp + | ZoneKind::Oximeter => { + return Err(Error::invalid_request(format!( + "no VPC subnet available for {} zone", + zone_kind.report_str() + ))); + } + }; + + // Only attempt to allocate `nic` if it isn't already assigned to this + // zone. + // + // This is subject to the same kind of TOCTOU race as described for IP + // allocation in `ensure_external_service_ip`, and we believe it's okay + // for the same reasons as described there. + if self.is_nic_already_allocated(conn, service_id, nic, log).await? { + return Ok(()); } - }; - - // Only attempt to allocate `nic` if it isn't already assigned to this - // zone.
- // - // This is subject to the same kind of TOCTOU race as described for IP - // allocation in `ensure_external_service_ip`, and we believe it's okay - // for the same reasons as described there. - if is_nic_already_allocated( - opctx, datastore, zone_kind, service_id, nic, log, - ) - .await? - { - return Ok(()); - } - let nic_arg = IncompleteNetworkInterface::new_service( - nic.id, - service_id.into_untyped_uuid(), - nic_subnet.clone(), - IdentityMetadataCreateParams { - name: nic.name.clone(), - description: format!("{} service vNIC", zone_kind.report_str()), - }, - nic.ip, - nic.mac, - nic.slot, - ) - .with_context(|| { - format!( - "failed to convert NIC into IncompleteNetworkInterface: {nic:?}" - ) - })?; - let created_nic = datastore - .service_create_network_interface(opctx, nic_arg) - .await - .map_err(|err| err.into_external()) - .with_context(|| { - format!( - "failed to allocate NIC to {} {service_id}: {nic:?}", - zone_kind.report_str() - ) - })?; - - // We don't pass all the properties of `nic` into the create request - // above. Double-check that the properties the DB assigned match - // what we expect. - // - // We do not check `nic.vni`, because it's not stored in the - // database. (All services are given the constant vni - // `Vni::SERVICES_VNI`.) - if created_nic.primary != nic.primary || *created_nic.slot != nic.slot { - warn!( - log, "unexpected property on allocated NIC"; - "allocated_primary" => created_nic.primary, - "allocated_slot" => *created_nic.slot, - ); - - // Now what? We've allocated a NIC in the database but it's - // incorrect. Should we try to delete it? That would be best - // effort (we could fail to delete, or we could crash between - // creation and deletion). + let nic_arg = IncompleteNetworkInterface::new_service( + nic.id, + service_id.into_untyped_uuid(), + nic_subnet.clone(), + IdentityMetadataCreateParams { + name: nic.name.clone(), + description: format!("{} service vNIC", zone_kind.report_str()), + }, + nic.ip, + nic.mac, + nic.slot, + )?; + let created_nic = self + .create_network_interface_raw_conn(conn, nic_arg) + .await + .map_err(|err| err.into_external())?; + + // We don't pass all the properties of `nic` into the create request + // above. Double-check that the properties the DB assigned match + // what we expect. // - // We only expect services to have one NIC, so the only way it - // should be possible to get a different primary/slot value is - // if somehow this same service got a _different_ NIC allocated - // to it in the TOCTOU race window above. That should be - // impossible with the way we generate blueprints, so we'll just - // return a scary error here and expect to never see it. - bail!( - "database cleanup required: unexpected NIC ({created_nic:?}) \ - allocated for {} {service_id}", - zone_kind.report_str(), - ); - } + // We do not check `nic.vni`, because it's not stored in the + // database. (All services are given the constant vni + // `Vni::SERVICES_VNI`.) + if created_nic.primary != nic.primary || *created_nic.slot != nic.slot { + warn!( + log, "unexpected property on allocated NIC"; + "allocated_primary" => created_nic.primary, + "allocated_slot" => *created_nic.slot, + ); - info!(log, "successfully allocated service vNIC"); + // Now what? We've allocated a NIC in the database but it's + // incorrect. Should we try to delete it? That would be best + // effort (we could fail to delete, or we could crash between + // creation and deletion). 
+ // + // We only expect services to have one NIC, so the only way it + // should be possible to get a different primary/slot value is + // if somehow this same service got a _different_ NIC allocated + // to it in the TOCTOU race window above. That should be + // impossible with the way we generate blueprints, so we'll just + // return a scary error here and expect to never see it. + return Err(Error::invalid_request(format!( + "database cleanup required: unexpected NIC ({created_nic:?}) \ + allocated for {} {service_id}", + zone_kind.report_str(), + ))); + } + + info!(log, "successfully allocated service vNIC"); - Ok(()) + Ok(()) + } } #[cfg(test)] mod tests { use super::*; + use crate::db::datastore::test_utils::datastore_test; + use crate::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; + use anyhow::Context as _; use async_bb8_diesel::AsyncSimpleConnection; use chrono::DateTime; use chrono::Utc; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; use nexus_db_model::SqlU16; - use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; - use nexus_test_utils_macros::nexus_test; + use nexus_test_utils::db::test_setup_database; use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; @@ -455,6 +435,7 @@ mod tests { use omicron_common::api::external::MacAddr; use omicron_common::api::external::Vni; use omicron_common::zpool_name::ZpoolName; + use omicron_test_utils::dev; use omicron_uuid_kinds::ExternalIpUuid; use omicron_uuid_kinds::ZpoolUuid; use oxnet::IpNet; @@ -463,9 +444,6 @@ use std::net::SocketAddr; use uuid::Uuid; - type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>; - struct Harness { external_ips_range: IpRange, external_ips: IpRangeIter, @@ -658,14 +636,11 @@ ] } - async fn assert_ips_exist_in_datastore( - &self, - opctx: &OpContext, - datastore: &DataStore, - ) { + async fn assert_ips_exist_in_datastore(&self, datastore: &DataStore) { + let conn = datastore.pool_connection_for_tests().await.unwrap(); let db_nexus_ips = datastore - .external_ip_list_service( - &opctx, + .external_ip_list_service_on_connection( + &conn, self.nexus_id.into_untyped_uuid(), ) .await @@ -685,8 +660,8 @@ assert_eq!(db_nexus_ips[0].last_port, SqlU16(65535)); let db_dns_ips = datastore - .external_ip_list_service( - &opctx, + .external_ip_list_service_on_connection( + &conn, self.dns_id.into_untyped_uuid(), ) .await @@ -709,8 +684,8 @@ assert_eq!(db_dns_ips[0].last_port, SqlU16(65535)); let db_ntp_ips = datastore - .external_ip_list_service( - &opctx, + .external_ip_list_service_on_connection( + &conn, self.ntp_id.into_untyped_uuid(), ) .await @@ -735,14 +710,11 @@ ); } - async fn assert_nics_exist_in_datastore( - &self, - opctx: &OpContext, - datastore: &DataStore, - ) { + async fn assert_nics_exist_in_datastore(&self, datastore: &DataStore) { + let conn = datastore.pool_connection_for_tests().await.unwrap(); let db_nexus_nics = datastore - .service_list_network_interfaces( - &opctx, + .service_list_network_interfaces_on_connection( + &conn, self.nexus_id.into_untyped_uuid(), ) .await @@ -761,8 +733,8 @@ assert_eq!(db_nexus_nics[0].primary, self.nexus_nic.primary); let db_dns_nics = datastore - .service_list_network_interfaces( - &opctx, + .service_list_network_interfaces_on_connection( + &conn, self.dns_id.into_untyped_uuid(), ) .await @@ -781,8 +753,8 @@
mod tests { assert_eq!(db_dns_nics[0].primary, self.dns_nic.primary); let db_ntp_nics = datastore - .service_list_network_interfaces( - &opctx, + .service_list_network_interfaces_on_connection( + &conn, self.ntp_id.into_untyped_uuid(), ) .await @@ -898,21 +870,17 @@ mod tests { } } - #[nexus_test] - async fn test_allocate_external_networking( - cptestctx: &ControlPlaneTestContext, - ) { + #[tokio::test] + async fn test_allocate_external_networking() { // Set up. - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); + usdt::register_probes().unwrap(); + let logctx = dev::test_setup_log("test_allocate_external_networking"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; // Generate the test values we care about. let mut harness = Harness::new(); - harness.set_up_service_ip_pool(&opctx, datastore).await; + harness.set_up_service_ip_pool(&opctx, &datastore).await; // Build the `zones` map needed by `ensure_zone_resources_allocated`, // with an arbitrary sled_id. @@ -920,31 +888,33 @@ // Initialize resource allocation: this should succeed and create all // the relevant db records. - ensure_zone_external_networking_allocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); // Check that the external IP and NIC records were created. - harness.assert_ips_exist_in_datastore(&opctx, datastore).await; - harness.assert_nics_exist_in_datastore(&opctx, datastore).await; + harness.assert_ips_exist_in_datastore(&datastore).await; + harness.assert_nics_exist_in_datastore(&datastore).await; // We should be able to run the function again with the same inputs, and // it should succeed without inserting any new records. - ensure_zone_external_networking_allocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); - harness.assert_ips_exist_in_datastore(&opctx, datastore).await; - harness.assert_nics_exist_in_datastore(&opctx, datastore).await; + datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); + harness.assert_ips_exist_in_datastore(&datastore).await; + harness.assert_nics_exist_in_datastore(&datastore).await; // Now that we've tested the happy path, try some requests that ought to // fail because the request includes an external IP that doesn't match @@ -1027,13 +997,14 @@ }; // and check that we get the error we expect.
- let err = ensure_zone_external_networking_allocated( - &opctx, - datastore, - mutated_zones.iter(), - ) - .await - .expect_err("unexpected success"); + let err = datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + mutated_zones.iter(), + ) + .await + .expect_err("unexpected success"); assert!( err.to_string().contains(&expected_error), "expected {expected_error:?}, got {err:#}" ); @@ -1085,9 +1056,9 @@ { let expected_error = mutate_nic_fn(zone.id, nic); - let err = ensure_zone_external_networking_allocated( + let err = datastore.ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), &opctx, - datastore, mutated_zones.iter(), ) .await @@ -1111,9 +1082,9 @@ { let expected_error = mutate_nic_fn(zone.id, nic); - let err = ensure_zone_external_networking_allocated( + let err = datastore.ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), &opctx, - datastore, mutated_zones.iter(), ) .await @@ -1137,9 +1108,9 @@ { let expected_error = mutate_nic_fn(zone.id, nic); - let err = ensure_zone_external_networking_allocated( + let err = datastore.ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), &opctx, - datastore, mutated_zones.iter(), ) .await @@ -1154,23 +1125,23 @@ } } } + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); } - #[nexus_test] - async fn test_deallocate_external_networking( - cptestctx: &ControlPlaneTestContext, - ) { + #[tokio::test] + async fn test_deallocate_external_networking() { // Set up. - let nexus = &cptestctx.server.server_context().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); + usdt::register_probes().unwrap(); + let logctx = dev::test_setup_log("test_deallocate_external_networking"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; // Generate the test values we care about. let harness = Harness::new(); - harness.set_up_service_ip_pool(&opctx, datastore).await; + harness.set_up_service_ip_pool(&opctx, &datastore).await; // Build the `zones` map needed by `ensure_zone_resources_allocated`, // with an arbitrary sled_id. @@ -1178,45 +1149,52 @@ // Initialize resource allocation: this should succeed and create all // the relevant db records. - ensure_zone_external_networking_allocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_allocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &opctx, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); // Check that the external IP and NIC records were created. - harness.assert_ips_exist_in_datastore(&opctx, datastore).await; - harness.assert_nics_exist_in_datastore(&opctx, datastore).await; + harness.assert_ips_exist_in_datastore(&datastore).await; + harness.assert_nics_exist_in_datastore(&datastore).await; // Deallocate resources: this should succeed and mark all relevant db // records deleted.
- ensure_zone_external_networking_deallocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_deallocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &logctx.log, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); - harness.assert_ips_are_deleted_in_datastore(datastore).await; - harness.assert_nics_are_deleted_in_datastore(datastore).await; + harness.assert_ips_are_deleted_in_datastore(&datastore).await; + harness.assert_nics_are_deleted_in_datastore(&datastore).await; // This operation should be idempotent: we can run it again, and the // records remain deleted. - ensure_zone_external_networking_deallocated( - &opctx, - datastore, - zones.iter(), - ) - .await - .with_context(|| format!("{zones:#?}")) - .unwrap(); + datastore + .ensure_zone_external_networking_deallocated_on_connection( + &datastore.pool_connection_for_tests().await.unwrap(), + &logctx.log, + zones.iter(), + ) + .await + .with_context(|| format!("{zones:#?}")) + .unwrap(); + + harness.assert_ips_are_deleted_in_datastore(&datastore).await; + harness.assert_nics_are_deleted_in_datastore(&datastore).await; - harness.assert_ips_are_deleted_in_datastore(datastore).await; - harness.assert_nics_are_deleted_in_datastore(datastore).await; + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index 9a3928dd58..4b7f4a3825 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -23,6 +23,7 @@ use crate::db::model::ExternalIp; use crate::db::model::FloatingIp; use crate::db::model::IncompleteExternalIp; use crate::db::model::IpKind; +use crate::db::model::IpPool; use crate::db::model::Name; use crate::db::pagination::paginated; use crate::db::pagination::Paginator; @@ -169,9 +170,9 @@ impl DataStore { } /// Fetch all external IP addresses of any kind for the provided service. - pub async fn external_ip_list_service( + pub async fn external_ip_list_service_on_connection( &self, - opctx: &OpContext, + conn: &async_bb8_diesel::Connection<DbConnection>, service_id: Uuid, ) -> LookupResult<Vec<ExternalIp>> { use db::schema::external_ip::dsl; dsl::external_ip .filter(dsl::parent_id.eq(service_id)) .filter(dsl::time_deleted.is_null()) .select(ExternalIp::as_select()) - .get_results_async(&*self.pool_connection_authorized(opctx).await?) + .get_results_async(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -329,6 +330,25 @@ impl DataStore { self.allocate_external_ip(opctx, data).await } + /// Variant of [Self::external_ip_allocate_omicron_zone] which may be called + /// from a transaction context.
+ pub(crate) async fn external_ip_allocate_omicron_zone_on_connection( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + service_pool: &IpPool, + zone_id: OmicronZoneUuid, + zone_kind: ZoneKind, + external_ip: OmicronZoneExternalIp, + ) -> Result<ExternalIp, TransactionError<Error>> { + let data = IncompleteExternalIp::for_omicron_zone( + service_pool.id(), + external_ip, + zone_id, + zone_kind, + ); + Self::allocate_external_ip_on_connection(conn, data).await + } + /// List one page of all external IPs allocated to internal services pub async fn external_ip_list_service_all( &self, @@ -636,6 +656,17 @@ &self, opctx: &OpContext, ip_id: Uuid, + ) -> Result<bool, Error> { + let conn = self.pool_connection_authorized(opctx).await?; + self.deallocate_external_ip_on_connection(&conn, ip_id).await + } + + /// Variant of [Self::deallocate_external_ip] which may be called from a + /// transaction context. + pub(crate) async fn deallocate_external_ip_on_connection( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + ip_id: Uuid, ) -> Result<bool, Error> { use db::schema::external_ip::dsl; let now = Utc::now(); @@ -644,7 +675,7 @@ .filter(dsl::id.eq(ip_id)) .set(dsl::time_deleted.eq(now)) .check_if_exists::<ExternalIp>(ip_id) - .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .execute_and_check(conn) .await .map(|r| match r.status { UpdateStatus::Updated => true, diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index c5a8992cd2..1b1ff8a75b 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -162,30 +162,17 @@ impl DataStore { } /// List network interfaces associated with a given service. - pub async fn service_list_network_interfaces( + pub async fn service_list_network_interfaces_on_connection( &self, - opctx: &OpContext, + conn: &async_bb8_diesel::Connection<DbConnection>, service_id: Uuid, ) -> ListResultVec<ServiceNetworkInterface> { - // See the comment in `service_create_network_interface`. There's no - // obvious parent for a service network interface (as opposed to - // instance network interfaces, which require ListChildren on the - // instance to list). As a logical proxy, we check for listing children - // of the service IP pool. - let (authz_service_ip_pool, _) = - self.ip_pools_service_lookup(opctx).await?; - opctx - .authorize(authz::Action::ListChildren, &authz_service_ip_pool) - .await?; - use db::schema::service_network_interface::dsl; dsl::service_network_interface .filter(dsl::time_deleted.is_null()) .filter(dsl::service_id.eq(service_id)) .select(ServiceNetworkInterface::as_select()) - .get_results_async::<ServiceNetworkInterface>( - &*self.pool_connection_authorized(opctx).await?, - ) + .get_results_async::<ServiceNetworkInterface>(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -450,6 +437,26 @@ impl DataStore { .await .map_err(network_interface::DeleteError::External)?; + let conn = self + .pool_connection_authorized(opctx) + .await + .map_err(network_interface::DeleteError::External)?; + self.service_delete_network_interface_on_connection( + &conn, + service_id, + network_interface_id, + ) + .await + } + + /// Variant of [Self::service_delete_network_interface] which may be called + /// from a transaction context.
+ pub async fn service_delete_network_interface_on_connection( + &self, + conn: &async_bb8_diesel::Connection<DbConnection>, + service_id: Uuid, + network_interface_id: Uuid, + ) -> Result<bool, network_interface::DeleteError> { let query = network_interface::DeleteQuery::new( NetworkInterfaceKind::Service, service_id, @@ -457,12 +464,7 @@ ); query .clone() - .execute_and_check( - &*self - .pool_connection_authorized(opctx) - .await - .map_err(network_interface::DeleteError::External)?, - ) + .execute_and_check(conn) .await .map_err(|e| network_interface::DeleteError::from_diesel(e, &query)) } diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index 69f80209c3..a531b66df4 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -43,6 +43,7 @@ async-bb8-diesel.workspace = true diesel.workspace = true httptest.workspace = true ipnet.workspace = true +nexus-db-queries = { workspace = true, features = ["testing"] } nexus-reconfigurator-planning.workspace = true nexus-reconfigurator-preparation.workspace = true nexus-inventory.workspace = true diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index e3d2019230..bb525b1b8b 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -28,7 +28,6 @@ use std::net::SocketAddrV6; mod cockroachdb; mod datasets; mod dns; -mod external_networking; mod omicron_physical_disks; mod omicron_zones; mod overridables; @@ -117,31 +116,14 @@ where "blueprint_id" => %blueprint.id ); - // Deallocate external networking resources for non-externally-reachable - // zones first. This will allow external networking resource allocation to - // succeed if we are swapping an external IP between two zones (e.g., moving - // a specific external IP from an old external DNS zone to a new one.
- external_networking::ensure_zone_external_networking_deallocated( - &opctx, - datastore, - blueprint - .all_omicron_zones_not_in( - BlueprintZoneFilter::ShouldBeExternallyReachable, - ) - .map(|(_sled_id, zone)| zone), - ) - .await - .map_err(|err| vec![err])?; - - external_networking::ensure_zone_external_networking_allocated( - &opctx, - datastore, - blueprint - .all_omicron_zones(BlueprintZoneFilter::ShouldBeExternallyReachable) - .map(|(_sled_id, zone)| zone), - ) - .await - .map_err(|err| vec![err])?; + datastore + .blueprint_ensure_external_networking_resources(&opctx, blueprint) + .await + .map_err(|err| { + vec![anyhow!(err).context( + "failed to ensure external networking resources in database", + )] + })?; let sleds_by_id: BTreeMap<SledUuid, Sled> = datastore .sled_list_all_batched(&opctx, SledFilter::InService) diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 460d74360d..ee780812ae 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -124,6 +124,7 @@ mod test { }; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::DataStore; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::BlueprintZoneFilter; @@ -150,7 +151,9 @@ mod test { type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext<crate::Server>; - fn create_blueprint( + async fn create_blueprint( + datastore: &DataStore, + opctx: &OpContext, blueprint_zones: BTreeMap<SledUuid, BlueprintZonesConfig>, blueprint_disks: BTreeMap<SledUuid, BlueprintPhysicalDisksConfig>, dns_version: Generation, @@ -162,28 +165,46 @@ mod test { .copied() .map(|sled_id| (sled_id, SledState::Active)) .collect::<BTreeMap<_, _>>(); - ( - BlueprintTarget { - target_id: id, - enabled: true, - time_made_target: chrono::Utc::now(), - }, - Blueprint { - id, - blueprint_zones, - blueprint_disks, - sled_state, - cockroachdb_setting_preserve_downgrade: - CockroachDbPreserveDowngrade::DoNotModify, - parent_blueprint_id: None, - internal_dns_version: dns_version, - external_dns_version: dns_version, - cockroachdb_fingerprint: String::new(), - time_created: chrono::Utc::now(), - creator: "test".to_string(), - comment: "test blueprint".to_string(), - }, - ) + + // Ensure the blueprint we're creating is the current target (required + // for successful blueprint realization). This requires its parent to be + // the existing target, so fetch that first.
+ let current_target = datastore + .blueprint_target_get_current(opctx) + .await + .expect("fetched current target blueprint"); + + let target = BlueprintTarget { + target_id: id, + enabled: true, + time_made_target: chrono::Utc::now(), + }; + let blueprint = Blueprint { + id, + blueprint_zones, + blueprint_disks, + sled_state, + cockroachdb_setting_preserve_downgrade: + CockroachDbPreserveDowngrade::DoNotModify, + parent_blueprint_id: Some(current_target.target_id), + internal_dns_version: dns_version, + external_dns_version: dns_version, + cockroachdb_fingerprint: String::new(), + time_created: chrono::Utc::now(), + creator: "test".to_string(), + comment: "test blueprint".to_string(), + }; + + datastore + .blueprint_insert(opctx, &blueprint) + .await + .expect("inserted new blueprint"); + datastore + .blueprint_target_set_current(opctx, target) + .await + .expect("set new blueprint as current target"); + + (target, blueprint) } #[nexus_test(server = crate::Server)] @@ -253,11 +274,16 @@ mod test { // With a target blueprint having no zones, the task should trivially // complete and report a successful (empty) summary. let generation = Generation::new(); - let blueprint = Arc::new(create_blueprint( - BTreeMap::new(), - BTreeMap::new(), - generation, - )); + let blueprint = Arc::new( + create_blueprint( + &datastore, + &opctx, + BTreeMap::new(), + BTreeMap::new(), + generation, + ) + .await, + ); blueprint_tx.send(Some(blueprint)).unwrap(); let value = task.activate(&opctx).await; println!("activating with no zones: {:?}", value); @@ -300,13 +326,16 @@ mod test { // // TODO: add expunged zones to the test (should not be deployed). let mut blueprint = create_blueprint( + &datastore, + &opctx, BTreeMap::from([ (sled_id1, make_zones(BlueprintZoneDisposition::InService)), (sled_id2, make_zones(BlueprintZoneDisposition::Quiesced)), ]), BTreeMap::new(), generation, - ); + ) + .await; // Insert records for the zpools backing the datasets in these zones. for (sled_id, config) in
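For reference, the serialization that both the executor transaction and the locking test above rely on comes from row-level locking: a `SELECT ... FOR UPDATE` on the newest `bp_target` row blocks concurrent `FOR UPDATE` readers and writers of that row until the transaction commits or aborts. Below is a minimal diesel sketch of that idea, using a simplified one-column stand-in for the real `bp_target` schema (hypothetical, not the Omicron code, and synchronous for brevity):

```rust
use diesel::prelude::*;

diesel::table! {
    bp_target (version) {
        version -> Int8,
    }
}

// Read the newest target version while holding a row lock. Any other
// transaction that reads this row with FOR UPDATE, or tries to update
// it, will block until this transaction commits or rolls back.
fn newest_target_version_locked(conn: &mut PgConnection) -> QueryResult<i64> {
    bp_target::table
        .select(bp_target::version)
        .order_by(bp_target::version.desc())
        .for_update() // emits `SELECT ... FOR UPDATE`
        .first(conn)
}
```

This mirrors the `order_by(...).for_update().first_async::<BpTarget>(...)` shape used by `SelectFlavor::ForUpdate` in the diff, which is what forces a concurrent `blueprint_target_set_current` to queue behind the executor's transaction.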