diff --git a/Cargo.lock b/Cargo.lock index 8f855f5219..13b3d6c74e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -250,7 +250,7 @@ checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" [[package]] name = "async-bb8-diesel" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/async-bb8-diesel?rev=1446f7e0c1f05f33a0581abd51fa873c7652ab61#1446f7e0c1f05f33a0581abd51fa873c7652ab61" +source = "git+https://github.com/oxidecomputer/async-bb8-diesel?rev=ed7ab5ef0513ba303d33efd41d3e9e381169d59b#ed7ab5ef0513ba303d33efd41d3e9e381169d59b" dependencies = [ "async-trait", "bb8", @@ -4047,6 +4047,7 @@ dependencies = [ "pem 1.1.1", "petgraph", "pq-sys", + "rand 0.8.5", "rcgen", "ref-cast", "regex", diff --git a/Cargo.toml b/Cargo.toml index 931d885e7a..c88502bb1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -144,7 +144,7 @@ api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" assert_cmd = "2.0.12" -async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "1446f7e0c1f05f33a0581abd51fa873c7652ab61" } +async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } async-trait = "0.1.74" atomicwrites = "0.4.2" authz-macros = { path = "nexus/authz-macros" } diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 0f6d1b911e..85a6b3139c 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -232,7 +232,7 @@ impl SledUpdate { } /// A set of constraints that can be placed on operations that select a sled. -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct SledReservationConstraints { must_select_from: Vec, } diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 94e3a56abf..9d8afd1fea 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -32,6 +32,7 @@ oso.workspace = true paste.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. pq-sys = "*" +rand.workspace = true ref-cast.workspace = true samael.workspace = true serde.workspace = true diff --git a/nexus/db-queries/src/db/collection_attach.rs b/nexus/db-queries/src/db/collection_attach.rs index ea4d9d5beb..fccc1aa324 100644 --- a/nexus/db-queries/src/db/collection_attach.rs +++ b/nexus/db-queries/src/db/collection_attach.rs @@ -563,12 +563,9 @@ where #[cfg(test)] mod test { use super::*; - use crate::db::{ - self, error::TransactionError, identity::Resource as IdentityResource, - }; + use crate::db::{self, identity::Resource as IdentityResource}; use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - ConnectionManager, + AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, }; use chrono::Utc; use db_macros::Resource; @@ -999,22 +996,12 @@ mod test { .set(resource::dsl::collection_id.eq(collection_id)), ); - type TxnError = - TransactionError>; - let result = conn - .transaction_async(|conn| async move { - attach_query.attach_and_get_result_async(&conn).await.map_err( - |e| match e { - AttachError::DatabaseError(e) => TxnError::from(e), - e => TxnError::CustomError(e), - }, - ) - }) - .await; - // "attach_and_get_result" should return the "attached" resource. - let (returned_collection, returned_resource) = - result.expect("Attach should have worked"); + let (returned_collection, returned_resource) = attach_query + .attach_and_get_result_async(&conn) + .await + .expect("Attach should have worked"); + assert_eq!( returned_resource.collection_id.expect("Expected a collection ID"), collection_id diff --git a/nexus/db-queries/src/db/collection_detach_many.rs b/nexus/db-queries/src/db/collection_detach_many.rs index 8df6d4aed4..986cfb70b7 100644 --- a/nexus/db-queries/src/db/collection_detach_many.rs +++ b/nexus/db-queries/src/db/collection_detach_many.rs @@ -479,12 +479,9 @@ where mod test { use super::*; use crate::db::collection_attach::DatastoreAttachTarget; - use crate::db::{ - self, error::TransactionError, identity::Resource as IdentityResource, - }; + use crate::db::{self, identity::Resource as IdentityResource}; use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - ConnectionManager, + AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, }; use chrono::Utc; use db_macros::Resource; @@ -919,21 +916,12 @@ mod test { .set(resource::dsl::collection_id.eq(Option::::None)), ); - type TxnError = - TransactionError>; - let result = conn - .transaction_async(|conn| async move { - detach_query.detach_and_get_result_async(&conn).await.map_err( - |e| match e { - DetachManyError::DatabaseError(e) => TxnError::from(e), - e => TxnError::CustomError(e), - }, - ) - }) - .await; - // "detach_and_get_result" should return the "detached" resource. - let returned_collection = result.expect("Detach should have worked"); + let returned_collection = detach_query + .detach_and_get_result_async(&conn) + .await + .expect("Detach should have worked"); + // The returned values should be the latest value in the DB. assert_eq!( returned_collection, diff --git a/nexus/db-queries/src/db/datastore/address_lot.rs b/nexus/db-queries/src/db/datastore/address_lot.rs index 97dfb59eba..5c2ffbf1d0 100644 --- a/nexus/db-queries/src/db/datastore/address_lot.rs +++ b/nexus/db-queries/src/db/datastore/address_lot.rs @@ -13,9 +13,9 @@ use crate::db::error::TransactionError; use crate::db::model::Name; use crate::db::model::{AddressLot, AddressLotBlock, AddressLotReservedBlock}; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl, Connection}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::{AsyncRunQueryDsl, Connection}; use chrono::Utc; -use diesel::result::Error as DieselError; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use diesel_dtrace::DTraceConnection; use ipnetwork::IpNetwork; @@ -45,11 +45,12 @@ impl DataStore { use db::schema::address_lot::dsl as lot_dsl; use db::schema::address_lot_block::dsl as block_dsl; - self.pool_connection_authorized(opctx) - .await? - // TODO https://github.com/oxidecomputer/omicron/issues/2811 - // Audit external networking database transaction usage - .transaction_async(|conn| async move { + let conn = self.pool_connection_authorized(opctx).await?; + + // TODO https://github.com/oxidecomputer/omicron/issues/2811 + // Audit external networking database transaction usage + self.transaction_retry_wrapper("address_lot_create") + .transaction(&conn, |conn| async move { let lot = AddressLot::new(¶ms.identity, params.kind.into()); let db_lot: AddressLot = @@ -81,15 +82,14 @@ impl DataStore { Ok(AddressLotCreateResult { lot: db_lot, blocks: db_blocks }) }) .await - .map_err(|e| match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( + .map_err(|e| { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::AddressLot, ¶ms.identity.name.as_str(), ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), + ) }) } @@ -113,47 +113,54 @@ impl DataStore { LotInUse, } - type TxnError = TransactionError; + let err = OptionalError::new(); // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - let rsvd: Vec = - rsvd_block_dsl::address_lot_rsvd_block - .filter(rsvd_block_dsl::address_lot_id.eq(id)) - .select(AddressLotReservedBlock::as_select()) - .limit(1) - .load_async(&conn) - .await?; - - if !rsvd.is_empty() { - Err(TxnError::CustomError(AddressLotDeleteError::LotInUse))?; - } + self.transaction_retry_wrapper("address_lot_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let rsvd: Vec = + rsvd_block_dsl::address_lot_rsvd_block + .filter(rsvd_block_dsl::address_lot_id.eq(id)) + .select(AddressLotReservedBlock::as_select()) + .limit(1) + .load_async(&conn) + .await?; + + if !rsvd.is_empty() { + return Err(err.bail(AddressLotDeleteError::LotInUse)); + } + + let now = Utc::now(); + diesel::update(lot_dsl::address_lot) + .filter(lot_dsl::time_deleted.is_null()) + .filter(lot_dsl::id.eq(id)) + .set(lot_dsl::time_deleted.eq(now)) + .execute_async(&conn) + .await?; - let now = Utc::now(); - diesel::update(lot_dsl::address_lot) - .filter(lot_dsl::time_deleted.is_null()) - .filter(lot_dsl::id.eq(id)) - .set(lot_dsl::time_deleted.eq(now)) - .execute_async(&conn) - .await?; - - diesel::delete(block_dsl::address_lot_block) - .filter(block_dsl::address_lot_id.eq(id)) - .execute_async(&conn) - .await?; - - Ok(()) - }) - .await - .map_err(|e| match e { - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - TxnError::CustomError(AddressLotDeleteError::LotInUse) => { - Error::invalid_request("lot is in use") - } - }) + diesel::delete(block_dsl::address_lot_block) + .filter(block_dsl::address_lot_id.eq(id)) + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + AddressLotDeleteError::LotInUse => { + Error::invalid_request("lot is in use") + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn address_lot_list( diff --git a/nexus/db-queries/src/db/datastore/bgp.rs b/nexus/db-queries/src/db/datastore/bgp.rs index ff314a2564..28075b0ded 100644 --- a/nexus/db-queries/src/db/datastore/bgp.rs +++ b/nexus/db-queries/src/db/datastore/bgp.rs @@ -3,11 +3,11 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::Name; use crate::db::model::{BgpAnnounceSet, BgpAnnouncement, BgpConfig}; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use nexus_types::external_api::params; @@ -30,33 +30,33 @@ impl DataStore { use db::schema::{ bgp_announce_set, bgp_announce_set::dsl as announce_set_dsl, }; - let pool = self.pool_connection_authorized(opctx).await?; - - pool.transaction_async(|conn| async move { - let id: Uuid = match &config.bgp_announce_set_id { - NameOrId::Name(name) => { - announce_set_dsl::bgp_announce_set - .filter(bgp_announce_set::time_deleted.is_null()) - .filter(bgp_announce_set::name.eq(name.to_string())) - .select(bgp_announce_set::id) - .limit(1) - .first_async::(&conn) - .await? - } - NameOrId::Id(id) => *id, - }; - - let config = BgpConfig::from_config_create(config, id); - - let result = diesel::insert_into(dsl::bgp_config) - .values(config.clone()) - .returning(BgpConfig::as_returning()) - .get_result_async(&conn) - .await?; - Ok(result) - }) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_config_set") + .transaction(&conn, |conn| async move { + let id: Uuid = match &config.bgp_announce_set_id { + NameOrId::Name(name) => { + announce_set_dsl::bgp_announce_set + .filter(bgp_announce_set::time_deleted.is_null()) + .filter(bgp_announce_set::name.eq(name.to_string())) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await? + } + NameOrId::Id(id) => *id, + }; + + let config = BgpConfig::from_config_create(config, id); + + let result = diesel::insert_into(dsl::bgp_config) + .values(config.clone()) + .returning(BgpConfig::as_returning()) + .get_result_async(&conn) + .await?; + Ok(result) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn bgp_config_delete( @@ -74,54 +74,59 @@ impl DataStore { enum BgpConfigDeleteError { ConfigInUse, } - type TxnError = TransactionError; - - let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async(|conn| async move { - let name_or_id = sel.name_or_id.clone(); - - let id: Uuid = match name_or_id { - NameOrId::Id(id) => id, - NameOrId::Name(name) => { - bgp_config_dsl::bgp_config - .filter(bgp_config::name.eq(name.to_string())) - .select(bgp_config::id) - .limit(1) - .first_async::(&conn) - .await? - } - }; - - let count = - sps_bgp_peer_config_dsl::switch_port_settings_bgp_peer_config - .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) - .count() - .execute_async(&conn) - .await?; - - if count > 0 { - return Err(TxnError::CustomError( - BgpConfigDeleteError::ConfigInUse, - )); - } - diesel::update(bgp_config_dsl::bgp_config) - .filter(bgp_config_dsl::id.eq(id)) - .set(bgp_config_dsl::time_deleted.eq(Utc::now())) - .execute_async(&conn) - .await?; + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_config_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let name_or_id = sel.name_or_id.clone(); + + let id: Uuid = match name_or_id { + NameOrId::Id(id) => id, + NameOrId::Name(name) => { + bgp_config_dsl::bgp_config + .filter(bgp_config::name.eq(name.to_string())) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await? + } + }; + + let count = + sps_bgp_peer_config_dsl::switch_port_settings_bgp_peer_config + .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) + .count() + .execute_async(&conn) + .await?; + + if count > 0 { + return Err(err.bail(BgpConfigDeleteError::ConfigInUse)); + } + + diesel::update(bgp_config_dsl::bgp_config) + .filter(bgp_config_dsl::id.eq(id)) + .set(bgp_config_dsl::time_deleted.eq(Utc::now())) + .execute_async(&conn) + .await?; - Ok(()) - }) - .await - .map_err(|e| match e { - TxnError::CustomError(BgpConfigDeleteError::ConfigInUse) => { - Error::invalid_request("BGP config in use") - } - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + BgpConfigDeleteError::ConfigInUse => { + Error::invalid_request("BGP config in use") + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn bgp_config_get( @@ -131,7 +136,7 @@ impl DataStore { ) -> LookupResult { use db::schema::bgp_config; use db::schema::bgp_config::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let name_or_id = name_or_id.clone(); @@ -140,14 +145,14 @@ impl DataStore { .filter(bgp_config::name.eq(name.to_string())) .select(BgpConfig::as_select()) .limit(1) - .first_async::(&*pool) + .first_async::(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)), NameOrId::Id(id) => dsl::bgp_config .filter(bgp_config::id.eq(id)) .select(BgpConfig::as_select()) .limit(1) - .first_async::(&*pool) + .first_async::(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)), }?; @@ -162,7 +167,7 @@ impl DataStore { ) -> ListResultVec { use db::schema::bgp_config::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; match pagparams { PaginatedBy::Id(pagparams) => { @@ -176,7 +181,7 @@ impl DataStore { } .filter(dsl::time_deleted.is_null()) .select(BgpConfig::as_select()) - .load_async(&*pool) + .load_async(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -195,47 +200,64 @@ impl DataStore { enum BgpAnnounceListError { AnnounceSetNotFound(Name), } - type TxnError = TransactionError; - - let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async(|conn| async move { - let name_or_id = sel.name_or_id.clone(); - - let announce_id: Uuid = match name_or_id { - NameOrId::Id(id) => id, - NameOrId::Name(name) => announce_set_dsl::bgp_announce_set - .filter(bgp_announce_set::time_deleted.is_null()) - .filter(bgp_announce_set::name.eq(name.to_string())) - .select(bgp_announce_set::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError( - BgpAnnounceListError::AnnounceSetNotFound( - Name::from(name.clone()), - ), - ) - })?, - }; - - let result = announce_dsl::bgp_announcement - .filter(announce_dsl::announce_set_id.eq(announce_id)) - .select(BgpAnnouncement::as_select()) - .load_async(&conn) - .await?; - - Ok(result) - }) - .await - .map_err(|e| match e { - TxnError::CustomError( - BgpAnnounceListError::AnnounceSetNotFound(name), - ) => Error::not_found_by_name(ResourceType::BgpAnnounceSet, &name), - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_announce_list") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let name_or_id = sel.name_or_id.clone(); + + let announce_id: Uuid = match name_or_id { + NameOrId::Id(id) => id, + NameOrId::Name(name) => { + announce_set_dsl::bgp_announce_set + .filter( + bgp_announce_set::time_deleted.is_null(), + ) + .filter( + bgp_announce_set::name.eq(name.to_string()), + ) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + err.bail_retryable_or( + e, + BgpAnnounceListError::AnnounceSetNotFound( + Name::from(name.clone()), + ) + ) + })? + } + }; + + let result = announce_dsl::bgp_announcement + .filter(announce_dsl::announce_set_id.eq(announce_id)) + .select(BgpAnnouncement::as_select()) + .load_async(&conn) + .await?; + + Ok(result) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + BgpAnnounceListError::AnnounceSetNotFound(name) => { + Error::not_found_by_name( + ResourceType::BgpAnnounceSet, + &name, + ) + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn bgp_create_announce_set( @@ -246,37 +268,39 @@ impl DataStore { use db::schema::bgp_announce_set::dsl as announce_set_dsl; use db::schema::bgp_announcement::dsl as bgp_announcement_dsl; - let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async(|conn| async move { - let bas: BgpAnnounceSet = announce.clone().into(); + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_create_announce_set") + .transaction(&conn, |conn| async move { + let bas: BgpAnnounceSet = announce.clone().into(); - let db_as: BgpAnnounceSet = - diesel::insert_into(announce_set_dsl::bgp_announce_set) - .values(bas.clone()) - .returning(BgpAnnounceSet::as_returning()) - .get_result_async::(&conn) - .await?; - - let mut db_annoucements = Vec::new(); - for a in &announce.announcement { - let an = BgpAnnouncement { - announce_set_id: db_as.id(), - address_lot_block_id: bas.identity.id, - network: a.network.into(), - }; - let an = - diesel::insert_into(bgp_announcement_dsl::bgp_announcement) - .values(an.clone()) - .returning(BgpAnnouncement::as_returning()) - .get_result_async::(&conn) + let db_as: BgpAnnounceSet = + diesel::insert_into(announce_set_dsl::bgp_announce_set) + .values(bas.clone()) + .returning(BgpAnnounceSet::as_returning()) + .get_result_async::(&conn) .await?; - db_annoucements.push(an); - } - Ok((db_as, db_annoucements)) - }) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + let mut db_annoucements = Vec::new(); + for a in &announce.announcement { + let an = BgpAnnouncement { + announce_set_id: db_as.id(), + address_lot_block_id: bas.identity.id, + network: a.network.into(), + }; + let an = diesel::insert_into( + bgp_announcement_dsl::bgp_announcement, + ) + .values(an.clone()) + .returning(BgpAnnouncement::as_returning()) + .get_result_async::(&conn) + .await?; + db_annoucements.push(an); + } + + Ok((db_as, db_annoucements)) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn bgp_delete_announce_set( @@ -295,57 +319,67 @@ impl DataStore { enum BgpAnnounceSetDeleteError { AnnounceSetInUse, } - type TxnError = TransactionError; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let name_or_id = sel.name_or_id.clone(); - pool.transaction_async(|conn| async move { - let id: Uuid = match name_or_id { - NameOrId::Name(name) => { - announce_set_dsl::bgp_announce_set - .filter(bgp_announce_set::name.eq(name.to_string())) - .select(bgp_announce_set::id) - .limit(1) - .first_async::(&conn) - .await? - } - NameOrId::Id(id) => id, - }; - - let count = bgp_config_dsl::bgp_config - .filter(bgp_config::bgp_announce_set_id.eq(id)) - .count() - .execute_async(&conn) - .await?; - - if count > 0 { - return Err(TxnError::CustomError( - BgpAnnounceSetDeleteError::AnnounceSetInUse, - )); - } + let err = OptionalError::new(); + self.transaction_retry_wrapper("bgp_delete_announce_set") + .transaction(&conn, |conn| { + let err = err.clone(); + let name_or_id = name_or_id.clone(); + async move { + let id: Uuid = match name_or_id { + NameOrId::Name(name) => { + announce_set_dsl::bgp_announce_set + .filter( + bgp_announce_set::name.eq(name.to_string()), + ) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await? + } + NameOrId::Id(id) => id, + }; + + let count = bgp_config_dsl::bgp_config + .filter(bgp_config::bgp_announce_set_id.eq(id)) + .count() + .execute_async(&conn) + .await?; - diesel::update(announce_set_dsl::bgp_announce_set) - .filter(announce_set_dsl::id.eq(id)) - .set(announce_set_dsl::time_deleted.eq(Utc::now())) - .execute_async(&conn) - .await?; + if count > 0 { + return Err(err.bail( + BgpAnnounceSetDeleteError::AnnounceSetInUse, + )); + } - diesel::delete(bgp_announcement_dsl::bgp_announcement) - .filter(bgp_announcement_dsl::announce_set_id.eq(id)) - .execute_async(&conn) - .await?; + diesel::update(announce_set_dsl::bgp_announce_set) + .filter(announce_set_dsl::id.eq(id)) + .set(announce_set_dsl::time_deleted.eq(Utc::now())) + .execute_async(&conn) + .await?; - Ok(()) - }) - .await - .map_err(|e| match e { - TxnError::CustomError( - BgpAnnounceSetDeleteError::AnnounceSetInUse, - ) => Error::invalid_request("BGP announce set in use"), - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + diesel::delete(bgp_announcement_dsl::bgp_announcement) + .filter(bgp_announcement_dsl::announce_set_id.eq(id)) + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + BgpAnnounceSetDeleteError::AnnounceSetInUse => { + Error::invalid_request("BGP announce set in use") + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } } diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 39a70f7a1e..e579bb8476 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -8,10 +8,7 @@ use super::DataStore; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::TransactionError; -use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, -}; +use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use camino::{Utf8Path, Utf8PathBuf}; use chrono::Utc; use diesel::prelude::*; @@ -415,30 +412,30 @@ impl DataStore { target: &SemverVersion, sql: &String, ) -> Result<(), Error> { - let result = self.pool_connection_unauthorized().await?.transaction_async(|conn| async move { - if target.to_string() != EARLIEST_SUPPORTED_VERSION { - let validate_version_query = format!("SELECT CAST(\ - IF(\ - (\ - SELECT version = '{current}' and target_version = '{target}'\ - FROM omicron.public.db_metadata WHERE singleton = true\ - ),\ - 'true',\ - 'Invalid starting version for schema change'\ - ) AS BOOL\ - );"); - conn.batch_execute_async(&validate_version_query).await?; - } - conn.batch_execute_async(&sql).await?; - Ok::<_, TransactionError<()>>(()) - }).await; + let conn = self.pool_connection_unauthorized().await?; + + let result = self.transaction_retry_wrapper("apply_schema_update") + .transaction(&conn, |conn| async move { + if target.to_string() != EARLIEST_SUPPORTED_VERSION { + let validate_version_query = format!("SELECT CAST(\ + IF(\ + (\ + SELECT version = '{current}' and target_version = '{target}'\ + FROM omicron.public.db_metadata WHERE singleton = true\ + ),\ + 'true',\ + 'Invalid starting version for schema change'\ + ) AS BOOL\ + );"); + conn.batch_execute_async(&validate_version_query).await?; + } + conn.batch_execute_async(&sql).await?; + Ok(()) + }).await; match result { Ok(()) => Ok(()), - Err(TransactionError::CustomError(())) => panic!("No custom error"), - Err(TransactionError::Database(e)) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) - } + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), } } diff --git a/nexus/db-queries/src/db/datastore/device_auth.rs b/nexus/db-queries/src/db/datastore/device_auth.rs index e1facb43f6..8d8e09744c 100644 --- a/nexus/db-queries/src/db/datastore/device_auth.rs +++ b/nexus/db-queries/src/db/datastore/device_auth.rs @@ -10,10 +10,8 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::DeviceAccessToken; use crate::db::model::DeviceAuthRequest; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use omicron_common::api::external::CreateResult; @@ -75,35 +73,40 @@ impl DataStore { RequestNotFound, TooManyRequests, } - type TxnError = TransactionError; - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - match delete_request.execute_async(&conn).await? { - 0 => { - Err(TxnError::CustomError(TokenGrantError::RequestNotFound)) + let err = crate::transaction_retry::OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("device_access_token_create") + .transaction(&conn, |conn| { + let err = err.clone(); + let insert_token = insert_token.clone(); + let delete_request = delete_request.clone(); + async move { + match delete_request.execute_async(&conn).await? { + 0 => Err(err.bail(TokenGrantError::RequestNotFound)), + 1 => Ok(insert_token.get_result_async(&conn).await?), + _ => Err(err.bail(TokenGrantError::TooManyRequests)), } - 1 => Ok(insert_token.get_result_async(&conn).await?), - _ => Err(TxnError::CustomError( - TokenGrantError::TooManyRequests, - )), } }) .await - .map_err(|e| match e { - TxnError::CustomError(TokenGrantError::RequestNotFound) => { - Error::ObjectNotFound { - type_name: ResourceType::DeviceAuthRequest, - lookup_type: LookupType::ByCompositeId( - authz_request.id(), - ), + .map_err(|e| { + if let Some(err) = err.take() { + match err { + TokenGrantError::RequestNotFound => { + Error::ObjectNotFound { + type_name: ResourceType::DeviceAuthRequest, + lookup_type: LookupType::ByCompositeId( + authz_request.id(), + ), + } + } + TokenGrantError::TooManyRequests => { + Error::internal_error("unexpectedly found multiple device auth requests for the same user code") + } } - } - TxnError::CustomError(TokenGrantError::TooManyRequests) => { - Error::internal_error("unexpectedly found multiple device auth requests for the same user code") - } - TxnError::Database(e) => { + } else { public_error_from_diesel(e, ErrorHandler::Server) } }) diff --git a/nexus/db-queries/src/db/datastore/dns.rs b/nexus/db-queries/src/db/datastore/dns.rs index f7ad97593e..cfd25d6a4f 100644 --- a/nexus/db-queries/src/db/datastore/dns.rs +++ b/nexus/db-queries/src/db/datastore/dns.rs @@ -67,7 +67,9 @@ impl DataStore { dns_group: DnsGroup, ) -> ListResultVec { let conn = self.pool_connection_authorized(opctx).await?; - self.dns_zones_list_all_on_connection(opctx, &conn, dns_group).await + Ok(self + .dns_zones_list_all_on_connection(opctx, &conn, dns_group) + .await?) } /// Variant of [`Self::dns_zones_list_all`] which may be called from a @@ -77,7 +79,7 @@ impl DataStore { opctx: &OpContext, conn: &async_bb8_diesel::Connection, dns_group: DnsGroup, - ) -> ListResultVec { + ) -> Result, TransactionError> { use db::schema::dns_zone::dsl; const LIMIT: usize = 5; @@ -88,8 +90,7 @@ impl DataStore { .limit(i64::try_from(LIMIT).unwrap()) .select(DnsZone::as_select()) .load_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; bail_unless!( list.len() < LIMIT, @@ -106,12 +107,14 @@ impl DataStore { opctx: &OpContext, dns_group: DnsGroup, ) -> LookupResult { - self.dns_group_latest_version_conn( - opctx, - &*self.pool_connection_authorized(opctx).await?, - dns_group, - ) - .await + let version = self + .dns_group_latest_version_conn( + opctx, + &*self.pool_connection_authorized(opctx).await?, + dns_group, + ) + .await?; + Ok(version) } pub async fn dns_group_latest_version_conn( @@ -119,7 +122,7 @@ impl DataStore { opctx: &OpContext, conn: &async_bb8_diesel::Connection, dns_group: DnsGroup, - ) -> LookupResult { + ) -> Result> { opctx.authorize(authz::Action::Read, &authz::DNS_CONFIG).await?; use db::schema::dns_version::dsl; let versions = dsl::dns_version @@ -128,8 +131,7 @@ impl DataStore { .limit(1) .select(DnsVersion::as_select()) .load_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; bail_unless!( versions.len() == 1, @@ -377,28 +379,17 @@ impl DataStore { opctx: &OpContext, conn: &async_bb8_diesel::Connection, update: DnsVersionUpdateBuilder, - ) -> Result<(), Error> { + ) -> Result<(), TransactionError> { opctx.authorize(authz::Action::Modify, &authz::DNS_CONFIG).await?; let zones = self .dns_zones_list_all_on_connection(opctx, conn, update.dns_group) .await?; - let result = conn - .transaction_async(|c| async move { - self.dns_update_internal(opctx, &c, update, zones) - .await - .map_err(TransactionError::CustomError) - }) - .await; - - match result { - Ok(()) => Ok(()), - Err(TransactionError::CustomError(e)) => Err(e), - Err(TransactionError::Database(e)) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) - } - } + conn.transaction_async(|c| async move { + self.dns_update_internal(opctx, &c, update, zones).await + }) + .await } // This must only be used inside a transaction. Otherwise, it may make @@ -409,7 +400,7 @@ impl DataStore { conn: &async_bb8_diesel::Connection, update: DnsVersionUpdateBuilder, zones: Vec, - ) -> Result<(), Error> { + ) -> Result<(), TransactionError> { // TODO-scalability TODO-performance This would be much better as a CTE // for all the usual reasons described in RFD 192. Using an interactive // transaction here means that either we wind up holding database locks @@ -455,10 +446,7 @@ impl DataStore { diesel::insert_into(dsl::dns_version) .values(new_version) .execute_async(conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + .await?; } { @@ -480,8 +468,7 @@ impl DataStore { ) .set(dsl::version_removed.eq(new_version_num)) .execute_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; bail_unless!( nremoved == ntoremove, @@ -495,10 +482,7 @@ impl DataStore { let nadded = diesel::insert_into(dsl::dns_name) .values(new_names) .execute_async(conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + .await?; bail_unless!( nadded == ntoadd, @@ -1684,6 +1668,10 @@ mod test { let conn = datastore.pool_connection_for_tests().await.unwrap(); let error = datastore.dns_update(&opctx, &conn, update).await.unwrap_err(); + let error = match error { + TransactionError::CustomError(err) => err, + _ => panic!("Unexpected error: {:?}", error), + }; assert_eq!( error.to_string(), "Internal Error: updated wrong number of dns_name \ @@ -1707,11 +1695,15 @@ mod test { update.add_name(String::from("n2"), records1.clone()).unwrap(); let conn = datastore.pool_connection_for_tests().await.unwrap(); - let error = - datastore.dns_update(&opctx, &conn, update).await.unwrap_err(); + let error = Error::from( + datastore.dns_update(&opctx, &conn, update).await.unwrap_err(), + ); let msg = error.to_string(); - assert!(msg.starts_with("Internal Error: ")); - assert!(msg.contains("violates unique constraint")); + assert!(msg.starts_with("Internal Error: "), "Message: {msg:}"); + assert!( + msg.contains("violates unique constraint"), + "Message: {msg:}" + ); } let dns_config = datastore diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index e663130a84..4e34bfc15c 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -10,7 +10,9 @@ use crate::authz::ApiResource; use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; +use crate::db::error::TransactionError; use crate::db::lookup::LookupPath; use crate::db::model::ExternalIp; use crate::db::model::IncompleteExternalIp; @@ -132,7 +134,8 @@ impl DataStore { data: IncompleteExternalIp, ) -> CreateResult { let conn = self.pool_connection_authorized(opctx).await?; - Self::allocate_external_ip_on_connection(&conn, data).await + let ip = Self::allocate_external_ip_on_connection(&conn, data).await?; + Ok(ip) } /// Variant of [Self::allocate_external_ip] which may be called from a @@ -140,23 +143,30 @@ impl DataStore { pub(crate) async fn allocate_external_ip_on_connection( conn: &async_bb8_diesel::Connection, data: IncompleteExternalIp, - ) -> CreateResult { + ) -> Result> { let explicit_ip = data.explicit_ip().is_some(); NextExternalIp::new(data).get_result_async(conn).await.map_err(|e| { use diesel::result::Error::NotFound; match e { NotFound => { if explicit_ip { - Error::invalid_request( + TransactionError::CustomError(Error::invalid_request( "Requested external IP address not available", - ) + )) } else { - Error::invalid_request( + TransactionError::CustomError(Error::invalid_request( "No external IP addresses available", - ) + )) + } + } + _ => { + if retryable(&e) { + return TransactionError::Database(e); } + TransactionError::CustomError( + crate::db::queries::external_ip::from_diesel(e), + ) } - _ => crate::db::queries::external_ip::from_diesel(e), } }) } diff --git a/nexus/db-queries/src/db/datastore/identity_provider.rs b/nexus/db-queries/src/db/datastore/identity_provider.rs index fdc9a020e7..cee577acd6 100644 --- a/nexus/db-queries/src/db/datastore/identity_provider.rs +++ b/nexus/db-queries/src/db/datastore/identity_provider.rs @@ -14,7 +14,6 @@ use crate::db::identity::Resource; use crate::db::model::IdentityProvider; use crate::db::model::Name; use crate::db::pagination::paginated; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -63,36 +62,47 @@ impl DataStore { assert_eq!(provider.silo_id, authz_idp_list.silo().id()); let name = provider.identity().name.to_string(); - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - // insert silo identity provider record with type Saml - use db::schema::identity_provider::dsl as idp_dsl; - diesel::insert_into(idp_dsl::identity_provider) - .values(db::model::IdentityProvider { - identity: db::model::IdentityProviderIdentity { - id: provider.identity.id, - name: provider.identity.name.clone(), - description: provider.identity.description.clone(), - time_created: provider.identity.time_created, - time_modified: provider.identity.time_modified, - time_deleted: provider.identity.time_deleted, - }, - silo_id: provider.silo_id, - provider_type: db::model::IdentityProviderType::Saml, - }) - .execute_async(&conn) - .await?; + let conn = self.pool_connection_authorized(opctx).await?; - // insert silo saml identity provider record - use db::schema::saml_identity_provider::dsl; - let result = diesel::insert_into(dsl::saml_identity_provider) - .values(provider) - .returning(db::model::SamlIdentityProvider::as_returning()) - .get_result_async(&conn) - .await?; + self.transaction_retry_wrapper("saml_identity_provider_create") + .transaction(&conn, |conn| { + let provider = provider.clone(); + async move { + // insert silo identity provider record with type Saml + use db::schema::identity_provider::dsl as idp_dsl; + diesel::insert_into(idp_dsl::identity_provider) + .values(db::model::IdentityProvider { + identity: db::model::IdentityProviderIdentity { + id: provider.identity.id, + name: provider.identity.name.clone(), + description: provider + .identity + .description + .clone(), + time_created: provider.identity.time_created, + time_modified: provider.identity.time_modified, + time_deleted: provider.identity.time_deleted, + }, + silo_id: provider.silo_id, + provider_type: + db::model::IdentityProviderType::Saml, + }) + .execute_async(&conn) + .await?; - Ok(result) + // insert silo saml identity provider record + use db::schema::saml_identity_provider::dsl; + let result = + diesel::insert_into(dsl::saml_identity_provider) + .values(provider) + .returning( + db::model::SamlIdentityProvider::as_returning(), + ) + .get_result_async(&conn) + .await?; + + Ok(result) + } }) .await .map_err(|e| { diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 44cd7a95b7..2e7f9da5b7 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -148,6 +148,7 @@ pub type DataStoreConnection<'a> = pub struct DataStore { pool: Arc, virtual_provisioning_collection_producer: crate::provisioning::Producer, + transaction_retry_producer: crate::transaction_retry::Producer, } // The majority of `DataStore`'s methods live in our submodules as a concession @@ -164,6 +165,8 @@ impl DataStore { pool, virtual_provisioning_collection_producer: crate::provisioning::Producer::new(), + transaction_retry_producer: crate::transaction_retry::Producer::new( + ), }; Ok(datastore) } @@ -210,6 +213,29 @@ impl DataStore { self.virtual_provisioning_collection_producer.clone(), ) .unwrap(); + registry + .register_producer(self.transaction_retry_producer.clone()) + .unwrap(); + } + + /// Constructs a transaction retry helper + /// + /// Automatically wraps the underlying producer + pub fn transaction_retry_wrapper( + &self, + name: &'static str, + ) -> crate::transaction_retry::RetryHelper { + crate::transaction_retry::RetryHelper::new( + &self.transaction_retry_producer, + name, + ) + } + + #[cfg(test)] + pub(crate) fn transaction_retry_producer( + &self, + ) -> &crate::transaction_retry::Producer { + &self.transaction_retry_producer } /// Returns a connection to a connection from the database connection pool. diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index 06550e9439..4d4e43c9a7 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -13,7 +13,6 @@ use crate::db::collection_insert::DatastoreCollection; use crate::db::cte_utils::BoxedQuery; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::IncompleteNetworkInterface; use crate::db::model::Instance; use crate::db::model::InstanceNetworkInterface; @@ -25,7 +24,7 @@ use crate::db::model::VpcSubnet; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; use crate::db::queries::network_interface; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -466,77 +465,91 @@ impl DataStore { InstanceNotStopped, FailedToUnsetPrimary(DieselError), } - type TxnError = TransactionError; + + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; if primary { - conn.transaction_async(|conn| async move { - let instance_runtime = - instance_query.get_result_async(&conn).await?.runtime_state; - if instance_runtime.propolis_id.is_some() - || instance_runtime.nexus_state != stopped - { - return Err(TxnError::CustomError( - NetworkInterfaceUpdateError::InstanceNotStopped, - )); - } + self.transaction_retry_wrapper("instance_update_network_interface") + .transaction(&conn, |conn| { + let err = err.clone(); + let stopped = stopped.clone(); + let update_target_query = update_target_query.clone(); + async move { + let instance_runtime = + instance_query.get_result_async(&conn).await?.runtime_state; + if instance_runtime.propolis_id.is_some() + || instance_runtime.nexus_state != stopped + { + return Err(err.bail(NetworkInterfaceUpdateError::InstanceNotStopped)); + } - // First, get the primary interface - let primary_interface = - find_primary_query.get_result_async(&conn).await?; - // If the target and primary are different, we need to toggle - // the primary into a secondary. - if primary_interface.identity.id != interface_id { - use crate::db::schema::network_interface::dsl; - if let Err(e) = diesel::update(dsl::network_interface) - .filter(dsl::id.eq(primary_interface.identity.id)) - .filter(dsl::kind.eq(NetworkInterfaceKind::Instance)) - .filter(dsl::time_deleted.is_null()) - .set(dsl::is_primary.eq(false)) - .execute_async(&conn) - .await - { - return Err(TxnError::CustomError( - NetworkInterfaceUpdateError::FailedToUnsetPrimary( - e, - ), - )); - } - } + // First, get the primary interface + let primary_interface = + find_primary_query.get_result_async(&conn).await?; + // If the target and primary are different, we need to toggle + // the primary into a secondary. + if primary_interface.identity.id != interface_id { + use crate::db::schema::network_interface::dsl; + if let Err(e) = diesel::update(dsl::network_interface) + .filter(dsl::id.eq(primary_interface.identity.id)) + .filter(dsl::kind.eq(NetworkInterfaceKind::Instance)) + .filter(dsl::time_deleted.is_null()) + .set(dsl::is_primary.eq(false)) + .execute_async(&conn) + .await + { + return Err(err.bail_retryable_or_else( + e, + |e| NetworkInterfaceUpdateError::FailedToUnsetPrimary(e) + )); + } + } - // In any case, update the actual target - Ok(update_target_query.get_result_async(&conn).await?) - }) + // In any case, update the actual target + update_target_query.get_result_async(&conn).await + } + }).await } else { // In this case, we can just directly apply the updates. By // construction, `updates.primary` is `None`, so nothing will // be done there. The other columns always need to be updated, and // we're only hitting a single row. Note that we still need to // verify the instance is stopped. - conn.transaction_async(|conn| async move { - let instance_state = - instance_query.get_result_async(&conn).await?.runtime_state; - if instance_state.propolis_id.is_some() - || instance_state.nexus_state != stopped - { - return Err(TxnError::CustomError( - NetworkInterfaceUpdateError::InstanceNotStopped, - )); - } - Ok(update_target_query.get_result_async(&conn).await?) - }) + self.transaction_retry_wrapper("instance_update_network_interface") + .transaction(&conn, |conn| { + let err = err.clone(); + let stopped = stopped.clone(); + let update_target_query = update_target_query.clone(); + async move { + let instance_state = + instance_query.get_result_async(&conn).await?.runtime_state; + if instance_state.propolis_id.is_some() + || instance_state.nexus_state != stopped + { + return Err(err.bail(NetworkInterfaceUpdateError::InstanceNotStopped)); + } + update_target_query.get_result_async(&conn).await + } + }).await } - .await // Convert to `InstanceNetworkInterface` before returning, we know // this is valid as we've filtered appropriately above. .map(NetworkInterface::as_instance) - .map_err(|e| match e { - TxnError::CustomError( - NetworkInterfaceUpdateError::InstanceNotStopped, - ) => Error::invalid_request( - "Instance must be stopped to update its network interfaces", - ), - _ => Error::internal_error(&format!("Transaction error: {:?}", e)), + .map_err(|e| { + if let Some(err) = err.take() { + match err { + NetworkInterfaceUpdateError::InstanceNotStopped => { + return Error::invalid_request( + "Instance must be stopped to update its network interfaces", + ); + }, + NetworkInterfaceUpdateError::FailedToUnsetPrimary(err) => { + return public_error_from_diesel(err, ErrorHandler::Server); + }, + } + } + public_error_from_diesel(e, ErrorHandler::Server) }) } } diff --git a/nexus/db-queries/src/db/datastore/project.rs b/nexus/db-queries/src/db/datastore/project.rs index c447b5bf98..ba0c64abfd 100644 --- a/nexus/db-queries/src/db/datastore/project.rs +++ b/nexus/db-queries/src/db/datastore/project.rs @@ -13,7 +13,6 @@ use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::fixed_data::project::SERVICES_PROJECT; use crate::db::fixed_data::silo::INTERNAL_SILO_ID; use crate::db::identity::Resource; @@ -24,7 +23,8 @@ use crate::db::model::ProjectUpdate; use crate::db::model::Silo; use crate::db::model::VirtualProvisioningCollection; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -151,49 +151,62 @@ impl DataStore { use db::schema::project::dsl; + let err = OptionalError::new(); let name = project.name().as_str().to_string(); + let conn = self.pool_connection_authorized(opctx).await?; + let db_project = self - .pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - let project: Project = Silo::insert_resource( - silo_id, - diesel::insert_into(dsl::project).values(project), - ) - .insert_and_get_result_async(&conn) - .await - .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => { - authz_silo_inner.not_found() - } - AsyncInsertError::DatabaseError(e) => { - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Project, - &name, + .transaction_retry_wrapper("project_create_in_silo") + .transaction(&conn, |conn| { + let err = err.clone(); + + let authz_silo_inner = authz_silo_inner.clone(); + let name = name.clone(); + let project = project.clone(); + async move { + let project: Project = Silo::insert_resource( + silo_id, + diesel::insert_into(dsl::project).values(project), + ) + .insert_and_get_result_async(&conn) + .await + .map_err(|e| match e { + AsyncInsertError::CollectionNotFound => { + err.bail(authz_silo_inner.not_found()) + } + AsyncInsertError::DatabaseError(diesel_error) => err + .bail_retryable_or_else( + diesel_error, + |diesel_error| { + public_error_from_diesel( + diesel_error, + ErrorHandler::Conflict( + ResourceType::Project, + &name, + ), + ) + }, ), - ) - } - })?; - - // Create resource provisioning for the project. - self.virtual_provisioning_collection_create_on_connection( - &conn, - VirtualProvisioningCollection::new( - project.id(), - CollectionTypeProvisioned::Project, - ), - ) - .await?; - Ok(project) + })?; + + // Create resource provisioning for the project. + self.virtual_provisioning_collection_create_on_connection( + &conn, + VirtualProvisioningCollection::new( + project.id(), + CollectionTypeProvisioned::Project, + ), + ) + .await?; + Ok(project) + } }) .await - .map_err(|e| match e { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = err.take() { + return err; } + public_error_from_diesel(e, ErrorHandler::Server) })?; Ok(( @@ -230,47 +243,56 @@ impl DataStore { use db::schema::project::dsl; - type TxnError = TransactionError; - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - let now = Utc::now(); - let updated_rows = diesel::update(dsl::project) - .filter(dsl::time_deleted.is_null()) - .filter(dsl::id.eq(authz_project.id())) - .filter(dsl::rcgen.eq(db_project.rcgen)) - .set(dsl::time_deleted.eq(now)) - .returning(Project::as_returning()) - .execute_async(&conn) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_project), - ) - })?; + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("project_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let now = Utc::now(); + let updated_rows = diesel::update(dsl::project) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(authz_project.id())) + .filter(dsl::rcgen.eq(db_project.rcgen)) + .set(dsl::time_deleted.eq(now)) + .returning(Project::as_returning()) + .execute_async(&conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByResource( + authz_project, + ), + ) + }) + })?; + + if updated_rows == 0 { + return Err(err.bail(Error::InvalidRequest { + message: + "deletion failed due to concurrent modification" + .to_string(), + })); + } - if updated_rows == 0 { - return Err(TxnError::CustomError(Error::InvalidRequest { - message: - "deletion failed due to concurrent modification" - .to_string(), - })); + self.virtual_provisioning_collection_delete_on_connection( + &opctx.log, + &conn, + db_project.id(), + ) + .await?; + Ok(()) } - - self.virtual_provisioning_collection_delete_on_connection( - &conn, - db_project.id(), - ) - .await?; - Ok(()) }) .await - .map_err(|e| match e { - TxnError::CustomError(e) => e, - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = err.take() { + return err; } + public_error_from_diesel(e, ErrorHandler::Server) })?; Ok(()) } diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index e11377f11a..a69386cfd0 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -13,8 +13,9 @@ use crate::db; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; +use crate::db::error::MaybeRetryable::*; use crate::db::fixed_data::silo::INTERNAL_SILO_ID; use crate::db::fixed_data::vpc_subnet::DNS_VPC_SUBNET; use crate::db::fixed_data::vpc_subnet::NEXUS_VPC_SUBNET; @@ -26,6 +27,7 @@ use crate::db::model::Rack; use crate::db::model::Zpool; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -58,6 +60,7 @@ use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use omicron_common::bail_unless; use std::net::IpAddr; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; /// Groups arguments related to rack initialization @@ -88,18 +91,30 @@ enum RackInitError { DnsSerialization(Error), Silo(Error), RoleAssignment(Error), + // Retryable database error + Retryable(DieselError), + // Other non-retryable database error + Database(DieselError), } -type TxnError = TransactionError; -impl From for Error { - fn from(e: TxnError) -> Self { +// Catch-all for Diesel error conversion into RackInitError, which +// can also label errors as retryable. +impl From for RackInitError { + fn from(e: DieselError) -> Self { + if retryable(&e) { + Self::Retryable(e) + } else { + Self::Database(e) + } + } +} + +impl From for Error { + fn from(e: RackInitError) -> Self { match e { - TxnError::CustomError(RackInitError::AddingIp(err)) => err, - TxnError::CustomError(RackInitError::AddingNic(err)) => err, - TxnError::CustomError(RackInitError::DatasetInsert { - err, - zpool_id, - }) => match err { + RackInitError::AddingIp(err) => err, + RackInitError::AddingNic(err) => err, + RackInitError::DatasetInsert { err, zpool_id } => match err { AsyncInsertError::CollectionNotFound => Error::ObjectNotFound { type_name: ResourceType::Zpool, lookup_type: LookupType::ById(zpool_id), @@ -108,43 +123,36 @@ impl From for Error { public_error_from_diesel(e, ErrorHandler::Server) } }, - TxnError::CustomError(RackInitError::ServiceInsert(err)) => { - Error::internal_error(&format!( - "failed to insert Service record: {:#}", - err - )) - } - TxnError::CustomError(RackInitError::RackUpdate { - err, - rack_id, - }) => public_error_from_diesel( - err, - ErrorHandler::NotFoundByLookup( - ResourceType::Rack, - LookupType::ById(rack_id), - ), + RackInitError::ServiceInsert(err) => Error::internal_error( + &format!("failed to insert Service record: {:#}", err), ), - TxnError::CustomError(RackInitError::DnsSerialization(err)) => { - Error::internal_error(&format!( - "failed to serialize initial DNS records: {:#}", - err - )) - } - TxnError::CustomError(RackInitError::Silo(err)) => { - Error::internal_error(&format!( - "failed to create recovery Silo: {:#}", - err - )) - } - TxnError::CustomError(RackInitError::RoleAssignment(err)) => { - Error::internal_error(&format!( - "failed to assign role to initial user: {:#}", - err - )) - } - TxnError::Database(e) => { - Error::internal_error(&format!("Transaction error: {}", e)) + RackInitError::RackUpdate { err, rack_id } => { + public_error_from_diesel( + err, + ErrorHandler::NotFoundByLookup( + ResourceType::Rack, + LookupType::ById(rack_id), + ), + ) } + RackInitError::DnsSerialization(err) => Error::internal_error( + &format!("failed to serialize initial DNS records: {:#}", err), + ), + RackInitError::Silo(err) => Error::internal_error(&format!( + "failed to create recovery Silo: {:#}", + err + )), + RackInitError::RoleAssignment(err) => Error::internal_error( + &format!("failed to assign role to initial user: {:#}", err), + ), + RackInitError::Retryable(err) => Error::internal_error(&format!( + "failed operation due to database contention: {:#}", + err + )), + RackInitError::Database(err) => Error::internal_error(&format!( + "failed operation due to database error: {:#}", + err + )), } } } @@ -336,9 +344,6 @@ impl DataStore { Ok(()) } - // The following methods which return a `TxnError` take a `conn` parameter - // which comes from the transaction created in `rack_set_initialized`. - #[allow(clippy::too_many_arguments)] async fn rack_create_recovery_silo( &self, @@ -350,7 +355,7 @@ impl DataStore { recovery_user_id: external_params::UserId, recovery_user_password_hash: omicron_passwords::PasswordHashString, dns_update: DnsVersionUpdateBuilder, - ) -> Result<(), TxnError> { + ) -> Result<(), RackInitError> { let db_silo = self .silo_create_conn( conn, @@ -361,8 +366,10 @@ impl DataStore { dns_update, ) .await - .map_err(RackInitError::Silo) - .map_err(TxnError::CustomError)?; + .map_err(|err| match err.retryable() { + NotRetryable(err) => RackInitError::Silo(err.into()), + Retryable(err) => RackInitError::Retryable(err), + })?; info!(log, "Created recovery silo"); // Create the first user in the initial Recovery Silo @@ -416,8 +423,7 @@ impl DataStore { }], ) .await - .map_err(RackInitError::RoleAssignment) - .map_err(TxnError::CustomError)?; + .map_err(RackInitError::RoleAssignment)?; debug!(log, "Generated role assignment queries"); q1.execute_async(conn).await?; @@ -433,7 +439,7 @@ impl DataStore { log: &slog::Logger, service_pool: &db::model::IpPool, service: internal_params::ServicePutRequest, - ) -> Result<(), TxnError> { + ) -> Result<(), RackInitError> { use internal_params::ServiceKind; let service_db = db::model::Service::new( @@ -443,9 +449,12 @@ impl DataStore { service.address, service.kind.clone().into(), ); - self.service_upsert_conn(conn, service_db).await.map_err(|e| { - TxnError::CustomError(RackInitError::ServiceInsert(e)) - })?; + self.service_upsert_conn(conn, service_db).await.map_err( + |e| match e.retryable() { + Retryable(e) => RackInitError::Retryable(e), + NotRetryable(e) => RackInitError::ServiceInsert(e.into()), + }, + )?; // For services with external connectivity, we record their // explicit IP allocation and create a service NIC as well. @@ -476,9 +485,7 @@ impl DataStore { Some(nic.ip), Some(nic.mac), ) - .map_err(|e| { - TxnError::CustomError(RackInitError::AddingNic(e)) - })?; + .map_err(|e| RackInitError::AddingNic(e))?; Some((db_ip, db_nic)) } ServiceKind::BoundaryNtp { snat, ref nic } => { @@ -500,9 +507,7 @@ impl DataStore { Some(nic.ip), Some(nic.mac), ) - .map_err(|e| { - TxnError::CustomError(RackInitError::AddingNic(e)) - })?; + .map_err(|e| RackInitError::AddingNic(e))?; Some((db_ip, db_nic)) } _ => None, @@ -517,7 +522,10 @@ impl DataStore { IP address for {}", service.kind, ); - TxnError::CustomError(RackInitError::AddingIp(err)) + match err.retryable() { + Retryable(e) => RackInitError::Retryable(e), + NotRetryable(e) => RackInitError::AddingIp(e.into()), + } })?; self.create_network_interface_raw_conn(conn, db_nic) @@ -530,9 +538,10 @@ impl DataStore { _, db::model::NetworkInterfaceKind::Service, ) => Ok(()), - _ => Err(TxnError::CustomError( - RackInitError::AddingNic(e.into_external()), - )), + InsertError::Retryable(err) => { + Err(RackInitError::Retryable(err)) + } + _ => Err(RackInitError::AddingNic(e.into_external())), } })?; } @@ -551,146 +560,187 @@ impl DataStore { opctx.authorize(authz::Action::CreateChild, &authz::FLEET).await?; - let rack_id = rack_init.rack_id; - let services = rack_init.services; - let datasets = rack_init.datasets; - let service_ip_pool_ranges = rack_init.service_ip_pool_ranges; - let internal_dns = rack_init.internal_dns; - let external_dns = rack_init.external_dns; - let (authz_service_pool, service_pool) = self.ip_pools_service_lookup(&opctx).await?; // NOTE: This operation could likely be optimized with a CTE, but given // the low-frequency of calls, this optimization has been deferred. let log = opctx.log.clone(); + let err = Arc::new(OnceLock::new()); + + // NOTE: This transaction cannot yet be made retryable, as it uses + // nested transactions. let rack = self .pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - // Early exit if the rack has already been initialized. - let rack = rack_dsl::rack - .filter(rack_dsl::id.eq(rack_id)) - .select(Rack::as_select()) - .get_result_async(&conn) - .await - .map_err(|e| { - warn!(log, "Initializing Rack: Rack UUID not found"); - TxnError::CustomError(RackInitError::RackUpdate { - err: e, - rack_id, - }) - })?; - if rack.initialized { - info!(log, "Early exit: Rack already initialized"); - return Ok(rack); - } + .transaction_async(|conn| { + let err = err.clone(); + let log = log.clone(); + let authz_service_pool = authz_service_pool.clone(); + let rack_init = rack_init.clone(); + let service_pool = service_pool.clone(); + async move { + let rack_id = rack_init.rack_id; + let services = rack_init.services; + let datasets = rack_init.datasets; + let service_ip_pool_ranges = rack_init.service_ip_pool_ranges; + let internal_dns = rack_init.internal_dns; + let external_dns = rack_init.external_dns; + + // Early exit if the rack has already been initialized. + let rack = rack_dsl::rack + .filter(rack_dsl::id.eq(rack_id)) + .select(Rack::as_select()) + .get_result_async(&conn) + .await + .map_err(|e| { + warn!(log, "Initializing Rack: Rack UUID not found"); + err.set(RackInitError::RackUpdate { + err: e, + rack_id, + }).unwrap(); + DieselError::RollbackTransaction + })?; + if rack.initialized { + info!(log, "Early exit: Rack already initialized"); + return Ok::<_, DieselError>(rack); + } - // Otherwise, insert services and datasets. + // Otherwise, insert services and datasets. - // Set up the IP pool for internal services. - for range in service_ip_pool_ranges { - Self::ip_pool_add_range_on_connection( - &conn, - opctx, - &authz_service_pool, - &range, - ) - .await - .map_err(|err| { - warn!( - log, - "Initializing Rack: Failed to add IP pool range" - ); - TxnError::CustomError(RackInitError::AddingIp(err)) - })?; - } + // Set up the IP pool for internal services. + for range in service_ip_pool_ranges { + Self::ip_pool_add_range_on_connection( + &conn, + opctx, + &authz_service_pool, + &range, + ) + .await + .map_err(|e| { + warn!( + log, + "Initializing Rack: Failed to add IP pool range" + ); + err.set(RackInitError::AddingIp(e)).unwrap(); + DieselError::RollbackTransaction + })?; + } + + // Allocate records for all services. + for service in services { + self.rack_populate_service_records( + &conn, + &log, + &service_pool, + service, + ) + .await + .map_err(|e| { + err.set(e).unwrap(); + DieselError::RollbackTransaction + })?; + } + info!(log, "Inserted services"); + + for dataset in datasets { + use db::schema::dataset::dsl; + let zpool_id = dataset.pool_id; + >::insert_resource( + zpool_id, + diesel::insert_into(dsl::dataset) + .values(dataset.clone()) + .on_conflict(dsl::id) + .do_update() + .set(( + dsl::time_modified.eq(Utc::now()), + dsl::pool_id.eq(excluded(dsl::pool_id)), + dsl::ip.eq(excluded(dsl::ip)), + dsl::port.eq(excluded(dsl::port)), + dsl::kind.eq(excluded(dsl::kind)), + )), + ) + .insert_and_get_result_async(&conn) + .await + .map_err(|e| { + err.set(RackInitError::DatasetInsert { + err: e, + zpool_id, + }).unwrap(); + DieselError::RollbackTransaction + })?; + } + info!(log, "Inserted datasets"); - // Allocate records for all services. - for service in services { - self.rack_populate_service_records( + // Insert the initial contents of the internal and external DNS + // zones. + Self::load_dns_data(&conn, internal_dns) + .await + .map_err(|e| { + err.set(RackInitError::DnsSerialization(e)).unwrap(); + DieselError::RollbackTransaction + })?; + info!(log, "Populated DNS tables for internal DNS"); + + Self::load_dns_data(&conn, external_dns) + .await + .map_err(|e| { + err.set(RackInitError::DnsSerialization(e)).unwrap(); + DieselError::RollbackTransaction + })?; + info!(log, "Populated DNS tables for external DNS"); + + // Create the initial Recovery Silo + self.rack_create_recovery_silo( + &opctx, &conn, &log, - &service_pool, - service, - ) - .await?; - } - info!(log, "Inserted services"); - - for dataset in datasets { - use db::schema::dataset::dsl; - let zpool_id = dataset.pool_id; - >::insert_resource( - zpool_id, - diesel::insert_into(dsl::dataset) - .values(dataset.clone()) - .on_conflict(dsl::id) - .do_update() - .set(( - dsl::time_modified.eq(Utc::now()), - dsl::pool_id.eq(excluded(dsl::pool_id)), - dsl::ip.eq(excluded(dsl::ip)), - dsl::port.eq(excluded(dsl::port)), - dsl::kind.eq(excluded(dsl::kind)), - )), + rack_init.recovery_silo, + rack_init.recovery_silo_fq_dns_name, + rack_init.recovery_user_id, + rack_init.recovery_user_password_hash, + rack_init.dns_update, ) - .insert_and_get_result_async(&conn) .await - .map_err(|err| { - TxnError::CustomError(RackInitError::DatasetInsert { - err, - zpool_id, - }) + .map_err(|e| match e { + RackInitError::Retryable(e) => e, + _ => { + err.set(e).unwrap(); + DieselError::RollbackTransaction + }, })?; - } - info!(log, "Inserted datasets"); - - // Insert the initial contents of the internal and external DNS - // zones. - Self::load_dns_data(&conn, internal_dns) - .await - .map_err(RackInitError::DnsSerialization) - .map_err(TxnError::CustomError)?; - info!(log, "Populated DNS tables for internal DNS"); - Self::load_dns_data(&conn, external_dns) - .await - .map_err(RackInitError::DnsSerialization) - .map_err(TxnError::CustomError)?; - info!(log, "Populated DNS tables for external DNS"); - - // Create the initial Recovery Silo - self.rack_create_recovery_silo( - &opctx, - &conn, - &log, - rack_init.recovery_silo, - rack_init.recovery_silo_fq_dns_name, - rack_init.recovery_user_id, - rack_init.recovery_user_password_hash, - rack_init.dns_update, - ) - .await?; - - let rack = diesel::update(rack_dsl::rack) - .filter(rack_dsl::id.eq(rack_id)) - .set(( - rack_dsl::initialized.eq(true), - rack_dsl::time_modified.eq(Utc::now()), - )) - .returning(Rack::as_returning()) - .get_result_async::(&conn) - .await - .map_err(|err| { - TxnError::CustomError(RackInitError::RackUpdate { - err, - rack_id, - }) - })?; - Ok::<_, TxnError>(rack) - }) - .await?; + let rack = diesel::update(rack_dsl::rack) + .filter(rack_dsl::id.eq(rack_id)) + .set(( + rack_dsl::initialized.eq(true), + rack_dsl::time_modified.eq(Utc::now()), + )) + .returning(Rack::as_returning()) + .get_result_async::(&conn) + .await + .map_err(|e| { + if retryable(&e) { + return e; + } + err.set(RackInitError::RackUpdate { + err: e, + rack_id, + }).unwrap(); + DieselError::RollbackTransaction + })?; + Ok(rack) + } + }, + ) + .await + .map_err(|e| { + if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + err.into() + } else { + Error::internal_error(&format!("Transaction error: {}", e)) + } + })?; Ok(rack) } @@ -745,42 +795,54 @@ impl DataStore { use crate::db::schema::external_ip::dsl as extip_dsl; use crate::db::schema::service::dsl as service_dsl; - type TxnError = TransactionError; - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - let ips = extip_dsl::external_ip - .inner_join( - service_dsl::service.on(service_dsl::id - .eq(extip_dsl::parent_id.assume_not_null())), - ) - .filter(extip_dsl::parent_id.is_not_null()) - .filter(extip_dsl::time_deleted.is_null()) - .filter(extip_dsl::is_service) - .filter(service_dsl::kind.eq(db::model::ServiceKind::Nexus)) - .select(ExternalIp::as_select()) - .get_results_async(&conn) - .await? - .into_iter() - .map(|external_ip| external_ip.ip.ip()) - .collect(); - - let dns_zones = self - .dns_zones_list_all_on_connection( - opctx, - &conn, - DnsGroup::External, - ) - .await?; - Ok((ips, dns_zones)) + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("nexus_external_addresses") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let ips = extip_dsl::external_ip + .inner_join( + service_dsl::service.on(service_dsl::id + .eq(extip_dsl::parent_id.assume_not_null())), + ) + .filter(extip_dsl::parent_id.is_not_null()) + .filter(extip_dsl::time_deleted.is_null()) + .filter(extip_dsl::is_service) + .filter( + service_dsl::kind.eq(db::model::ServiceKind::Nexus), + ) + .select(ExternalIp::as_select()) + .get_results_async(&conn) + .await? + .into_iter() + .map(|external_ip| external_ip.ip.ip()) + .collect(); + + let dns_zones = self + .dns_zones_list_all_on_connection( + opctx, + &conn, + DnsGroup::External, + ) + .await + .map_err(|e| match e.retryable() { + NotRetryable(not_retryable_err) => { + err.bail(not_retryable_err) + } + Retryable(retryable_err) => retryable_err, + })?; + + Ok((ips, dns_zones)) + } }) .await - .map_err(|error: TxnError| match error { - TransactionError::CustomError(err) => err, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = err.take() { + return err.into(); } + public_error_from_diesel(e, ErrorHandler::Server) }) } } @@ -1014,14 +1076,16 @@ mod test { async fn [](db: &DataStore) -> Vec<$model> { use crate::db::schema::$table::dsl; use nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL; - db.pool_connection_for_tests() + let conn = db.pool_connection_for_tests() .await - .unwrap() - .transaction_async(|conn| async move { + .unwrap(); + + db.transaction_retry_wrapper(concat!("fn_to_get_all_", stringify!($table))) + .transaction(&conn, |conn| async move { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) .await .unwrap(); - Ok::<_, crate::db::TransactionError<()>>( + Ok( dsl::$table .select($model::as_select()) .get_results_async(&conn) diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 9465fe2792..b055a3e85c 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -10,18 +10,16 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::lookup::LookupPath; use crate::db::model::Dataset; use crate::db::model::Region; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use nexus_types::external_api::params; use omicron_common::api::external; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; -use omicron_common::backoff::{self, BackoffError}; use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; use uuid::Uuid; @@ -152,7 +150,7 @@ impl DataStore { /// Also updates the storage usage on their corresponding datasets. pub async fn regions_hard_delete( &self, - log: &Logger, + _log: &Logger, region_ids: Vec, ) -> DeleteResult { if region_ids.is_empty() { @@ -164,98 +162,79 @@ impl DataStore { #[error("Numeric error: {0}")] NumericError(String), } - type TxnError = TransactionError; - - // Retry this transaction until it succeeds. It's a little heavy in that - // there's a for loop inside that iterates over the datasets the - // argument regions belong to, and it often encounters the "retry - // transaction" error. - let transaction = { - |region_ids: Vec| async { - self.pool_connection_unauthorized() - .await? - .transaction_async(|conn| async move { - use db::schema::dataset::dsl as dataset_dsl; - use db::schema::region::dsl as region_dsl; - - // Remove the regions, collecting datasets they're from. - let datasets = diesel::delete(region_dsl::region) - .filter(region_dsl::id.eq_any(region_ids)) - .returning(region_dsl::dataset_id) - .get_results_async::(&conn).await?; - - // Update datasets to which the regions belonged. - for dataset in datasets { - let dataset_total_occupied_size: Option< - diesel::pg::data_types::PgNumeric, - > = region_dsl::region - .filter(region_dsl::dataset_id.eq(dataset)) - .select(diesel::dsl::sum( - region_dsl::block_size - * region_dsl::blocks_per_extent - * region_dsl::extent_count, - )) - .nullable() - .get_result_async(&conn).await?; - - let dataset_total_occupied_size: i64 = if let Some( - dataset_total_occupied_size, - ) = - dataset_total_occupied_size - { - let dataset_total_occupied_size: db::model::ByteCount = - dataset_total_occupied_size.try_into().map_err( - |e: anyhow::Error| { - TxnError::CustomError( - RegionDeleteError::NumericError( - e.to_string(), - ), - ) - }, - )?; - - dataset_total_occupied_size.into() - } else { - 0 - }; - - diesel::update(dataset_dsl::dataset) - .filter(dataset_dsl::id.eq(dataset)) - .set( - dataset_dsl::size_used - .eq(dataset_total_occupied_size), - ) - .execute_async(&conn).await?; - } - - Ok(()) - }) - .await - .map_err(|e: TxnError| { - if e.retry_transaction() { - BackoffError::transient(Error::internal_error( - &format!("Retryable transaction error {:?}", e) + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("regions_hard_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + let region_ids = region_ids.clone(); + async move { + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl as region_dsl; + + // Remove the regions, collecting datasets they're from. + let datasets = diesel::delete(region_dsl::region) + .filter(region_dsl::id.eq_any(region_ids)) + .returning(region_dsl::dataset_id) + .get_results_async::(&conn).await?; + + // Update datasets to which the regions belonged. + for dataset in datasets { + let dataset_total_occupied_size: Option< + diesel::pg::data_types::PgNumeric, + > = region_dsl::region + .filter(region_dsl::dataset_id.eq(dataset)) + .select(diesel::dsl::sum( + region_dsl::block_size + * region_dsl::blocks_per_extent + * region_dsl::extent_count, )) + .nullable() + .get_result_async(&conn).await?; + + let dataset_total_occupied_size: i64 = if let Some( + dataset_total_occupied_size, + ) = + dataset_total_occupied_size + { + let dataset_total_occupied_size: db::model::ByteCount = + dataset_total_occupied_size.try_into().map_err( + |e: anyhow::Error| { + err.bail(RegionDeleteError::NumericError( + e.to_string(), + )) + }, + )?; + + dataset_total_occupied_size.into() } else { - BackoffError::Permanent(Error::internal_error( - &format!("Transaction error: {}", e) - )) + 0 + }; + + diesel::update(dataset_dsl::dataset) + .filter(dataset_dsl::id.eq(dataset)) + .set( + dataset_dsl::size_used + .eq(dataset_total_occupied_size), + ) + .execute_async(&conn).await?; + } + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + RegionDeleteError::NumericError(err) => { + return Error::internal_error( + &format!("Transaction error: {}", err) + ); } - }) - } - }; - - backoff::retry_notify( - backoff::retry_policy_internal_service_aggressive(), - || async { - let region_ids = region_ids.clone(); - transaction(region_ids).await - }, - |e: Error, delay| { - info!(log, "{:?}, trying again in {:?}", e, delay,); - }, - ) - .await + } + } + public_error_from_diesel(e, ErrorHandler::Server) + }) } /// Return the total occupied size for a dataset diff --git a/nexus/db-queries/src/db/datastore/service.rs b/nexus/db-queries/src/db/datastore/service.rs index 40bf250abe..df7ed27a6d 100644 --- a/nexus/db-queries/src/db/datastore/service.rs +++ b/nexus/db-queries/src/db/datastore/service.rs @@ -11,7 +11,9 @@ use crate::db; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; +use crate::db::error::TransactionError; use crate::db::identity::Asset; use crate::db::model::Service; use crate::db::model::Sled; @@ -38,7 +40,12 @@ impl DataStore { service: Service, ) -> CreateResult { let conn = self.pool_connection_authorized(opctx).await?; - self.service_upsert_conn(&conn, service).await + self.service_upsert_conn(&conn, service).await.map_err(|e| match e { + TransactionError::CustomError(err) => err, + TransactionError::Database(err) => { + public_error_from_diesel(err, ErrorHandler::Server) + } + }) } /// Stores a new service in the database (using an existing db connection). @@ -46,7 +53,7 @@ impl DataStore { &self, conn: &async_bb8_diesel::Connection, service: Service, - ) -> CreateResult { + ) -> Result> { use db::schema::service::dsl; let service_id = service.id(); @@ -68,17 +75,24 @@ impl DataStore { .insert_and_get_result_async(conn) .await .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => Error::ObjectNotFound { - type_name: ResourceType::Sled, - lookup_type: LookupType::ById(sled_id), - }, - AsyncInsertError::DatabaseError(e) => public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Service, - &service_id.to_string(), - ), - ), + AsyncInsertError::CollectionNotFound => { + TransactionError::CustomError(Error::ObjectNotFound { + type_name: ResourceType::Sled, + lookup_type: LookupType::ById(sled_id), + }) + } + AsyncInsertError::DatabaseError(e) => { + if retryable(&e) { + return TransactionError::Database(e); + } + TransactionError::CustomError(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Service, + &service_id.to_string(), + ), + )) + } }) } diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs index ec3658c067..ab48ec458f 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -11,6 +11,7 @@ use crate::context::OpContext; use crate::db; use crate::db::datastore::RunnableQuery; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::error::TransactionError; use crate::db::fixed_data::silo::{DEFAULT_SILO, INTERNAL_SILO}; @@ -123,15 +124,17 @@ impl DataStore { dns_update: DnsVersionUpdateBuilder, ) -> CreateResult { let conn = self.pool_connection_authorized(opctx).await?; - self.silo_create_conn( - &conn, - opctx, - nexus_opctx, - new_silo_params, - new_silo_dns_names, - dns_update, - ) - .await + let silo = self + .silo_create_conn( + &conn, + opctx, + nexus_opctx, + new_silo_params, + new_silo_dns_names, + dns_update, + ) + .await?; + Ok(silo) } pub async fn silo_create_conn( @@ -142,7 +145,7 @@ impl DataStore { new_silo_params: params::SiloCreate, new_silo_dns_names: &[String], dns_update: DnsVersionUpdateBuilder, - ) -> CreateResult { + ) -> Result> { let silo_id = Uuid::new_v4(); let silo_group_id = Uuid::new_v4(); @@ -199,71 +202,71 @@ impl DataStore { None }; - conn.transaction_async(|conn| async move { - let silo = silo_create_query - .get_result_async(&conn) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Silo, - new_silo_params.identity.name.as_str(), - ), - ) - })?; - self.virtual_provisioning_collection_create_on_connection( - &conn, - VirtualProvisioningCollection::new( - silo.id(), - CollectionTypeProvisioned::Silo, - ), - ) - .await?; - - if let Some(query) = silo_admin_group_ensure_query { - query.get_result_async(&conn).await?; - } - - if let Some(queries) = silo_admin_group_role_assignment_queries { - let (delete_old_query, insert_new_query) = queries; - delete_old_query.execute_async(&conn).await?; - insert_new_query.execute_async(&conn).await?; - } - - let certificates = new_silo_params - .tls_certificates - .into_iter() - .map(|c| { - Certificate::new( + let silo = conn + .transaction_async(|conn| async move { + let silo = silo_create_query + .get_result_async(&conn) + .await + .map_err(|e| { + if retryable(&e) { + return TransactionError::Database(e); + } + TransactionError::CustomError(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Silo, + new_silo_params.identity.name.as_str(), + ), + )) + })?; + self.virtual_provisioning_collection_create_on_connection( + &conn, + VirtualProvisioningCollection::new( silo.id(), - Uuid::new_v4(), - ServiceKind::Nexus, - c, - new_silo_dns_names, - ) - }) - .collect::, _>>() - .map_err(Error::from)?; - { - use db::schema::certificate::dsl; - diesel::insert_into(dsl::certificate) - .values(certificates) - .execute_async(&conn) - .await?; - } - - self.dns_update(nexus_opctx, &conn, dns_update).await?; + CollectionTypeProvisioned::Silo, + ), + ) + .await?; - Ok(silo) - }) - .await - .map_err(|e| match e { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + if let Some(query) = silo_admin_group_ensure_query { + query.get_result_async(&conn).await?; + } + + if let Some(queries) = silo_admin_group_role_assignment_queries + { + let (delete_old_query, insert_new_query) = queries; + delete_old_query.execute_async(&conn).await?; + insert_new_query.execute_async(&conn).await?; + } + + let certificates = new_silo_params + .tls_certificates + .into_iter() + .map(|c| { + Certificate::new( + silo.id(), + Uuid::new_v4(), + ServiceKind::Nexus, + c, + new_silo_dns_names, + ) + }) + .collect::, _>>() + .map_err(Error::from)?; + { + use db::schema::certificate::dsl; + diesel::insert_into(dsl::certificate) + .values(certificates) + .execute_async(&conn) + .await?; + } + + self.dns_update(nexus_opctx, &conn, dns_update).await?; + + Ok::>(silo) + }) + .await?; + Ok(silo) } pub async fn silos_list_by_id( @@ -380,7 +383,7 @@ impl DataStore { } self.virtual_provisioning_collection_delete_on_connection( - &conn, id, + &opctx.log, &conn, id, ) .await?; diff --git a/nexus/db-queries/src/db/datastore/silo_group.rs b/nexus/db-queries/src/db/datastore/silo_group.rs index 46f4aae7c9..29fcb7490b 100644 --- a/nexus/db-queries/src/db/datastore/silo_group.rs +++ b/nexus/db-queries/src/db/datastore/silo_group.rs @@ -145,35 +145,39 @@ impl DataStore { ) -> UpdateResult<()> { opctx.authorize(authz::Action::Modify, authz_silo_user).await?; - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - use db::schema::silo_group_membership::dsl; + let conn = self.pool_connection_authorized(opctx).await?; - // Delete existing memberships for user - let silo_user_id = authz_silo_user.id(); - diesel::delete(dsl::silo_group_membership) - .filter(dsl::silo_user_id.eq(silo_user_id)) - .execute_async(&conn) - .await?; + self.transaction_retry_wrapper("silo_group_membership_replace_for_user") + .transaction(&conn, |conn| { + let silo_group_ids = silo_group_ids.clone(); + async move { + use db::schema::silo_group_membership::dsl; + + // Delete existing memberships for user + let silo_user_id = authz_silo_user.id(); + diesel::delete(dsl::silo_group_membership) + .filter(dsl::silo_user_id.eq(silo_user_id)) + .execute_async(&conn) + .await?; - // Create new memberships for user - let silo_group_memberships: Vec< - db::model::SiloGroupMembership, - > = silo_group_ids - .iter() - .map(|group_id| db::model::SiloGroupMembership { - silo_group_id: *group_id, - silo_user_id, - }) - .collect(); + // Create new memberships for user + let silo_group_memberships: Vec< + db::model::SiloGroupMembership, + > = silo_group_ids + .iter() + .map(|group_id| db::model::SiloGroupMembership { + silo_group_id: *group_id, + silo_user_id, + }) + .collect(); - diesel::insert_into(dsl::silo_group_membership) - .values(silo_group_memberships) - .execute_async(&conn) - .await?; + diesel::insert_into(dsl::silo_group_membership) + .values(silo_group_memberships) + .execute_async(&conn) + .await?; - Ok(()) + Ok(()) + } }) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 406119a636..023384a9bf 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -10,13 +10,12 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::Sled; use crate::db::model::SledResource; use crate::db::model::SledUpdate; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -90,123 +89,141 @@ impl DataStore { enum SledReservationError { NotFound, } - type TxnError = TransactionError; - - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - use db::schema::sled_resource::dsl as resource_dsl; - // Check if resource ID already exists - if so, return it. - let old_resource = resource_dsl::sled_resource - .filter(resource_dsl::id.eq(resource_id)) - .select(SledResource::as_select()) - .limit(1) - .load_async(&conn) - .await?; - - if !old_resource.is_empty() { - return Ok(old_resource[0].clone()); - } - - // If it doesn't already exist, find a sled with enough space - // for the resources we're requesting. - use db::schema::sled::dsl as sled_dsl; - // This answers the boolean question: - // "Does the SUM of all hardware thread usage, plus the one we're trying - // to allocate, consume less threads than exists on the sled?" - let sled_has_space_for_threads = - (diesel::dsl::sql::(&format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::hardware_threads::NAME - )) + resources.hardware_threads) - .le(sled_dsl::usable_hardware_threads); - - // This answers the boolean question: - // "Does the SUM of all RAM usage, plus the one we're trying - // to allocate, consume less RAM than exists on the sled?" - let sled_has_space_for_rss = - (diesel::dsl::sql::(&format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::rss_ram::NAME - )) + resources.rss_ram) - .le(sled_dsl::usable_physical_ram); - - // Determine whether adding this service's reservoir allocation - // to what's allocated on the sled would avoid going over quota. - let sled_has_space_in_reservoir = - (diesel::dsl::sql::(&format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::reservoir_ram::NAME - )) + resources.reservoir_ram) - .le(sled_dsl::reservoir_size); - - // Generate a query describing all of the sleds that have space - // for this reservation. - let mut sled_targets = sled_dsl::sled - .left_join( - resource_dsl::sled_resource - .on(resource_dsl::sled_id.eq(sled_dsl::id)), - ) - .group_by(sled_dsl::id) - .having( - sled_has_space_for_threads - .and(sled_has_space_for_rss) - .and(sled_has_space_in_reservoir), - ) - .filter(sled_dsl::time_deleted.is_null()) - // Filter out sleds that are not provisionable. - .filter( - sled_dsl::provision_state - .eq(db::model::SledProvisionState::Provisionable), - ) - .select(sled_dsl::id) - .into_boxed(); - - // Further constrain the sled IDs according to any caller- - // supplied constraints. - if let Some(must_select_from) = constraints.must_select_from() { - sled_targets = sled_targets - .filter(sled_dsl::id.eq_any(must_select_from.to_vec())); - } - sql_function!(fn random() -> diesel::sql_types::Float); - let sled_targets = sled_targets - .order(random()) - .limit(1) - .get_results_async::(&conn) - .await?; - - if sled_targets.is_empty() { - return Err(TxnError::CustomError( - SledReservationError::NotFound, - )); + let err = OptionalError::new(); + + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("sled_reservation_create") + .transaction(&conn, |conn| { + // Clone variables into retryable function + let err = err.clone(); + let constraints = constraints.clone(); + let resources = resources.clone(); + + async move { + use db::schema::sled_resource::dsl as resource_dsl; + // Check if resource ID already exists - if so, return it. + let old_resource = resource_dsl::sled_resource + .filter(resource_dsl::id.eq(resource_id)) + .select(SledResource::as_select()) + .limit(1) + .load_async(&conn) + .await?; + + if !old_resource.is_empty() { + return Ok(old_resource[0].clone()); + } + + // If it doesn't already exist, find a sled with enough space + // for the resources we're requesting. + use db::schema::sled::dsl as sled_dsl; + // This answers the boolean question: + // "Does the SUM of all hardware thread usage, plus the one we're trying + // to allocate, consume less threads than exists on the sled?" + let sled_has_space_for_threads = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::hardware_threads::NAME + ), + ) + resources.hardware_threads) + .le(sled_dsl::usable_hardware_threads); + + // This answers the boolean question: + // "Does the SUM of all RAM usage, plus the one we're trying + // to allocate, consume less RAM than exists on the sled?" + let sled_has_space_for_rss = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::rss_ram::NAME + ), + ) + resources.rss_ram) + .le(sled_dsl::usable_physical_ram); + + // Determine whether adding this service's reservoir allocation + // to what's allocated on the sled would avoid going over quota. + let sled_has_space_in_reservoir = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::reservoir_ram::NAME + ), + ) + resources.reservoir_ram) + .le(sled_dsl::reservoir_size); + + // Generate a query describing all of the sleds that have space + // for this reservation. + let mut sled_targets = + sled_dsl::sled + .left_join( + resource_dsl::sled_resource + .on(resource_dsl::sled_id.eq(sled_dsl::id)), + ) + .group_by(sled_dsl::id) + .having( + sled_has_space_for_threads + .and(sled_has_space_for_rss) + .and(sled_has_space_in_reservoir), + ) + .filter(sled_dsl::time_deleted.is_null()) + // Filter out sleds that are not provisionable. + .filter(sled_dsl::provision_state.eq( + db::model::SledProvisionState::Provisionable, + )) + .select(sled_dsl::id) + .into_boxed(); + + // Further constrain the sled IDs according to any caller- + // supplied constraints. + if let Some(must_select_from) = + constraints.must_select_from() + { + sled_targets = sled_targets.filter( + sled_dsl::id.eq_any(must_select_from.to_vec()), + ); + } + + sql_function!(fn random() -> diesel::sql_types::Float); + let sled_targets = sled_targets + .order(random()) + .limit(1) + .get_results_async::(&conn) + .await?; + + if sled_targets.is_empty() { + return Err(err.bail(SledReservationError::NotFound)); + } + + // Create a SledResource record, associate it with the target + // sled. + let resource = SledResource::new( + resource_id, + sled_targets[0], + resource_kind, + resources, + ); + + diesel::insert_into(resource_dsl::sled_resource) + .values(resource) + .returning(SledResource::as_returning()) + .get_result_async(&conn) + .await } - - // Create a SledResource record, associate it with the target - // sled. - let resource = SledResource::new( - resource_id, - sled_targets[0], - resource_kind, - resources, - ); - - Ok(diesel::insert_into(resource_dsl::sled_resource) - .values(resource) - .returning(SledResource::as_returning()) - .get_result_async(&conn) - .await?) }) .await - .map_err(|e| match e { - TxnError::CustomError(SledReservationError::NotFound) => { - external::Error::unavail( - "No sleds can fit the requested instance", - ) - } - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SledReservationError::NotFound => { + return external::Error::unavail( + "No sleds can fit the requested instance", + ); + } + } } + public_error_from_diesel(e, ErrorHandler::Server) }) } diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 7c03e4bd40..7a9eb8d2bc 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -20,8 +20,7 @@ use crate::db::model::SnapshotState; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; -use crate::db::TransactionError; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -48,114 +47,99 @@ impl DataStore { let gen = snapshot.gen; opctx.authorize(authz::Action::CreateChild, authz_project).await?; - #[derive(Debug, thiserror::Error)] - pub enum CustomError { - #[error("Resource already exists")] - ResourceAlreadyExists, - - #[error("saw AsyncInsertError")] - InsertError(AsyncInsertError), - } - - type TxnError = TransactionError; - - let snapshot_name = snapshot.name().to_string(); let project_id = snapshot.project_id; - let snapshot: Snapshot = self - .pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - use db::schema::snapshot::dsl; + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; - // If an undeleted snapshot exists in the database with the - // same name and project but a different id to the snapshot - // this function was passed as an argument, then return an - // error here. - // - // As written below, - // - // .on_conflict((dsl::project_id, dsl::name)) - // .filter_target(dsl::time_deleted.is_null()) - // .do_update() - // .set(dsl::time_modified.eq(dsl::time_modified)) - // - // will set any existing record's `time_modified` if the - // project id and name match, even if the snapshot ID does - // not match. diesel supports adding a filter below like so - // (marked with >>): - // - // .on_conflict((dsl::project_id, dsl::name)) - // .filter_target(dsl::time_deleted.is_null()) - // .do_update() - // .set(dsl::time_modified.eq(dsl::time_modified)) - // >> .filter(dsl::id.eq(snapshot.id())) - // - // which will restrict the `insert_into`'s set so that it - // only applies if the snapshot ID matches. But, - // AsyncInsertError does not have a ObjectAlreadyExists - // variant, so this will be returned as CollectionNotFound - // due to the `insert_into` failing. - // - // If this function is passed a snapshot with an ID that - // does not match, but a project and name that does, return - // ObjectAlreadyExists here. + let snapshot: Snapshot = self + .transaction_retry_wrapper("project_ensure_snapshot") + .transaction(&conn, |conn| { + let err = err.clone(); + let snapshot = snapshot.clone(); + let snapshot_name = snapshot.name().to_string(); + async move { + use db::schema::snapshot::dsl; - let existing_snapshot_id: Option = dsl::snapshot - .filter(dsl::time_deleted.is_null()) - .filter(dsl::name.eq(snapshot.name().to_string())) - .filter(dsl::project_id.eq(snapshot.project_id)) - .select(dsl::id) - .limit(1) - .first_async(&conn) - .await - .optional()?; + // If an undeleted snapshot exists in the database with the + // same name and project but a different id to the snapshot + // this function was passed as an argument, then return an + // error here. + // + // As written below, + // + // .on_conflict((dsl::project_id, dsl::name)) + // .filter_target(dsl::time_deleted.is_null()) + // .do_update() + // .set(dsl::time_modified.eq(dsl::time_modified)) + // + // will set any existing record's `time_modified` if the + // project id and name match, even if the snapshot ID does + // not match. diesel supports adding a filter below like so + // (marked with >>): + // + // .on_conflict((dsl::project_id, dsl::name)) + // .filter_target(dsl::time_deleted.is_null()) + // .do_update() + // .set(dsl::time_modified.eq(dsl::time_modified)) + // >> .filter(dsl::id.eq(snapshot.id())) + // + // which will restrict the `insert_into`'s set so that it + // only applies if the snapshot ID matches. But, + // AsyncInsertError does not have a ObjectAlreadyExists + // variant, so this will be returned as CollectionNotFound + // due to the `insert_into` failing. + // + // If this function is passed a snapshot with an ID that + // does not match, but a project and name that does, return + // ObjectAlreadyExists here. - if let Some(existing_snapshot_id) = existing_snapshot_id { - if existing_snapshot_id != snapshot.id() { - return Err(TransactionError::CustomError( - CustomError::ResourceAlreadyExists, - )); - } - } + let existing_snapshot_id: Option = dsl::snapshot + .filter(dsl::time_deleted.is_null()) + .filter(dsl::name.eq(snapshot.name().to_string())) + .filter(dsl::project_id.eq(snapshot.project_id)) + .select(dsl::id) + .limit(1) + .first_async(&conn) + .await + .optional()?; - Project::insert_resource( - project_id, - diesel::insert_into(dsl::snapshot) - .values(snapshot) - .on_conflict((dsl::project_id, dsl::name)) - .filter_target(dsl::time_deleted.is_null()) - .do_update() - .set(dsl::time_modified.eq(dsl::time_modified)), - ) - .insert_and_get_result_async(&conn) - .await - .map_err(|e| { - TransactionError::CustomError(CustomError::InsertError(e)) - }) - }) - .await - .map_err(|e: TxnError| match e { - TxnError::CustomError(e) => match e { - CustomError::ResourceAlreadyExists => { - Error::ObjectAlreadyExists { - type_name: ResourceType::Snapshot, - object_name: snapshot_name, + if let Some(existing_snapshot_id) = existing_snapshot_id { + if existing_snapshot_id != snapshot.id() { + return Err(err.bail(Error::ObjectAlreadyExists { + type_name: ResourceType::Snapshot, + object_name: snapshot_name, + })); } } - CustomError::InsertError(e) => match e { + + Project::insert_resource( + project_id, + diesel::insert_into(dsl::snapshot) + .values(snapshot) + .on_conflict((dsl::project_id, dsl::name)) + .filter_target(dsl::time_deleted.is_null()) + .do_update() + .set(dsl::time_modified.eq(dsl::time_modified)), + ) + .insert_and_get_result_async(&conn) + .await + .map_err(|e| match e { AsyncInsertError::CollectionNotFound => { - Error::ObjectNotFound { + err.bail(Error::ObjectNotFound { type_name: ResourceType::Project, lookup_type: LookupType::ById(project_id), - } - } - AsyncInsertError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + }) } - }, - }, - TxnError::Database(e) => { + AsyncInsertError::DatabaseError(e) => e, + }) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err + } else { public_error_from_diesel(e, ErrorHandler::Server) } })?; diff --git a/nexus/db-queries/src/db/datastore/switch_interface.rs b/nexus/db-queries/src/db/datastore/switch_interface.rs index 88cff50471..67f16fa08f 100644 --- a/nexus/db-queries/src/db/datastore/switch_interface.rs +++ b/nexus/db-queries/src/db/datastore/switch_interface.rs @@ -11,11 +11,10 @@ use crate::db::datastore::address_lot::{ }; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::LoopbackAddress; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; -use diesel::result::Error as DieselError; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use ipnetwork::IpNetwork; use nexus_types::external_api::params::LoopbackAddressCreate; @@ -40,80 +39,78 @@ impl DataStore { ReserveBlock(ReserveBlockError), } - type TxnError = TransactionError; - let conn = self.pool_connection_authorized(opctx).await?; let inet = IpNetwork::new(params.address, params.mask) .map_err(|_| Error::invalid_request("invalid address"))?; + let err = OptionalError::new(); + // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - let lot_id = authz_address_lot.id(); - let (block, rsvd_block) = - crate::db::datastore::address_lot::try_reserve_block( - lot_id, - inet.ip().into(), - params.anycast, - &conn, - ) - .await - .map_err(|e| match e { - ReserveBlockTxnError::CustomError(err) => { - TxnError::CustomError( - LoopbackAddressCreateError::ReserveBlock(err), + self.transaction_retry_wrapper("loopback_address_create") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let lot_id = authz_address_lot.id(); + let (block, rsvd_block) = + crate::db::datastore::address_lot::try_reserve_block( + lot_id, + inet.ip().into(), + params.anycast, + &conn, ) + .await + .map_err(|e| match e { + ReserveBlockTxnError::CustomError(e) => err.bail( + LoopbackAddressCreateError::ReserveBlock(e), + ), + ReserveBlockTxnError::Database(e) => e, + })?; + + // Address block reserved, now create the loopback address. + + let addr = LoopbackAddress::new( + id, + block.id, + rsvd_block.id, + params.rack_id, + params.switch_location.to_string(), + inet, + params.anycast, + ); + + let db_addr: LoopbackAddress = + diesel::insert_into(dsl::loopback_address) + .values(addr) + .returning(LoopbackAddress::as_returning()) + .get_result_async(&conn) + .await?; + + Ok(db_addr) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + LoopbackAddressCreateError::ReserveBlock( + ReserveBlockError::AddressUnavailable, + ) => Error::invalid_request("address unavailable"), + LoopbackAddressCreateError::ReserveBlock( + ReserveBlockError::AddressNotInLot, + ) => Error::invalid_request("address not in lot"), } - ReserveBlockTxnError::Database(err) => { - TxnError::Database(err) - } - })?; - - // Address block reserved, now create the loopback address. - - let addr = LoopbackAddress::new( - id, - block.id, - rsvd_block.id, - params.rack_id, - params.switch_location.to_string(), - inet, - params.anycast, - ); - - let db_addr: LoopbackAddress = - diesel::insert_into(dsl::loopback_address) - .values(addr) - .returning(LoopbackAddress::as_returning()) - .get_result_async(&conn) - .await?; - - Ok(db_addr) - }) - .await - .map_err(|e| match e { - TxnError::CustomError( - LoopbackAddressCreateError::ReserveBlock( - ReserveBlockError::AddressUnavailable, - ), - ) => Error::invalid_request("address unavailable"), - TxnError::CustomError( - LoopbackAddressCreateError::ReserveBlock( - ReserveBlockError::AddressNotInLot, - ), - ) => Error::invalid_request("address not in lot"), - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::LoopbackAddress, - &format!("lo {}", inet), - ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, - }) + } else { + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::LoopbackAddress, + &format!("lo {}", inet), + ), + ) + } + }) } pub async fn loopback_address_delete( @@ -130,22 +127,23 @@ impl DataStore { // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - let la = diesel::delete(dsl::loopback_address) - .filter(dsl::id.eq(id)) - .returning(LoopbackAddress::as_returning()) - .get_result_async(&conn) - .await?; - - diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) - .filter(rsvd_block_dsl::id.eq(la.rsvd_address_lot_block_id)) - .execute_async(&conn) - .await?; - - Ok(()) - }) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + self.transaction_retry_wrapper("loopback_address_delete") + .transaction(&conn, |conn| async move { + let la = diesel::delete(dsl::loopback_address) + .filter(dsl::id.eq(id)) + .returning(LoopbackAddress::as_returning()) + .get_result_async(&conn) + .await?; + + diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) + .filter(rsvd_block_dsl::id.eq(la.rsvd_address_lot_block_id)) + .execute_async(&conn) + .await?; + + Ok(()) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn loopback_address_get( diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index 6bd4e61f70..221feee23c 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -11,7 +11,6 @@ use crate::db::datastore::address_lot::{ use crate::db::datastore::UpdatePrecondition; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::{ LldpServiceConfig, Name, SwitchInterfaceConfig, SwitchPort, SwitchPortAddressConfig, SwitchPortBgpPeerConfig, SwitchPortConfig, @@ -20,8 +19,8 @@ use crate::db::model::{ SwitchVlanInterfaceConfig, }; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; -use diesel::result::Error as DieselError; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ CombineDsl, ExpressionMethods, JoinOnDsl, NullableExpressionMethods, QueryDsl, SelectableHelper, @@ -163,283 +162,285 @@ impl DataStore { BgpConfigNotFound, ReserveBlock(ReserveBlockError), } - type TxnError = TransactionError; type SpsCreateError = SwitchPortSettingsCreateError; + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - // create the top level port settings object - let port_settings = match id { - Some(id) => SwitchPortSettings::with_id(id, ¶ms.identity), - None => SwitchPortSettings::new(¶ms.identity), - }; - //let port_settings = SwitchPortSettings::new(¶ms.identity); - let db_port_settings: SwitchPortSettings = - diesel::insert_into(port_settings_dsl::switch_port_settings) - .values(port_settings) - .returning(SwitchPortSettings::as_returning()) - .get_result_async(&conn) - .await?; - - let psid = db_port_settings.identity.id; - - // add the port config - let port_config = SwitchPortConfig::new( - psid, - params.port_config.geometry.into(), - ); - - let db_port_config: SwitchPortConfig = - diesel::insert_into(port_config_dsl::switch_port_settings_port_config) - .values(port_config) - .returning(SwitchPortConfig::as_returning()) - .get_result_async(&conn) - .await?; - - let mut result = SwitchPortSettingsCombinedResult{ - settings: db_port_settings, - groups: Vec::new(), - port: db_port_config, - links: Vec::new(), - link_lldp: Vec::new(), - interfaces: Vec::new(), - vlan_interfaces: Vec::new(), - routes: Vec::new(), - bgp_peers: Vec::new(), - addresses: Vec::new(), - }; - - //TODO validate link configs consistent with port geometry. - // - https://github.com/oxidecomputer/omicron/issues/2816 - - let mut lldp_config = Vec::with_capacity(params.links.len()); - let mut link_config = Vec::with_capacity(params.links.len()); - - for (link_name, c) in ¶ms.links { - let lldp_config_id = match c.lldp.lldp_config { - Some(_) => todo!(), // TODO actual lldp support - None => None, - }; - let lldp_svc_config = - LldpServiceConfig::new(c.lldp.enabled, lldp_config_id); - - lldp_config.push(lldp_svc_config.clone()); - link_config.push(SwitchPortLinkConfig::new( - psid, - lldp_svc_config.id, - link_name.clone(), - c.mtu, - c.fec.into(), - c.speed.into(), - c.autoneg, - )); - } - result.link_lldp = - diesel::insert_into(lldp_config_dsl::lldp_service_config) - .values(lldp_config.clone()) - .returning(LldpServiceConfig::as_returning()) - .get_results_async(&conn) - .await?; - result.links = - diesel::insert_into( - link_config_dsl::switch_port_settings_link_config) - .values(link_config) - .returning(SwitchPortLinkConfig::as_returning()) - .get_results_async(&conn) - .await?; - - let mut interface_config = Vec::with_capacity(params.interfaces.len()); - let mut vlan_interface_config = Vec::new(); - for (interface_name, i) in ¶ms.interfaces { - let ifx_config = SwitchInterfaceConfig::new( - psid, - interface_name.clone(), - i.v6_enabled, - i.kind.into(), - ); - interface_config.push(ifx_config.clone()); - if let params::SwitchInterfaceKind::Vlan(vlan_if) = i.kind { - vlan_interface_config.push(SwitchVlanInterfaceConfig::new( - ifx_config.id, - vlan_if.vid, - )); - } - } - result.interfaces = - diesel::insert_into( - interface_config_dsl::switch_port_settings_interface_config) - .values(interface_config) - .returning(SwitchInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; - result.vlan_interfaces = - diesel::insert_into(vlan_config_dsl::switch_vlan_interface_config) - .values(vlan_interface_config) - .returning(SwitchVlanInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; - - - let mut route_config = Vec::with_capacity(params.routes.len()); - - for (interface_name, r) in ¶ms.routes { - for route in &r.routes { - route_config.push(SwitchPortRouteConfig::new( - psid, - interface_name.clone(), - route.dst.into(), - route.gw.into(), - route.vid.map(Into::into), - )); - } - } - result.routes = - diesel::insert_into( - route_config_dsl::switch_port_settings_route_config) - .values(route_config) - .returning(SwitchPortRouteConfig::as_returning()) - .get_results_async(&conn) - .await?; - - let mut bgp_peer_config = Vec::new(); - for (interface_name, peer_config) in ¶ms.bgp_peers { - for p in &peer_config.peers { - use db::schema::bgp_config; - let bgp_config_id = match &p.bgp_config { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - bgp_config_dsl::bgp_config - .filter(bgp_config::time_deleted.is_null()) - .filter(bgp_config::name.eq(name)) - .select(bgp_config::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| - TxnError::CustomError( - SwitchPortSettingsCreateError::BgpConfigNotFound, - ) - )? - } + self.transaction_retry_wrapper("switch_port_settings_create") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // create the top level port settings object + let port_settings = match id { + Some(id) => SwitchPortSettings::with_id(id, ¶ms.identity), + None => SwitchPortSettings::new(¶ms.identity), }; - - bgp_peer_config.push(SwitchPortBgpPeerConfig::new( + //let port_settings = SwitchPortSettings::new(¶ms.identity); + let db_port_settings: SwitchPortSettings = + diesel::insert_into(port_settings_dsl::switch_port_settings) + .values(port_settings) + .returning(SwitchPortSettings::as_returning()) + .get_result_async(&conn) + .await?; + + let psid = db_port_settings.identity.id; + + // add the port config + let port_config = SwitchPortConfig::new( psid, - bgp_config_id, - interface_name.clone(), - p.addr.into(), - p.hold_time.into(), - p.idle_hold_time.into(), - p.delay_open.into(), - p.connect_retry.into(), - p.keepalive.into(), - )); + params.port_config.geometry.into(), + ); + + let db_port_config: SwitchPortConfig = + diesel::insert_into(port_config_dsl::switch_port_settings_port_config) + .values(port_config) + .returning(SwitchPortConfig::as_returning()) + .get_result_async(&conn) + .await?; + + let mut result = SwitchPortSettingsCombinedResult{ + settings: db_port_settings, + groups: Vec::new(), + port: db_port_config, + links: Vec::new(), + link_lldp: Vec::new(), + interfaces: Vec::new(), + vlan_interfaces: Vec::new(), + routes: Vec::new(), + bgp_peers: Vec::new(), + addresses: Vec::new(), + }; - } - } - result.bgp_peers = - diesel::insert_into( - bgp_peer_dsl::switch_port_settings_bgp_peer_config) - .values(bgp_peer_config) - .returning(SwitchPortBgpPeerConfig::as_returning()) - .get_results_async(&conn) - .await?; + //TODO validate link configs consistent with port geometry. + // - https://github.com/oxidecomputer/omicron/issues/2816 + + let mut lldp_config = Vec::with_capacity(params.links.len()); + let mut link_config = Vec::with_capacity(params.links.len()); + + for (link_name, c) in ¶ms.links { + let lldp_config_id = match c.lldp.lldp_config { + Some(_) => todo!(), // TODO actual lldp support + None => None, + }; + let lldp_svc_config = + LldpServiceConfig::new(c.lldp.enabled, lldp_config_id); + + lldp_config.push(lldp_svc_config.clone()); + link_config.push(SwitchPortLinkConfig::new( + psid, + lldp_svc_config.id, + link_name.clone(), + c.mtu, + c.fec.into(), + c.speed.into(), + c.autoneg, + )); + } + result.link_lldp = + diesel::insert_into(lldp_config_dsl::lldp_service_config) + .values(lldp_config.clone()) + .returning(LldpServiceConfig::as_returning()) + .get_results_async(&conn) + .await?; + result.links = + diesel::insert_into( + link_config_dsl::switch_port_settings_link_config) + .values(link_config) + .returning(SwitchPortLinkConfig::as_returning()) + .get_results_async(&conn) + .await?; + + let mut interface_config = Vec::with_capacity(params.interfaces.len()); + let mut vlan_interface_config = Vec::new(); + for (interface_name, i) in ¶ms.interfaces { + let ifx_config = SwitchInterfaceConfig::new( + psid, + interface_name.clone(), + i.v6_enabled, + i.kind.into(), + ); + interface_config.push(ifx_config.clone()); + if let params::SwitchInterfaceKind::Vlan(vlan_if) = i.kind { + vlan_interface_config.push(SwitchVlanInterfaceConfig::new( + ifx_config.id, + vlan_if.vid, + )); + } + } + result.interfaces = + diesel::insert_into( + interface_config_dsl::switch_port_settings_interface_config) + .values(interface_config) + .returning(SwitchInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; + result.vlan_interfaces = + diesel::insert_into(vlan_config_dsl::switch_vlan_interface_config) + .values(vlan_interface_config) + .returning(SwitchVlanInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; + + let mut route_config = Vec::with_capacity(params.routes.len()); + + for (interface_name, r) in ¶ms.routes { + for route in &r.routes { + route_config.push(SwitchPortRouteConfig::new( + psid, + interface_name.clone(), + route.dst.into(), + route.gw.into(), + route.vid.map(Into::into), + )); + } + } + result.routes = + diesel::insert_into( + route_config_dsl::switch_port_settings_route_config) + .values(route_config) + .returning(SwitchPortRouteConfig::as_returning()) + .get_results_async(&conn) + .await?; + + let mut bgp_peer_config = Vec::new(); + for (interface_name, peer_config) in ¶ms.bgp_peers { + for p in &peer_config.peers { + use db::schema::bgp_config; + let bgp_config_id = match &p.bgp_config { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + bgp_config_dsl::bgp_config + .filter(bgp_config::time_deleted.is_null()) + .filter(bgp_config::name.eq(name)) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsCreateError::BgpConfigNotFound + ) + })? + } + }; + + bgp_peer_config.push(SwitchPortBgpPeerConfig::new( + psid, + bgp_config_id, + interface_name.clone(), + p.addr.into(), + p.hold_time.into(), + p.idle_hold_time.into(), + p.delay_open.into(), + p.connect_retry.into(), + p.keepalive.into(), + )); - let mut address_config = Vec::new(); - use db::schema::address_lot; - for (interface_name, a) in ¶ms.addresses { - for address in &a.addresses { - let address_lot_id = match &address.address_lot { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - address_lot_dsl::address_lot - .filter(address_lot::time_deleted.is_null()) - .filter(address_lot::name.eq(name)) - .select(address_lot::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| - TxnError::CustomError( - SwitchPortSettingsCreateError::AddressLotNotFound, - ) - )? } - }; - // TODO: Reduce DB round trips needed for reserving ip blocks - // https://github.com/oxidecomputer/omicron/issues/3060 - let (block, rsvd_block) = - crate::db::datastore::address_lot::try_reserve_block( - address_lot_id, - address.address.ip().into(), - // TODO: Should we allow anycast addresses for switch_ports? - // anycast - false, - &conn - ) - .await - .map_err(|e| match e { - ReserveBlockTxnError::CustomError(err) => { - TxnError::CustomError( - SwitchPortSettingsCreateError::ReserveBlock(err) + } + result.bgp_peers = + diesel::insert_into( + bgp_peer_dsl::switch_port_settings_bgp_peer_config) + .values(bgp_peer_config) + .returning(SwitchPortBgpPeerConfig::as_returning()) + .get_results_async(&conn) + .await?; + + let mut address_config = Vec::new(); + use db::schema::address_lot; + for (interface_name, a) in ¶ms.addresses { + for address in &a.addresses { + let address_lot_id = match &address.address_lot { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + address_lot_dsl::address_lot + .filter(address_lot::time_deleted.is_null()) + .filter(address_lot::name.eq(name)) + .select(address_lot::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsCreateError::AddressLotNotFound + ) + })? + } + }; + // TODO: Reduce DB round trips needed for reserving ip blocks + // https://github.com/oxidecomputer/omicron/issues/3060 + let (block, rsvd_block) = + crate::db::datastore::address_lot::try_reserve_block( + address_lot_id, + address.address.ip().into(), + // TODO: Should we allow anycast addresses for switch_ports? + // anycast + false, + &conn ) - } - ReserveBlockTxnError::Database(err) => TxnError::Database(err), - })?; - - address_config.push(SwitchPortAddressConfig::new( - psid, - block.id, - rsvd_block.id, - address.address.into(), - interface_name.clone(), - )); + .await + .map_err(|e| match e { + ReserveBlockTxnError::CustomError(e) => { + err.bail(SwitchPortSettingsCreateError::ReserveBlock(e)) + } + ReserveBlockTxnError::Database(e) => e, + })?; + + address_config.push(SwitchPortAddressConfig::new( + psid, + block.id, + rsvd_block.id, + address.address.into(), + interface_name.clone(), + )); + } + } + result.addresses = + diesel::insert_into( + address_config_dsl::switch_port_settings_address_config) + .values(address_config) + .returning(SwitchPortAddressConfig::as_returning()) + .get_results_async(&conn) + .await?; + + Ok(result) } } - result.addresses = - diesel::insert_into( - address_config_dsl::switch_port_settings_address_config) - .values(address_config) - .returning(SwitchPortAddressConfig::as_returning()) - .get_results_async(&conn) - .await?; - - Ok(result) - }) + ) .await - .map_err(|e| match e { - TxnError::CustomError(SpsCreateError::AddressLotNotFound) => { - Error::invalid_request("AddressLot not found") - } - TxnError::CustomError(SpsCreateError::BgpConfigNotFound) => { - Error::invalid_request("BGP config not found") - } - TxnError::CustomError( - SwitchPortSettingsCreateError::ReserveBlock( - ReserveBlockError::AddressUnavailable - ) - ) => Error::invalid_request("address unavailable"), - TxnError::CustomError( - SwitchPortSettingsCreateError::ReserveBlock( - ReserveBlockError::AddressNotInLot - ) - ) => Error::invalid_request("address not in lot"), - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SpsCreateError::AddressLotNotFound => { + Error::invalid_request("AddressLot not found") + } + SpsCreateError::BgpConfigNotFound => { + Error::invalid_request("BGP config not found") + } + SwitchPortSettingsCreateError::ReserveBlock( + ReserveBlockError::AddressUnavailable + ) => Error::invalid_request("address unavailable"), + SwitchPortSettingsCreateError::ReserveBlock( + ReserveBlockError::AddressNotInLot + ) => Error::invalid_request("address not in lot"), + } + } else { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::SwitchPortSettings, params.identity.name.as_str(), ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, + ) + } }) } @@ -454,7 +455,6 @@ impl DataStore { enum SwitchPortSettingsDeleteError { SwitchPortSettingsNotFound, } - type TxnError = TransactionError; let conn = self.pool_connection_authorized(opctx).await?; @@ -463,173 +463,178 @@ impl DataStore { Some(name_or_id) => name_or_id, }; + let err = OptionalError::new(); + // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - - use db::schema::switch_port_settings; - let id = match selector { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - port_settings_dsl::switch_port_settings - .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::name.eq(name)) - .select(switch_port_settings::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| - TxnError::CustomError( - SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound, - ) - )? - } - }; + self.transaction_retry_wrapper("switch_port_settings_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::switch_port_settings; + let id = match selector { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + port_settings_dsl::switch_port_settings + .filter(switch_port_settings::time_deleted.is_null()) + .filter(switch_port_settings::name.eq(name)) + .select(switch_port_settings::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound + ) + })? + } + }; - // delete the top level port settings object - diesel::delete(port_settings_dsl::switch_port_settings) - .filter(switch_port_settings::id.eq(id)) - .execute_async(&conn) - .await?; + // delete the top level port settings object + diesel::delete(port_settings_dsl::switch_port_settings) + .filter(switch_port_settings::id.eq(id)) + .execute_async(&conn) + .await?; - // delete the port config object - use db::schema::switch_port_settings_port_config::{ - self as sps_port_config, dsl as port_config_dsl, - }; - diesel::delete(port_config_dsl::switch_port_settings_port_config) - .filter(sps_port_config::port_settings_id.eq(id)) - .execute_async(&conn) - .await?; + // delete the port config object + use db::schema::switch_port_settings_port_config::{ + self as sps_port_config, dsl as port_config_dsl, + }; + diesel::delete(port_config_dsl::switch_port_settings_port_config) + .filter(sps_port_config::port_settings_id.eq(id)) + .execute_async(&conn) + .await?; - // delete the link configs - use db::schema::switch_port_settings_link_config::{ - self as sps_link_config, dsl as link_config_dsl, - }; - let links: Vec = - diesel::delete( - link_config_dsl::switch_port_settings_link_config - ) - .filter( - sps_link_config::port_settings_id.eq(id) - ) - .returning(SwitchPortLinkConfig::as_returning()) - .get_results_async(&conn) - .await?; + // delete the link configs + use db::schema::switch_port_settings_link_config::{ + self as sps_link_config, dsl as link_config_dsl, + }; + let links: Vec = + diesel::delete( + link_config_dsl::switch_port_settings_link_config + ) + .filter( + sps_link_config::port_settings_id.eq(id) + ) + .returning(SwitchPortLinkConfig::as_returning()) + .get_results_async(&conn) + .await?; - // delete lldp configs - use db::schema::lldp_service_config::{self, dsl as lldp_config_dsl}; - let lldp_svc_ids: Vec = links - .iter() - .map(|link| link.lldp_service_config_id) - .collect(); - diesel::delete(lldp_config_dsl::lldp_service_config) - .filter(lldp_service_config::id.eq_any(lldp_svc_ids)) - .execute_async(&conn) - .await?; + // delete lldp configs + use db::schema::lldp_service_config::{self, dsl as lldp_config_dsl}; + let lldp_svc_ids: Vec = links + .iter() + .map(|link| link.lldp_service_config_id) + .collect(); + diesel::delete(lldp_config_dsl::lldp_service_config) + .filter(lldp_service_config::id.eq_any(lldp_svc_ids)) + .execute_async(&conn) + .await?; - // delete interface configs - use db::schema::switch_port_settings_interface_config::{ - self as sps_interface_config, dsl as interface_config_dsl, - }; + // delete interface configs + use db::schema::switch_port_settings_interface_config::{ + self as sps_interface_config, dsl as interface_config_dsl, + }; - let interfaces: Vec = - diesel::delete( - interface_config_dsl::switch_port_settings_interface_config - ) - .filter( - sps_interface_config::port_settings_id.eq(id) - ) - .returning(SwitchInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; + let interfaces: Vec = + diesel::delete( + interface_config_dsl::switch_port_settings_interface_config + ) + .filter( + sps_interface_config::port_settings_id.eq(id) + ) + .returning(SwitchInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; - // delete any vlan interfaces - use db::schema::switch_vlan_interface_config::{ - self, dsl as vlan_config_dsl, - }; - let interface_ids: Vec = interfaces - .iter() - .map(|interface| interface.id) - .collect(); - - diesel::delete(vlan_config_dsl::switch_vlan_interface_config) - .filter( - switch_vlan_interface_config::interface_config_id.eq_any( - interface_ids + // delete any vlan interfaces + use db::schema::switch_vlan_interface_config::{ + self, dsl as vlan_config_dsl, + }; + let interface_ids: Vec = interfaces + .iter() + .map(|interface| interface.id) + .collect(); + + diesel::delete(vlan_config_dsl::switch_vlan_interface_config) + .filter( + switch_vlan_interface_config::interface_config_id.eq_any( + interface_ids + ) ) + .execute_async(&conn) + .await?; + + // delete route configs + use db::schema::switch_port_settings_route_config; + use db::schema::switch_port_settings_route_config::dsl + as route_config_dsl; + + diesel::delete( + route_config_dsl::switch_port_settings_route_config ) + .filter(switch_port_settings_route_config::port_settings_id.eq(id)) .execute_async(&conn) .await?; - // delete route configs - use db::schema::switch_port_settings_route_config; - use db::schema::switch_port_settings_route_config::dsl - as route_config_dsl; + // delete bgp configurations + use db::schema::switch_port_settings_bgp_peer_config as bgp_peer; + use db::schema::switch_port_settings_bgp_peer_config::dsl + as bgp_peer_dsl; - diesel::delete( - route_config_dsl::switch_port_settings_route_config - ) - .filter(switch_port_settings_route_config::port_settings_id.eq(id)) - .execute_async(&conn) - .await?; + diesel::delete(bgp_peer_dsl::switch_port_settings_bgp_peer_config) + .filter(bgp_peer::port_settings_id.eq(id)) + .execute_async(&conn) + .await?; - // delete bgp configurations - use db::schema::switch_port_settings_bgp_peer_config as bgp_peer; - use db::schema::switch_port_settings_bgp_peer_config::dsl - as bgp_peer_dsl; + // delete address configs + use db::schema::switch_port_settings_address_config::{ + self as address_config, dsl as address_config_dsl, + }; - diesel::delete(bgp_peer_dsl::switch_port_settings_bgp_peer_config) - .filter(bgp_peer::port_settings_id.eq(id)) - .execute_async(&conn) + let port_settings_addrs = diesel::delete( + address_config_dsl::switch_port_settings_address_config, + ) + .filter(address_config::port_settings_id.eq(id)) + .returning(SwitchPortAddressConfig::as_returning()) + .get_results_async(&conn) .await?; - // delete address configs - use db::schema::switch_port_settings_address_config::{ - self as address_config, dsl as address_config_dsl, - }; - - let port_settings_addrs = diesel::delete( - address_config_dsl::switch_port_settings_address_config, - ) - .filter(address_config::port_settings_id.eq(id)) - .returning(SwitchPortAddressConfig::as_returning()) - .get_results_async(&conn) - .await?; + use db::schema::address_lot_rsvd_block::dsl as rsvd_block_dsl; - use db::schema::address_lot_rsvd_block::dsl as rsvd_block_dsl; + for ps in &port_settings_addrs { + diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) + .filter(rsvd_block_dsl::id.eq(ps.rsvd_address_lot_block_id)) + .execute_async(&conn) + .await?; + } - for ps in &port_settings_addrs { - diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) - .filter(rsvd_block_dsl::id.eq(ps.rsvd_address_lot_block_id)) - .execute_async(&conn) - .await?; + Ok(()) } - - Ok(()) }) .await - .map_err(|e| match e { - TxnError::CustomError( - SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound) => { - Error::invalid_request("port settings not found") + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound => { + Error::invalid_request("port settings not found") + } + } + } else { + let name = match ¶ms.port_settings { + Some(name_or_id) => name_or_id.to_string(), + None => String::new(), + }; + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SwitchPortSettings, + &name, + ), + ) } - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => { - let name = match ¶ms.port_settings { - Some(name_or_id) => name_or_id.to_string(), - None => String::new(), - }; - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::SwitchPortSettings, - &name, - ), - ) - }, - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, }) } @@ -666,174 +671,178 @@ impl DataStore { enum SwitchPortSettingsGetError { NotFound(external::Name), } - type TxnError = TransactionError; + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - // get the top level port settings object - use db::schema::switch_port_settings::{ - self, dsl as port_settings_dsl, - }; - - let id = match name_or_id { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name_str = name.to_string(); + self.transaction_retry_wrapper("switch_port_settings_get") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // get the top level port settings object + use db::schema::switch_port_settings::{ + self, dsl as port_settings_dsl, + }; + + let id = match name_or_id { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name_str = name.to_string(); + port_settings_dsl::switch_port_settings + .filter(switch_port_settings::time_deleted.is_null()) + .filter(switch_port_settings::name.eq(name_str)) + .select(switch_port_settings::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or_else(diesel_error, |_| { + SwitchPortSettingsGetError::NotFound( + name.clone(), + ) + }) + })? + } + }; + + let settings: SwitchPortSettings = port_settings_dsl::switch_port_settings .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::name.eq(name_str)) - .select(switch_port_settings::id) + .filter(switch_port_settings::id.eq(id)) + .select(SwitchPortSettings::as_select()) .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError( - SwitchPortSettingsGetError::NotFound( - name.clone(), - ), - ) - })? - } - }; - - let settings: SwitchPortSettings = - port_settings_dsl::switch_port_settings - .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::id.eq(id)) - .select(SwitchPortSettings::as_select()) - .limit(1) - .first_async::(&conn) - .await?; + .first_async::(&conn) + .await?; - // get the port config - use db::schema::switch_port_settings_port_config::{ - self as port_config, dsl as port_config_dsl, - }; - let port: SwitchPortConfig = - port_config_dsl::switch_port_settings_port_config - .filter(port_config::port_settings_id.eq(id)) - .select(SwitchPortConfig::as_select()) - .limit(1) - .first_async::(&conn) - .await?; + // get the port config + use db::schema::switch_port_settings_port_config::{ + self as port_config, dsl as port_config_dsl, + }; + let port: SwitchPortConfig = + port_config_dsl::switch_port_settings_port_config + .filter(port_config::port_settings_id.eq(id)) + .select(SwitchPortConfig::as_select()) + .limit(1) + .first_async::(&conn) + .await?; - // initialize result - let mut result = - SwitchPortSettingsCombinedResult::new(settings, port); + // initialize result + let mut result = + SwitchPortSettingsCombinedResult::new(settings, port); - // get the link configs - use db::schema::switch_port_settings_link_config::{ - self as link_config, dsl as link_config_dsl, - }; + // get the link configs + use db::schema::switch_port_settings_link_config::{ + self as link_config, dsl as link_config_dsl, + }; - result.links = link_config_dsl::switch_port_settings_link_config - .filter(link_config::port_settings_id.eq(id)) - .select(SwitchPortLinkConfig::as_select()) - .load_async::(&conn) - .await?; + result.links = link_config_dsl::switch_port_settings_link_config + .filter(link_config::port_settings_id.eq(id)) + .select(SwitchPortLinkConfig::as_select()) + .load_async::(&conn) + .await?; - let lldp_svc_ids: Vec = result - .links - .iter() - .map(|link| link.lldp_service_config_id) - .collect(); - - use db::schema::lldp_service_config as lldp_config; - use db::schema::lldp_service_config::dsl as lldp_dsl; - result.link_lldp = lldp_dsl::lldp_service_config - .filter(lldp_config::id.eq_any(lldp_svc_ids)) - .select(LldpServiceConfig::as_select()) - .limit(1) - .load_async::(&conn) - .await?; + let lldp_svc_ids: Vec = result + .links + .iter() + .map(|link| link.lldp_service_config_id) + .collect(); + + use db::schema::lldp_service_config as lldp_config; + use db::schema::lldp_service_config::dsl as lldp_dsl; + result.link_lldp = lldp_dsl::lldp_service_config + .filter(lldp_config::id.eq_any(lldp_svc_ids)) + .select(LldpServiceConfig::as_select()) + .limit(1) + .load_async::(&conn) + .await?; - // get the interface configs - use db::schema::switch_port_settings_interface_config::{ - self as interface_config, dsl as interface_config_dsl, - }; + // get the interface configs + use db::schema::switch_port_settings_interface_config::{ + self as interface_config, dsl as interface_config_dsl, + }; - result.interfaces = - interface_config_dsl::switch_port_settings_interface_config - .filter(interface_config::port_settings_id.eq(id)) - .select(SwitchInterfaceConfig::as_select()) - .load_async::(&conn) + result.interfaces = + interface_config_dsl::switch_port_settings_interface_config + .filter(interface_config::port_settings_id.eq(id)) + .select(SwitchInterfaceConfig::as_select()) + .load_async::(&conn) + .await?; + + use db::schema::switch_vlan_interface_config as vlan_config; + use db::schema::switch_vlan_interface_config::dsl as vlan_dsl; + let interface_ids: Vec = result + .interfaces + .iter() + .map(|interface| interface.id) + .collect(); + + result.vlan_interfaces = vlan_dsl::switch_vlan_interface_config + .filter(vlan_config::interface_config_id.eq_any(interface_ids)) + .select(SwitchVlanInterfaceConfig::as_select()) + .load_async::(&conn) .await?; - use db::schema::switch_vlan_interface_config as vlan_config; - use db::schema::switch_vlan_interface_config::dsl as vlan_dsl; - let interface_ids: Vec = result - .interfaces - .iter() - .map(|interface| interface.id) - .collect(); - - result.vlan_interfaces = vlan_dsl::switch_vlan_interface_config - .filter(vlan_config::interface_config_id.eq_any(interface_ids)) - .select(SwitchVlanInterfaceConfig::as_select()) - .load_async::(&conn) - .await?; - - // get the route configs - use db::schema::switch_port_settings_route_config::{ - self as route_config, dsl as route_config_dsl, - }; + // get the route configs + use db::schema::switch_port_settings_route_config::{ + self as route_config, dsl as route_config_dsl, + }; - result.routes = route_config_dsl::switch_port_settings_route_config - .filter(route_config::port_settings_id.eq(id)) - .select(SwitchPortRouteConfig::as_select()) - .load_async::(&conn) - .await?; + result.routes = route_config_dsl::switch_port_settings_route_config + .filter(route_config::port_settings_id.eq(id)) + .select(SwitchPortRouteConfig::as_select()) + .load_async::(&conn) + .await?; - // get the bgp peer configs - use db::schema::switch_port_settings_bgp_peer_config::{ - self as bgp_peer, dsl as bgp_peer_dsl, - }; + // get the bgp peer configs + use db::schema::switch_port_settings_bgp_peer_config::{ + self as bgp_peer, dsl as bgp_peer_dsl, + }; - result.bgp_peers = - bgp_peer_dsl::switch_port_settings_bgp_peer_config - .filter(bgp_peer::port_settings_id.eq(id)) - .select(SwitchPortBgpPeerConfig::as_select()) - .load_async::(&conn) - .await?; + result.bgp_peers = + bgp_peer_dsl::switch_port_settings_bgp_peer_config + .filter(bgp_peer::port_settings_id.eq(id)) + .select(SwitchPortBgpPeerConfig::as_select()) + .load_async::(&conn) + .await?; - // get the address configs - use db::schema::switch_port_settings_address_config::{ - self as address_config, dsl as address_config_dsl, - }; + // get the address configs + use db::schema::switch_port_settings_address_config::{ + self as address_config, dsl as address_config_dsl, + }; - result.addresses = - address_config_dsl::switch_port_settings_address_config - .filter(address_config::port_settings_id.eq(id)) - .select(SwitchPortAddressConfig::as_select()) - .load_async::(&conn) - .await?; + result.addresses = + address_config_dsl::switch_port_settings_address_config + .filter(address_config::port_settings_id.eq(id)) + .select(SwitchPortAddressConfig::as_select()) + .load_async::(&conn) + .await?; - Ok(result) + Ok(result) + } }) .await - .map_err(|e| match e { - TxnError::CustomError(SwitchPortSettingsGetError::NotFound( - name, - )) => Error::not_found_by_name( - ResourceType::SwitchPortSettings, - &name, - ), - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => { - let name = name_or_id.to_string(); - public_error_from_diesel( - e, - ErrorHandler::Conflict( + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SwitchPortSettingsGetError::NotFound(name) => { + Error::not_found_by_name( ResourceType::SwitchPortSettings, &name, - ), - ) + ) + } } - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, + } else { + let name = name_or_id.to_string(); + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SwitchPortSettings, + &name, + ), + ) + } }) } @@ -850,7 +859,8 @@ impl DataStore { enum SwitchPortCreateError { RackNotFound, } - type TxnError = TransactionError; + + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; let switch_port = SwitchPort::new( @@ -861,46 +871,59 @@ impl DataStore { // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - use db::schema::rack; - use db::schema::rack::dsl as rack_dsl; - rack_dsl::rack - .filter(rack::id.eq(rack_id)) - .select(rack::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError(SwitchPortCreateError::RackNotFound) - })?; - - // insert switch port - use db::schema::switch_port::dsl as switch_port_dsl; - let db_switch_port: SwitchPort = - diesel::insert_into(switch_port_dsl::switch_port) - .values(switch_port) - .returning(SwitchPort::as_returning()) - .get_result_async(&conn) - .await?; + self.transaction_retry_wrapper("switch_port_create") + .transaction(&conn, |conn| { + let err = err.clone(); + let switch_port = switch_port.clone(); + async move { + use db::schema::rack; + use db::schema::rack::dsl as rack_dsl; + rack_dsl::rack + .filter(rack::id.eq(rack_id)) + .select(rack::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + err.bail_retryable_or( + e, + SwitchPortCreateError::RackNotFound, + ) + })?; - Ok(db_switch_port) - }) - .await - .map_err(|e| match e { - TxnError::CustomError(SwitchPortCreateError::RackNotFound) => { - Error::invalid_request("rack not found") - } - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::SwitchPort, - &format!("{}/{}/{}", rack_id, &switch_location, &port,), - ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, - }) + // insert switch port + use db::schema::switch_port::dsl as switch_port_dsl; + let db_switch_port: SwitchPort = + diesel::insert_into(switch_port_dsl::switch_port) + .values(switch_port) + .returning(SwitchPort::as_returning()) + .get_result_async(&conn) + .await?; + + Ok(db_switch_port) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SwitchPortCreateError::RackNotFound => { + Error::invalid_request("rack not found") + } + } + } else { + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SwitchPort, + &format!( + "{}/{}/{}", + rack_id, &switch_location, &port, + ), + ), + ) + } + }) } pub async fn switch_port_delete( @@ -914,58 +937,75 @@ impl DataStore { NotFound, ActiveSettings, } - type TxnError = TransactionError; + + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - use db::schema::switch_port; - use db::schema::switch_port::dsl as switch_port_dsl; - - let switch_location = params.switch_location.to_string(); - let port_name = portname.to_string(); - let port: SwitchPort = switch_port_dsl::switch_port - .filter(switch_port::rack_id.eq(params.rack_id)) - .filter( - switch_port::switch_location.eq(switch_location.clone()), - ) - .filter(switch_port::port_name.eq(port_name.clone())) - .select(SwitchPort::as_select()) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError(SwitchPortDeleteError::NotFound) - })?; - - if port.port_settings_id.is_some() { - return Err(TxnError::CustomError( - SwitchPortDeleteError::ActiveSettings, - )); - } + self.transaction_retry_wrapper("switch_port_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::switch_port; + use db::schema::switch_port::dsl as switch_port_dsl; + + let switch_location = params.switch_location.to_string(); + let port_name = portname.to_string(); + let port: SwitchPort = switch_port_dsl::switch_port + .filter(switch_port::rack_id.eq(params.rack_id)) + .filter( + switch_port::switch_location + .eq(switch_location.clone()), + ) + .filter(switch_port::port_name.eq(port_name.clone())) + .select(SwitchPort::as_select()) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortDeleteError::NotFound, + ) + })?; - diesel::delete(switch_port_dsl::switch_port) - .filter(switch_port::id.eq(port.id)) - .execute_async(&conn) - .await?; + if port.port_settings_id.is_some() { + return Err( + err.bail(SwitchPortDeleteError::ActiveSettings) + ); + } - Ok(()) - }) - .await - .map_err(|e| match e { - TxnError::CustomError(SwitchPortDeleteError::NotFound) => { - let name = &portname.clone(); - Error::not_found_by_name(ResourceType::SwitchPort, name) - } - TxnError::CustomError(SwitchPortDeleteError::ActiveSettings) => { - Error::invalid_request("must clear port settings first") - } - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + diesel::delete(switch_port_dsl::switch_port) + .filter(switch_port::id.eq(port.id)) + .execute_async(&conn) + .await?; + + Ok(()) + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SwitchPortDeleteError::NotFound => { + let name = &portname.clone(); + Error::not_found_by_name( + ResourceType::SwitchPort, + name, + ) + } + SwitchPortDeleteError::ActiveSettings => { + Error::invalid_request( + "must clear port settings first", + ) + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) } pub async fn switch_port_list( diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index 8b1eecb781..0790bd458e 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -8,15 +8,13 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; use crate::db; -use crate::db::error::{ - public_error_from_diesel, ErrorHandler, TransactionError, -}; +use crate::db::error::{public_error_from_diesel, ErrorHandler}; use crate::db::model::{ ComponentUpdate, SemverVersion, SystemUpdate, UpdateArtifact, UpdateDeployment, UpdateStatus, UpdateableComponent, }; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; use nexus_db_model::SystemUpdateComponentUpdate; @@ -141,36 +139,40 @@ impl DataStore { let version_string = update.version.to_string(); - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - let db_update = diesel::insert_into(component_update::table) - .values(update.clone()) - .returning(ComponentUpdate::as_returning()) - .get_result_async(&conn) - .await?; - - diesel::insert_into(join_table::table) - .values(SystemUpdateComponentUpdate { - system_update_id, - component_update_id: update.id(), - }) - .returning(SystemUpdateComponentUpdate::as_returning()) - .get_result_async(&conn) - .await?; - - Ok(db_update) + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("create_component_update") + .transaction(&conn, |conn| { + let update = update.clone(); + async move { + let db_update = + diesel::insert_into(component_update::table) + .values(update.clone()) + .returning(ComponentUpdate::as_returning()) + .get_result_async(&conn) + .await?; + + diesel::insert_into(join_table::table) + .values(SystemUpdateComponentUpdate { + system_update_id, + component_update_id: update.id(), + }) + .returning(SystemUpdateComponentUpdate::as_returning()) + .get_result_async(&conn) + .await?; + + Ok(db_update) + } }) .await - .map_err(|e| match e { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => public_error_from_diesel( + .map_err(|e| { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::ComponentUpdate, &version_string, ), - ), + ) }) } diff --git a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs index c5c2751723..230c3941ff 100644 --- a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs +++ b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs @@ -15,6 +15,7 @@ use crate::db::pool::DbConnection; use crate::db::queries::virtual_provisioning_collection_update::VirtualProvisioningCollectionUpdate; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use diesel::result::Error as DieselError; use omicron_common::api::external::{DeleteResult, Error}; use uuid::Uuid; @@ -52,13 +53,14 @@ impl DataStore { virtual_provisioning_collection, ) .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub(crate) async fn virtual_provisioning_collection_create_on_connection( &self, conn: &async_bb8_diesel::Connection, virtual_provisioning_collection: VirtualProvisioningCollection, - ) -> Result, Error> { + ) -> Result, DieselError> { use db::schema::virtual_provisioning_collection::dsl; let provisions: Vec = @@ -66,12 +68,10 @@ impl DataStore { .values(virtual_provisioning_collection) .on_conflict_do_nothing() .get_results_async(conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; - self.virtual_provisioning_collection_producer - .append_all_metrics(&provisions)?; + .await?; + let _ = self + .virtual_provisioning_collection_producer + .append_all_metrics(&provisions); Ok(provisions) } @@ -103,16 +103,20 @@ impl DataStore { id: Uuid, ) -> DeleteResult { let conn = self.pool_connection_authorized(opctx).await?; - self.virtual_provisioning_collection_delete_on_connection(&conn, id) - .await + self.virtual_provisioning_collection_delete_on_connection( + &opctx.log, &conn, id, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } /// Delete a [`VirtualProvisioningCollection`] object. pub(crate) async fn virtual_provisioning_collection_delete_on_connection( &self, + log: &slog::Logger, conn: &async_bb8_diesel::Connection, id: Uuid, - ) -> DeleteResult { + ) -> Result<(), DieselError> { use db::schema::virtual_provisioning_collection::dsl; // NOTE: We don't really need to extract the value we're deleting from @@ -122,13 +126,11 @@ impl DataStore { .filter(dsl::id.eq(id)) .returning(VirtualProvisioningCollection::as_select()) .get_result_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; if !collection.is_empty() { - return Err(Error::internal_error(&format!( - "Collection deleted while non-empty: {collection:?}" - ))); + warn!(log, "Collection deleted while non-empty: {collection:?}"); + return Err(DieselError::RollbackTransaction); } Ok(()) } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 5f126050ae..4f31efd610 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -8,15 +8,14 @@ use super::DataStore; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::identity::Asset; use crate::db::model::Dataset; use crate::db::model::Region; use crate::db::model::RegionSnapshot; use crate::db::model::Volume; use crate::db::queries::volume::DecreaseCrucibleResourceCountAndSoftDeleteVolume; +use crate::transaction_retry::OptionalError; use anyhow::bail; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::OptionalExtension; @@ -44,7 +43,6 @@ impl DataStore { #[error("Serde error during Volume creation: {0}")] SerdeError(#[from] serde_json::Error), } - type TxnError = TransactionError; // Grab all the targets that the volume construction request references. // Do this outside the transaction, as the data inside volume doesn't @@ -66,86 +64,91 @@ impl DataStore { crucible_targets }; - self.pool_connection_unauthorized() - .await? - .transaction_async(|conn| async move { - let maybe_volume: Option = dsl::volume - .filter(dsl::id.eq(volume.id())) - .select(Volume::as_select()) - .first_async(&conn) - .await - .optional() - .map_err(|e| { - TxnError::CustomError(VolumeCreationError::Public( - public_error_from_diesel(e, ErrorHandler::Server), - )) - })?; - - // If the volume existed already, return it and do not increase - // usage counts. - if let Some(volume) = maybe_volume { - return Ok(volume); - } - - // TODO do we need on_conflict do_nothing here? if the transaction - // model is read-committed, the SELECT above could return nothing, - // and the INSERT here could still result in a conflict. - // - // See also https://github.com/oxidecomputer/omicron/issues/1168 - let volume: Volume = diesel::insert_into(dsl::volume) - .values(volume.clone()) - .on_conflict(dsl::id) - .do_nothing() - .returning(Volume::as_returning()) - .get_result_async(&conn) - .await - .map_err(|e| { - TxnError::CustomError(VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Volume, - volume.id().to_string().as_str(), - ), - ), - )) - })?; + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("volume_create") + .transaction(&conn, |conn| { + let err = err.clone(); + let crucible_targets = crucible_targets.clone(); + let volume = volume.clone(); + async move { + let maybe_volume: Option = dsl::volume + .filter(dsl::id.eq(volume.id())) + .select(Volume::as_select()) + .first_async(&conn) + .await + .optional()?; - // Increase the usage count for Crucible resources according to the - // contents of the volume. + // If the volume existed already, return it and do not increase + // usage counts. + if let Some(volume) = maybe_volume { + return Ok(volume); + } - // Increase the number of uses for each referenced region snapshot. - use db::schema::region_snapshot::dsl as rs_dsl; - for read_only_target in &crucible_targets.read_only_targets { - diesel::update(rs_dsl::region_snapshot) - .filter( - rs_dsl::snapshot_addr.eq(read_only_target.clone()), - ) - .filter(rs_dsl::deleting.eq(false)) - .set( - rs_dsl::volume_references - .eq(rs_dsl::volume_references + 1), - ) - .execute_async(&conn) + // TODO do we need on_conflict do_nothing here? if the transaction + // model is read-committed, the SELECT above could return nothing, + // and the INSERT here could still result in a conflict. + // + // See also https://github.com/oxidecomputer/omicron/issues/1168 + let volume: Volume = diesel::insert_into(dsl::volume) + .values(volume.clone()) + .on_conflict(dsl::id) + .do_nothing() + .returning(Volume::as_returning()) + .get_result_async(&conn) .await .map_err(|e| { - TxnError::CustomError(VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ), - )) + err.bail_retryable_or_else(e, |e| { + VolumeCreationError::Public( + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Volume, + volume.id().to_string().as_str(), + ), + ), + ) + }) })?; - } - Ok(volume) + // Increase the usage count for Crucible resources according to the + // contents of the volume. + + // Increase the number of uses for each referenced region snapshot. + use db::schema::region_snapshot::dsl as rs_dsl; + for read_only_target in &crucible_targets.read_only_targets + { + diesel::update(rs_dsl::region_snapshot) + .filter( + rs_dsl::snapshot_addr + .eq(read_only_target.clone()), + ) + .filter(rs_dsl::deleting.eq(false)) + .set( + rs_dsl::volume_references + .eq(rs_dsl::volume_references + 1), + ) + .execute_async(&conn) + .await?; + } + + Ok(volume) + } }) .await - .map_err(|e| match e { - TxnError::CustomError(VolumeCreationError::Public(e)) => e, - - _ => { - Error::internal_error(&format!("Transaction error: {}", e)) + .map_err(|e| { + if let Some(err) = err.take() { + match err { + VolumeCreationError::Public(err) => err, + VolumeCreationError::SerdeError(err) => { + Error::internal_error(&format!( + "Transaction error: {}", + err + )) + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) } }) } @@ -192,16 +195,12 @@ impl DataStore { #[derive(Debug, thiserror::Error)] enum VolumeGetError { - #[error("Error during volume_checkout: {0}")] - DieselError(#[from] diesel::result::Error), - #[error("Serde error during volume_checkout: {0}")] SerdeError(#[from] serde_json::Error), #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), } - type TxnError = TransactionError; // We perform a transaction here, to be sure that on completion // of this, the database contains an updated version of the @@ -209,141 +208,141 @@ impl DataStore { // types that require it). The generation number (along with the // rest of the volume data) that was in the database is what is // returned to the caller. - self.pool_connection_unauthorized() - .await? - .transaction_async(|conn| async move { - // Grab the volume in question. - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await?; - - // Turn the volume.data into the VolumeConstructionRequest - let vcr: VolumeConstructionRequest = - serde_json::from_str(volume.data()).map_err(|e| { - TxnError::CustomError(VolumeGetError::SerdeError(e)) - })?; - - // Look to see if the VCR is a Volume type, and if so, look at - // its sub_volumes. If they are of type Region, then we need - // to update their generation numbers and record that update - // back to the database. We return to the caller whatever the - // original volume data was we pulled from the database. - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - let mut update_needed = false; - let mut new_sv = Vec::new(); - for sv in sub_volumes { - match sv { - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen, - } => { - update_needed = true; - new_sv.push( - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen: gen + 1, - }, - ); - } - _ => { - new_sv.push(sv); + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + + self.transaction_retry_wrapper("volume_checkout") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // Grab the volume in question. + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(&conn) + .await?; + + // Turn the volume.data into the VolumeConstructionRequest + let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()).map_err(|e| { + err.bail(VolumeGetError::SerdeError(e)) + })?; + + // Look to see if the VCR is a Volume type, and if so, look at + // its sub_volumes. If they are of type Region, then we need + // to update their generation numbers and record that update + // back to the database. We return to the caller whatever the + // original volume data was we pulled from the database. + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + let mut update_needed = false; + let mut new_sv = Vec::new(); + for sv in sub_volumes { + match sv { + VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen, + } => { + update_needed = true; + new_sv.push( + VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen: gen + 1, + }, + ); + } + _ => { + new_sv.push(sv); + } } } - } - // Only update the volume data if we found the type - // of volume that needed it. - if update_needed { - // Create a new VCR and fill in the contents - // from what the original volume had, but with our - // updated sub_volume records. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes: new_sv, - read_only_parent, - }; - - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - TxnError::CustomError( - VolumeGetError::SerdeError(e), - ) - })?; + // Only update the volume data if we found the type + // of volume that needed it. + if update_needed { + // Create a new VCR and fill in the contents + // from what the original volume had, but with our + // updated sub_volume records. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes: new_sv, + read_only_parent, + }; - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; + let new_volume_data = serde_json::to_string( + &new_vcr, + ) + .map_err(|e| { + err.bail(VolumeGetError::SerdeError(e)) + })?; - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(TxnError::CustomError( - VolumeGetError::UnexpectedDatabaseUpdate( - num_updated, - 1, - ), - )); + // Update the original volume_id with the new + // volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(&conn) + .await?; + + // This should update just one row. If it does + // not, then something is terribly wrong in the + // database. + if num_updated != 1 { + return Err(err.bail( + VolumeGetError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + )); + } } } + VolumeConstructionRequest::Region { + block_size: _, + blocks_per_extent: _, + extent_count: _, + opts: _, + gen: _, + } => { + // We don't support a pure Region VCR at the volume + // level in the database, so this choice should + // never be encountered, but I want to know if it is. + panic!("Region not supported as a top level volume"); + } + VolumeConstructionRequest::File { + id: _, + block_size: _, + path: _, + } + | VolumeConstructionRequest::Url { + id: _, + block_size: _, + url: _, + } => {} } - VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _, - } => { - // We don't support a pure Region VCR at the volume - // level in the database, so this choice should - // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); - } - VolumeConstructionRequest::File { - id: _, - block_size: _, - path: _, - } - | VolumeConstructionRequest::Url { - id: _, - block_size: _, - url: _, - } => {} + Ok(volume) } - Ok(volume) }) .await - .map_err(|e| match e { - TxnError::CustomError(VolumeGetError::DieselError(e)) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - - _ => { - Error::internal_error(&format!("Transaction error: {}", e)) + .map_err(|e| { + if let Some(err) = err.take() { + return Error::internal_error(&format!("Transaction error: {}", err)); } + public_error_from_diesel(e, ErrorHandler::Server) }) } @@ -638,16 +637,12 @@ impl DataStore { ) -> Result { #[derive(Debug, thiserror::Error)] enum RemoveReadOnlyParentError { - #[error("Error removing read only parent: {0}")] - DieselError(#[from] diesel::result::Error), - #[error("Serde error removing read only parent: {0}")] SerdeError(#[from] serde_json::Error), #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), } - type TxnError = TransactionError; // In this single transaction: // - Get the given volume from the volume_id from the database @@ -663,170 +658,160 @@ impl DataStore { // data from original volume_id. // - Put the new temp VCR into the temp volume.data, update the // temp_volume in the database. - self.pool_connection_unauthorized() - .await? - .transaction_async(|conn| async move { - // Grab the volume in question. If the volume record was already - // deleted then we can just return. - let volume = { - use db::schema::volume::dsl; - - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await - .optional()?; - - let volume = if let Some(v) = volume { - v - } else { - // the volume does not exist, nothing to do. - return Ok(false); + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("volume_remove_rop") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // Grab the volume in question. If the volume record was already + // deleted then we can just return. + let volume = { + use db::schema::volume::dsl; + + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(&conn) + .await + .optional()?; + + let volume = if let Some(v) = volume { + v + } else { + // the volume does not exist, nothing to do. + return Ok(false); + }; + + if volume.time_deleted.is_some() { + // this volume is deleted, so let whatever is deleting + // it clean it up. + return Ok(false); + } else { + // A volume record exists, and was not deleted, we + // can attempt to remove its read_only_parent. + volume + } }; - if volume.time_deleted.is_some() { - // this volume is deleted, so let whatever is deleting - // it clean it up. - return Ok(false); - } else { - // A volume record exists, and was not deleted, we - // can attempt to remove its read_only_parent. - volume - } - }; - - // If a read_only_parent exists, remove it from volume_id, and - // attach it to temp_volume_id. - let vcr: VolumeConstructionRequest = - serde_json::from_str( - volume.data() - ) - .map_err(|e| { - TxnError::CustomError( - RemoveReadOnlyParentError::SerdeError( - e, - ), + // If a read_only_parent exists, remove it from volume_id, and + // attach it to temp_volume_id. + let vcr: VolumeConstructionRequest = + serde_json::from_str( + volume.data() ) - })?; - - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - if read_only_parent.is_none() { - // This volume has no read_only_parent - Ok(false) - } else { - // Create a new VCR and fill in the contents - // from what the original volume had. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent: None, - }; - - let new_volume_data = - serde_json::to_string( - &new_vcr + .map_err(|e| { + err.bail( + RemoveReadOnlyParentError::SerdeError( + e, ) - .map_err(|e| { - TxnError::CustomError( - RemoveReadOnlyParentError::SerdeError( - e, - ), - ) - })?; + ) + })?; - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; - - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(TxnError::CustomError( - RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1), - )); - } + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + if read_only_parent.is_none() { + // This volume has no read_only_parent + Ok(false) + } else { + // Create a new VCR and fill in the contents + // from what the original volume had. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent: None, + }; - // Make a new VCR, with the information from - // our temp_volume_id, but the read_only_parent - // from the original volume. - let rop_vcr = VolumeConstructionRequest::Volume { - id: temp_volume_id, - block_size, - sub_volumes: vec![], - read_only_parent, - }; - let rop_volume_data = - serde_json::to_string( - &rop_vcr - ) - .map_err(|e| { - TxnError::CustomError( - RemoveReadOnlyParentError::SerdeError( - e, - ), + let new_volume_data = + serde_json::to_string( + &new_vcr ) - })?; - // Update the temp_volume_id with the volume - // data that contains the read_only_parent. - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(temp_volume_id)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(rop_volume_data)) + .map_err(|e| { + err.bail(RemoveReadOnlyParentError::SerdeError( + e, + )) + })?; + + // Update the original volume_id with the new + // volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) .execute_async(&conn) .await?; - if num_updated != 1 { - return Err(TxnError::CustomError( - RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1), - )); + + // This should update just one row. If it does + // not, then something is terribly wrong in the + // database. + if num_updated != 1 { + return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); + } + + // Make a new VCR, with the information from + // our temp_volume_id, but the read_only_parent + // from the original volume. + let rop_vcr = VolumeConstructionRequest::Volume { + id: temp_volume_id, + block_size, + sub_volumes: vec![], + read_only_parent, + }; + let rop_volume_data = + serde_json::to_string( + &rop_vcr + ) + .map_err(|e| { + err.bail(RemoveReadOnlyParentError::SerdeError( + e, + )) + })?; + // Update the temp_volume_id with the volume + // data that contains the read_only_parent. + let num_updated = + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(temp_volume_id)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(rop_volume_data)) + .execute_async(&conn) + .await?; + if num_updated != 1 { + return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); + } + Ok(true) } - Ok(true) } - } - VolumeConstructionRequest::File { id: _, block_size: _, path: _ } - | VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _ } - | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { - // Volume has a format that does not contain ROPs - Ok(false) + VolumeConstructionRequest::File { id: _, block_size: _, path: _ } + | VolumeConstructionRequest::Region { + block_size: _, + blocks_per_extent: _, + extent_count: _, + opts: _, + gen: _ } + | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { + // Volume has a format that does not contain ROPs + Ok(false) + } } } }) .await - .map_err(|e| match e { - TxnError::CustomError( - RemoveReadOnlyParentError::DieselError(e), - ) => public_error_from_diesel( - e, - ErrorHandler::Server, - ), - - _ => { - Error::internal_error(&format!("Transaction error: {}", e)) + .map_err(|e| { + if let Some(err) = err.take() { + return Error::internal_error(&format!("Transaction error: {}", err)); } + public_error_from_diesel(e, ErrorHandler::Server) }) } } -#[derive(Default, Debug, Serialize, Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct CrucibleTargets { pub read_only_targets: Vec, } diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 6db99465a3..069ce63028 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -12,7 +12,6 @@ use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::fixed_data::vpc::SERVICES_VPC_ID; use crate::db::identity::Resource; use crate::db::model::IncompleteVpc; @@ -37,7 +36,7 @@ use crate::db::queries::vpc::InsertVpcQuery; use crate::db::queries::vpc::VniSearchIter; use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; use crate::db::queries::vpc_subnet::SubnetError; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -580,53 +579,65 @@ impl DataStore { .set(dsl::time_deleted.eq(now)); let rules_is_empty = rules.is_empty(); - let insert_new_query = Vpc::insert_resource( - authz_vpc.id(), - diesel::insert_into(dsl::vpc_firewall_rule).values(rules), - ); - #[derive(Debug)] enum FirewallUpdateError { CollectionNotFound, } - type TxnError = TransactionError; + + let err = OptionalError::new(); // TODO-scalability: Ideally this would be a CTE so we don't need to // hold a transaction open across multiple roundtrips from the database, // but for now we're using a transaction due to the severely decreased // legibility of CTEs via diesel right now. - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - delete_old_query.execute_async(&conn).await?; - - // The generation count update on the vpc table row will take a - // write lock on the row, ensuring that the vpc was not deleted - // concurently. - if rules_is_empty { - return Ok(vec![]); - } - insert_new_query + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("vpc_update_firewall_rules") + .transaction(&conn, |conn| { + let err = err.clone(); + let delete_old_query = delete_old_query.clone(); + let rules = rules.clone(); + async move { + delete_old_query.execute_async(&conn).await?; + + // The generation count update on the vpc table row will take a + // write lock on the row, ensuring that the vpc was not deleted + // concurently. + if rules_is_empty { + return Ok(vec![]); + } + Vpc::insert_resource( + authz_vpc.id(), + diesel::insert_into(dsl::vpc_firewall_rule) + .values(rules), + ) .insert_and_get_results_async(&conn) .await .map_err(|e| match e { AsyncInsertError::CollectionNotFound => { - TxnError::CustomError( - FirewallUpdateError::CollectionNotFound, - ) + err.bail(FirewallUpdateError::CollectionNotFound) } - AsyncInsertError::DatabaseError(e) => e.into(), + AsyncInsertError::DatabaseError(e) => e, }) + } }) .await - .map_err(|e| match e { - TxnError::CustomError( - FirewallUpdateError::CollectionNotFound, - ) => Error::not_found_by_id(ResourceType::Vpc, &authz_vpc.id()), - TxnError::Database(e) => public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_vpc), - ), + .map_err(|e| { + if let Some(err) = err.take() { + match err { + FirewallUpdateError::CollectionNotFound => { + Error::not_found_by_id( + ResourceType::Vpc, + &authz_vpc.id(), + ) + } + } + } else { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByResource(authz_vpc), + ) + } }) } diff --git a/nexus/db-queries/src/db/error.rs b/nexus/db-queries/src/db/error.rs index cbe2b0a71f..fc7f30da93 100644 --- a/nexus/db-queries/src/db/error.rs +++ b/nexus/db-queries/src/db/error.rs @@ -17,7 +17,7 @@ pub enum TransactionError { /// The customizable error type. /// /// This error should be used for all non-Diesel transaction failures. - #[error("Custom transaction error; {0}")] + #[error("Custom transaction error: {0}")] CustomError(T), /// The Diesel error type. @@ -28,31 +28,61 @@ pub enum TransactionError { Database(#[from] DieselError), } +pub fn retryable(error: &DieselError) -> bool { + match error { + DieselError::DatabaseError(kind, boxed_error_information) => match kind + { + DieselErrorKind::SerializationFailure => { + return boxed_error_information + .message() + .starts_with("restart transaction"); + } + _ => false, + }, + _ => false, + } +} + +/// Identifies if the error is retryable or not. +pub enum MaybeRetryable { + /// The error isn't retryable. + NotRetryable(T), + /// The error is retryable. + Retryable(DieselError), +} + +impl TransactionError { + /// Identifies that the error could be returned from a Diesel transaction. + /// + /// Allows callers to propagate arbitrary errors out of transaction contexts + /// without losing information that might be valuable to the calling context, + /// such as "does this particular error indicate that the entire transaction + /// should retry?". + pub fn retryable(self) -> MaybeRetryable { + use MaybeRetryable::*; + + match self { + TransactionError::Database(err) if retryable(&err) => { + Retryable(err) + } + _ => NotRetryable(self), + } + } +} + impl From for TransactionError { fn from(err: PublicError) -> Self { TransactionError::CustomError(err) } } -impl TransactionError { - /// Based on [the CRDB][1] docs, return true if this transaction must be - /// retried. - /// - /// [1]: https://www.cockroachlabs.com/docs/v23.1/transaction-retry-error-reference#client-side-retry-handling - pub fn retry_transaction(&self) -> bool { - match &self { - Self::Database(DieselError::DatabaseError( - kind, - boxed_error_information, - )) => match kind { - DieselErrorKind::SerializationFailure => { - return boxed_error_information - .message() - .starts_with("restart transaction"); - } - _ => false, - }, - _ => false, +impl From> for PublicError { + fn from(err: TransactionError) -> Self { + match err { + TransactionError::CustomError(err) => err, + TransactionError::Database(err) => { + public_error_from_diesel(err, ErrorHandler::Server) + } } } } diff --git a/nexus/db-queries/src/db/mod.rs b/nexus/db-queries/src/db/mod.rs index b7c7079b54..e6b8743e94 100644 --- a/nexus/db-queries/src/db/mod.rs +++ b/nexus/db-queries/src/db/mod.rs @@ -17,7 +17,7 @@ mod config; mod cte_utils; // This is marked public for use by the integration tests pub mod datastore; -mod error; +pub(crate) mod error; mod explain; pub mod fixed_data; pub mod lookup; @@ -42,7 +42,7 @@ pub use nexus_db_model::schema; pub use crate::db::error::TransactionError; pub use config::Config; pub use datastore::DataStore; -pub use pool::Pool; +pub use pool::{DbConnection, Pool}; pub use saga_recovery::{recover, CompletionTask, RecoveryTask}; pub use saga_types::SecId; pub use sec_store::CockroachDbSecStore; diff --git a/nexus/db-queries/src/db/queries/network_interface.rs b/nexus/db-queries/src/db/queries/network_interface.rs index 84a81a7b7a..1dbe57da6f 100644 --- a/nexus/db-queries/src/db/queries/network_interface.rs +++ b/nexus/db-queries/src/db/queries/network_interface.rs @@ -5,6 +5,7 @@ //! Queries for inserting and deleting network interfaces. use crate::db; +use crate::db::error::{public_error_from_diesel, retryable, ErrorHandler}; use crate::db::model::IncompleteNetworkInterface; use crate::db::pool::DbConnection; use crate::db::queries::next_item::DefaultShiftGenerator; @@ -120,6 +121,8 @@ pub enum InsertError { InstanceMustBeStopped(Uuid), /// The instance does not exist at all, or is in the destroyed state. InstanceNotFound(Uuid), + /// The operation occurred within a transaction, and is retryable + Retryable(DieselError), /// Any other error External(external::Error), } @@ -135,7 +138,6 @@ impl InsertError { e: DieselError, interface: &IncompleteNetworkInterface, ) -> Self { - use crate::db::error; match e { // Catch the specific errors designed to communicate the failures we // want to distinguish @@ -143,9 +145,9 @@ impl InsertError { decode_database_error(e, interface) } // Any other error at all is a bug - _ => InsertError::External(error::public_error_from_diesel( + _ => InsertError::External(public_error_from_diesel( e, - error::ErrorHandler::Server, + ErrorHandler::Server, )), } } @@ -209,6 +211,9 @@ impl InsertError { InsertError::InstanceNotFound(id) => { external::Error::not_found_by_id(external::ResourceType::Instance, &id) } + InsertError::Retryable(err) => { + public_error_from_diesel(err, ErrorHandler::Server) + } InsertError::External(e) => e, } } @@ -290,6 +295,10 @@ fn decode_database_error( r#"uuid: incorrect UUID length: non-unique-subnets"#, ); + if retryable(&err) { + return InsertError::Retryable(err); + } + match err { // If the address allocation subquery fails, we'll attempt to insert // NULL for the `ip` column. This checks that the non-NULL constraint on diff --git a/nexus/db-queries/src/lib.rs b/nexus/db-queries/src/lib.rs index a693f7ff42..5d1927ebc7 100644 --- a/nexus/db-queries/src/lib.rs +++ b/nexus/db-queries/src/lib.rs @@ -9,6 +9,7 @@ pub mod authz; pub mod context; pub mod db; pub mod provisioning; +pub mod transaction_retry; #[macro_use] extern crate slog; diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs new file mode 100644 index 0000000000..c474b729f8 --- /dev/null +++ b/nexus/db-queries/src/transaction_retry.rs @@ -0,0 +1,341 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helper types for performing automatic transaction retries + +use async_bb8_diesel::AsyncConnection; +use chrono::Utc; +use diesel::result::Error as DieselError; +use oximeter::{types::Sample, Metric, MetricsError, Target}; +use rand::{thread_rng, Rng}; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +// Identifies "which" transaction is retrying +#[derive(Debug, Clone, Target)] +struct DatabaseTransaction { + name: String, +} + +// Identifies that a retry has occurred, and track how long +// the transaction took (either since starting, or since the last +// retry failure was recorded). +#[derive(Debug, Clone, Metric)] +struct RetryData { + #[datum] + latency: f64, + attempt: u32, +} + +// Collects all transaction retry samples +#[derive(Debug, Default, Clone)] +pub(crate) struct Producer { + samples: Arc>>, +} + +impl Producer { + pub(crate) fn new() -> Self { + Self { samples: Arc::new(Mutex::new(vec![])) } + } + + fn append( + &self, + transaction: &DatabaseTransaction, + data: &RetryData, + ) -> Result<(), MetricsError> { + let sample = Sample::new_with_timestamp(Utc::now(), transaction, data)?; + self.samples.lock().unwrap().push(sample); + Ok(()) + } +} + +struct RetryHelperInner { + start: chrono::DateTime, + attempts: u32, +} + +impl RetryHelperInner { + fn new() -> Self { + Self { start: Utc::now(), attempts: 1 } + } + + fn tick(&mut self) -> Self { + let start = self.start; + let attempts = self.attempts; + + self.start = Utc::now(); + self.attempts += 1; + + Self { start, attempts } + } +} + +/// Helper utility for tracking retry attempts and latency. +/// Intended to be used from within "transaction_async_with_retry". +pub struct RetryHelper { + producer: Producer, + name: &'static str, + inner: Mutex, +} + +const MIN_RETRY_BACKOFF: Duration = Duration::from_millis(0); +const MAX_RETRY_BACKOFF: Duration = Duration::from_millis(50); +const MAX_RETRY_ATTEMPTS: u32 = 10; + +impl RetryHelper { + /// Creates a new RetryHelper, and starts a timer tracking the transaction + /// duration. + pub(crate) fn new(producer: &Producer, name: &'static str) -> Self { + Self { + producer: producer.clone(), + name, + inner: Mutex::new(RetryHelperInner::new()), + } + } + + /// Calls the function "f" in an asynchronous, retryable transaction. + pub async fn transaction( + self, + conn: &async_bb8_diesel::Connection, + f: Func, + ) -> Result + where + R: Send + 'static, + Fut: std::future::Future> + Send, + Func: Fn(async_bb8_diesel::Connection) -> Fut + + Send + + Sync, + { + conn.transaction_async_with_retry(f, self.as_callback()).await + } + + // Called upon retryable transaction failure. + // + // This function: + // - Appends a metric identifying the duration of the transaction operation + // - Performs a random (uniform) backoff (limited to less than 50 ms) + // - Returns "true" if the transaction should be restarted + async fn retry_callback(&self) -> bool { + // Look at the current attempt and start time so we can log this + // information before we start sleeping. + let (start, attempt) = { + let inner = self.inner.lock().unwrap(); + (inner.start, inner.attempts) + }; + + let latency = (Utc::now() - start) + .to_std() + .unwrap_or(Duration::ZERO) + .as_secs_f64(); + + let _ = self.producer.append( + &DatabaseTransaction { name: self.name.into() }, + &RetryData { latency, attempt }, + ); + + // This backoff is not exponential, but I'm not sure we actually want + // that much backoff here. If we're repeatedly failing, it would + // probably be better to fail the operation, at which point Oximeter + // will keep track of the failing transaction and identify that it's a + // high-priority target for CTE conversion. + let duration = { + let mut rng = thread_rng(); + rng.gen_range(MIN_RETRY_BACKOFF..MAX_RETRY_BACKOFF) + }; + tokio::time::sleep(duration).await; + + // Now that we've finished sleeping, reset the timer and bump the number + // of attempts we've tried. + let inner = self.inner.lock().unwrap().tick(); + return inner.attempts < MAX_RETRY_ATTEMPTS; + } + + /// Converts this function to a retryable callback that can be used from + /// "transaction_async_with_retry". + pub(crate) fn as_callback( + self, + ) -> impl Fn() -> futures::future::BoxFuture<'static, bool> { + let r = Arc::new(self); + move || { + let r = r.clone(); + Box::pin(async move { r.retry_callback().await }) + } + } +} + +impl oximeter::Producer for Producer { + fn produce( + &mut self, + ) -> Result + 'static>, MetricsError> { + let samples = std::mem::take(&mut *self.samples.lock().unwrap()); + Ok(Box::new(samples.into_iter())) + } +} + +/// Helper utility for passing non-retryable errors out-of-band from +/// transactions. +/// +/// Transactions prefer to act on the `diesel::result::Error` type, +/// but transaction users may want more meaningful error types. +/// This utility helps callers safely propagate back Diesel errors while +/// retaining auxiliary error info. +pub struct OptionalError(Arc>>); + +impl Clone for OptionalError { + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +impl OptionalError { + pub fn new() -> Self { + Self(Arc::new(Mutex::new(None))) + } + + /// Sets "Self" to the value of `error` and returns `DieselError::RollbackTransaction`. + pub fn bail(&self, err: E) -> DieselError { + (*self.0.lock().unwrap()).replace(err); + DieselError::RollbackTransaction + } + + /// If `diesel_error` is retryable, returns it without setting Self. + /// + /// Otherwise, sets "Self" to the value of `err`, and returns + /// `DieselError::RollbackTransaction`. + pub fn bail_retryable_or( + &self, + diesel_error: DieselError, + err: E, + ) -> DieselError { + self.bail_retryable_or_else(diesel_error, |_diesel_error| err) + } + + /// If `diesel_error` is retryable, returns it without setting Self. + /// + /// Otherwise, sets "Self" to the value of `f` applied to `diesel_err`, and + /// returns `DieselError::RollbackTransaction`. + pub fn bail_retryable_or_else( + &self, + diesel_error: DieselError, + f: F, + ) -> DieselError + where + F: FnOnce(DieselError) -> E, + { + if crate::db::error::retryable(&diesel_error) { + return diesel_error; + } else { + self.bail(f(diesel_error)) + } + } + + /// If "Self" was previously set to a non-retryable error, return it. + pub fn take(self) -> Option { + (*self.0.lock().unwrap()).take() + } +} + +#[cfg(test)] +mod test { + use super::*; + + use crate::db::datastore::datastore_test; + use nexus_test_utils::db::test_setup_database; + use omicron_test_utils::dev; + use oximeter::types::FieldValue; + + // If a transaction is explicitly rolled back, we should not expect any + // samples to be taken. With no retries, this is just a normal operation + // failure. + #[tokio::test] + async fn test_transaction_rollback_produces_no_samples() { + let logctx = dev::test_setup_log( + "test_transaction_rollback_produces_no_samples", + ); + let mut db = test_setup_database(&logctx.log).await; + let (_opctx, datastore) = datastore_test(&logctx, &db).await; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + datastore + .transaction_retry_wrapper( + "test_transaction_rollback_produces_no_samples", + ) + .transaction(&conn, |_conn| async move { + Err::<(), _>(diesel::result::Error::RollbackTransaction) + }) + .await + .expect_err("Should have failed"); + + let samples = datastore + .transaction_retry_producer() + .samples + .lock() + .unwrap() + .clone(); + assert_eq!(samples, vec![]); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // If a transaction fails with a retryable error, we record samples, + // providing oximeter-level information about the attempts. + #[tokio::test] + async fn test_transaction_retry_produces_samples() { + let logctx = + dev::test_setup_log("test_transaction_retry_produces_samples"); + let mut db = test_setup_database(&logctx.log).await; + let (_opctx, datastore) = datastore_test(&logctx, &db).await; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore + .transaction_retry_wrapper( + "test_transaction_retry_produces_samples", + ) + .transaction(&conn, |_conn| async move { + Err::<(), _>(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + Box::new("restart transaction: Retry forever!".to_string()), + )) + }) + .await + .expect_err("Should have failed"); + + let samples = datastore + .transaction_retry_producer() + .samples + .lock() + .unwrap() + .clone(); + assert_eq!(samples.len(), MAX_RETRY_ATTEMPTS as usize); + + for i in 0..samples.len() { + let sample = &samples[i]; + + assert_eq!( + sample.timeseries_name, + "database_transaction:retry_data" + ); + + let target_fields = sample.sorted_target_fields(); + assert_eq!( + target_fields["name"].value, + FieldValue::String( + "test_transaction_retry_produces_samples".to_string() + ) + ); + + // Attempts are one-indexed + let metric_fields = sample.sorted_metric_fields(); + assert_eq!( + metric_fields["attempt"].value, + FieldValue::U32(u32::try_from(i).unwrap() + 1), + ); + } + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} diff --git a/nexus/src/app/background/dns_config.rs b/nexus/src/app/background/dns_config.rs index 654e9c0bf1..805ae813fe 100644 --- a/nexus/src/app/background/dns_config.rs +++ b/nexus/src/app/background/dns_config.rs @@ -166,7 +166,6 @@ mod test { use crate::app::background::init::test::read_internal_dns_zone_id; use crate::app::background::init::test::write_test_dns_generation; use assert_matches::assert_matches; - use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use async_bb8_diesel::AsyncSimpleConnection; use diesel::ExpressionMethods; @@ -237,11 +236,11 @@ mod test { ); // Similarly, wipe all of the state and verify that we handle that okay. + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper("dns_config_test_basic") + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) @@ -265,7 +264,7 @@ mod test { .execute_async(&conn) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>(()) + Ok(()) }) .await .unwrap(); diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index cfa023a013..d30d2162c4 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -247,14 +247,12 @@ fn init_dns( #[cfg(test)] pub mod test { - use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use dropshot::HandlerTaskMode; use nexus_db_model::DnsGroup; use nexus_db_model::Generation; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; - use nexus_db_queries::db::TransactionError; use nexus_test_utils_macros::nexus_test; use nexus_types::internal_api::params as nexus_params; use nexus_types::internal_api::params::ServiceKind; @@ -446,11 +444,11 @@ pub mod test { datastore: &DataStore, internal_dns_zone_id: Uuid, ) { - type TxnError = TransactionError<()>; { let conn = datastore.pool_connection_for_tests().await.unwrap(); - let _: Result<(), TxnError> = conn - .transaction_async(|conn| async move { + let _: Result<(), _> = datastore + .transaction_retry_wrapper("write_test_dns_generation") + .transaction(&conn, |conn| async move { { use nexus_db_queries::db::model::DnsVersion; use nexus_db_queries::db::schema::dns_version::dsl; diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index fe403a7d41..4883afaddc 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -830,9 +830,7 @@ pub(crate) mod test { app::saga::create_saga_dag, app::sagas::disk_create::Params, app::sagas::disk_create::SagaDiskCreate, external_api::params, }; - use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -972,27 +970,25 @@ pub(crate) mod test { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper( + "no_virtual_provisioning_collection_records_using_storage", + ) + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::virtual_provisioning_collection - .filter(dsl::virtual_disk_bytes_provisioned.ne(0)) - .select(VirtualProvisioningCollection::as_select()) - .get_results_async::( - &conn, - ) - .await - .unwrap() - .is_empty(), - ) + Ok(dsl::virtual_provisioning_collection + .filter(dsl::virtual_disk_bytes_provisioned.ne(0)) + .select(VirtualProvisioningCollection::as_select()) + .get_results_async::(&conn) + .await + .unwrap() + .is_empty()) }) .await .unwrap() diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 153e0323e7..8c2f96c36c 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -866,9 +866,7 @@ pub mod test { app::sagas::instance_create::SagaInstanceCreate, app::sagas::test_helpers, external_api::params, }; - use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ BoolExpressionMethods, ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, @@ -1013,30 +1011,28 @@ pub mod test { use nexus_db_queries::db::model::SledResource; use nexus_db_queries::db::schema::sled_resource::dsl; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper( + "no_sled_resource_instance_records_exist", + ) + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::sled_resource - .filter( - dsl::kind.eq( - nexus_db_queries::db::model::SledResourceKind::Instance, - ), - ) - .select(SledResource::as_select()) - .get_results_async::(&conn) - .await - .unwrap() - .is_empty(), - ) + Ok(dsl::sled_resource + .filter(dsl::kind.eq( + nexus_db_queries::db::model::SledResourceKind::Instance, + )) + .select(SledResource::as_select()) + .get_results_async::(&conn) + .await + .unwrap() + .is_empty()) }) .await .unwrap() @@ -1048,16 +1044,17 @@ pub mod test { use nexus_db_queries::db::model::VirtualProvisioningResource; use nexus_db_queries::db::schema::virtual_provisioning_resource::dsl; - datastore.pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + datastore + .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist") + .transaction(&conn, |conn| async move { conn .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( + Ok( dsl::virtual_provisioning_resource .filter(dsl::resource_type.eq(nexus_db_queries::db::model::ResourceTypeProvisioned::Instance.to_string())) .select(VirtualProvisioningResource::as_select()) @@ -1075,31 +1072,29 @@ pub mod test { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper( + "no_virtual_provisioning_collection_records_using_instances", + ) + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::virtual_provisioning_collection - .filter( - dsl::cpus_provisioned - .ne(0) - .or(dsl::ram_provisioned.ne(0)), - ) - .select(VirtualProvisioningCollection::as_select()) - .get_results_async::( - &conn, - ) - .await - .unwrap() - .is_empty(), - ) + Ok(dsl::virtual_provisioning_collection + .filter( + dsl::cpus_provisioned + .ne(0) + .or(dsl::ram_provisioned.ne(0)), + ) + .select(VirtualProvisioningCollection::as_select()) + .get_results_async::(&conn) + .await + .unwrap() + .is_empty()) }) .await .unwrap() diff --git a/nexus/src/app/sagas/project_create.rs b/nexus/src/app/sagas/project_create.rs index 135e20ff06..40acc822c0 100644 --- a/nexus/src/app/sagas/project_create.rs +++ b/nexus/src/app/sagas/project_create.rs @@ -157,9 +157,7 @@ mod test { app::saga::create_saga_dag, app::sagas::project_create::Params, app::sagas::project_create::SagaProjectCreate, external_api::params, }; - use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -233,15 +231,16 @@ mod test { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; - datastore.pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + datastore + .transaction_retry_wrapper("no_virtual_provisioning_collection_records_for_projects") + .transaction(&conn, |conn| async move { conn .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( + Ok( dsl::virtual_provisioning_collection .filter(dsl::collection_type.eq(nexus_db_queries::db::model::CollectionTypeProvisioned::Project.to_string())) // ignore built-in services project diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index eccb013b66..3110bd318a 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -10,9 +10,7 @@ use crate::{ app::{saga::create_saga_dag, test_interfaces::TestInterfaces as _}, Nexus, }; -use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, -}; +use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use futures::future::BoxFuture; use nexus_db_queries::{ @@ -434,11 +432,10 @@ pub(crate) async fn assert_no_failed_undo_steps( ) { use nexus_db_queries::db::model::saga_types::SagaNodeEvent; + let conn = datastore.pool_connection_for_tests().await.unwrap(); let saga_node_events: Vec = datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper("assert_no_failed_undo_steps") + .transaction(&conn, |conn| async move { use nexus_db_queries::db::schema::saga_node_event::dsl; conn.batch_execute_async( @@ -447,14 +444,12 @@ pub(crate) async fn assert_no_failed_undo_steps( .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::saga_node_event - .filter(dsl::event_type.eq(String::from("undo_failed"))) - .select(SagaNodeEvent::as_select()) - .load_async::(&conn) - .await - .unwrap(), - ) + Ok(dsl::saga_node_event + .filter(dsl::event_type.eq(String::from("undo_failed"))) + .select(SagaNodeEvent::as_select()) + .load_async::(&conn) + .await + .unwrap()) }) .await .unwrap();