From 21d57767f425bffdb58ed5e71fe47614a150e405 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Thu, 28 Dec 2023 18:08:42 +0000 Subject: [PATCH] Working idempotent double attach/detach. This required that we drop the non-null parent constraint on ephemeral IPs, but I think that's worth it in the name of consistency. --- nexus/db-model/src/external_ip.rs | 6 +- nexus/db-model/src/schema.rs | 2 + .../src/db/datastore/external_ip.rs | 266 ++++++++++++++---- .../db-queries/src/db/queries/external_ip.rs | 18 +- nexus/src/app/instance.rs | 4 +- nexus/src/app/instance_network.rs | 25 +- nexus/src/app/sagas/instance_common.rs | 70 ++++- nexus/src/app/sagas/instance_create.rs | 3 + nexus/src/app/sagas/instance_ip_attach.rs | 30 +- nexus/src/app/sagas/instance_ip_detach.rs | 64 +++-- nexus/src/external_api/http_entrypoints.rs | 2 +- schema/crdb/22.0.0/up08.sql | 2 + schema/crdb/22.0.0/up09.sql | 4 + schema/crdb/dbinit.sql | 10 +- 14 files changed, 382 insertions(+), 124 deletions(-) create mode 100644 schema/crdb/22.0.0/up08.sql create mode 100644 schema/crdb/22.0.0/up09.sql diff --git a/nexus/db-model/src/external_ip.rs b/nexus/db-model/src/external_ip.rs index b6f556ab61..d762d0bb4a 100644 --- a/nexus/db-model/src/external_ip.rs +++ b/nexus/db-model/src/external_ip.rs @@ -183,7 +183,7 @@ impl IncompleteExternalIp { } } - pub fn for_ephemeral(id: Uuid, instance_id: Uuid, pool_id: Uuid) -> Self { + pub fn for_ephemeral(id: Uuid, pool_id: Uuid) -> Self { let kind = IpKind::Ephemeral; Self { id, @@ -192,7 +192,7 @@ impl IncompleteExternalIp { time_created: Utc::now(), kind, is_service: false, - parent_id: Some(instance_id), + parent_id: None, pool_id, project_id: None, explicit_ip: None, @@ -402,7 +402,7 @@ impl IpKind { pub fn initial_state(&self) -> IpAttachState { match &self { IpKind::SNat => IpAttachState::Attached, - IpKind::Ephemeral => IpAttachState::Attaching, + IpKind::Ephemeral => IpAttachState::Detached, IpKind::Floating => IpAttachState::Detached, } } diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 7af74036b2..ba12c9d041 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1406,6 +1406,8 @@ allow_tables_to_appear_in_same_query!( allow_tables_to_appear_in_same_query!(dns_zone, dns_version, dns_name); allow_tables_to_appear_in_same_query!(external_ip, service); +allow_tables_to_appear_in_same_query!(external_ip, instance); +joinable!(external_ip -> instance (parent_id)); allow_tables_to_appear_in_same_query!( switch_port, diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index b021de9580..2364cbf341 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -80,13 +80,33 @@ impl DataStore { /// concurrent access. /// Callers must call `external_ip_complete_op` on saga completion to move /// the IP to `Attached`. + /// + /// To better handle idempotent attachment, this method returns an + /// additional bool: + /// - true: EIP was detached or attaching. proceed with saga. + /// - false: EIP was attached. No-op for remainder of saga. pub async fn allocate_instance_ephemeral_ip( &self, opctx: &OpContext, ip_id: Uuid, instance_id: Uuid, pool_name: Option, - ) -> CreateResult { + creating_instance: bool, + ) -> CreateResult<(ExternalIp, bool)> { + use db::schema::external_ip::dsl; + use db::schema::external_ip::table; + use db::schema::instance::dsl as inst_dsl; + use db::schema::instance::table as inst_table; + use diesel::result::DatabaseErrorKind::UniqueViolation; + + // This is slightly hacky: we need to create an unbound ephemeral IP, and + // then attempt to bind it to respect two separate constraints: + // - At most one Ephemeral IP per instance + // - At most MAX external IPs per instance + // We already catch and convert a UniqueViolation on ephemeral IPs: + // if we see this occur, then + // Naturally, we now *need* to destroy the ephemeral IP if the newly alloc'd + // IP was not attached, including on idempotent success. let pool = match pool_name { Some(name) => { let (.., authz_pool, pool) = LookupPath::new(opctx, &self) @@ -115,9 +135,116 @@ impl DataStore { }; let pool_id = pool.identity.id; - let data = - IncompleteExternalIp::for_ephemeral(ip_id, instance_id, pool_id); - self.allocate_external_ip(opctx, data).await + let data = IncompleteExternalIp::for_ephemeral(ip_id, pool_id); + let temp_ip = self.allocate_external_ip(opctx, data).await?; + + let safe_states = if creating_instance { + &SAFE_TO_ATTACH_INSTANCE_STATES_CREATING[..] + } else { + &SAFE_TO_ATTACH_INSTANCE_STATES[..] + }; + + let query = Instance::attach_resource( + instance_id, + temp_ip.id, + inst_table.into_boxed().filter(inst_dsl::state.eq_any(safe_states)), + table + .into_boxed() + .filter(dsl::state.eq(IpAttachState::Detached)) + .filter(dsl::kind.eq(IpKind::Ephemeral)), + MAX_EXTERNAL_IPS_PLUS_SNAT, + diesel::update(dsl::external_ip).set(( + dsl::parent_id.eq(Some(instance_id)), + dsl::time_modified.eq(Utc::now()), + dsl::state.eq(IpAttachState::Attaching), + )), + ); + + let result = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map(Some) + .or_else(|e: AttachError| match e { + AttachError::CollectionNotFound => { + Err(Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + )) + }, + AttachError::ResourceNotFound => { + Err(Error::internal_error("call-scoped ephemeral IP was lost")) + }, + AttachError::NoUpdate { attached_count, resource, collection } => { + match resource.state { + // Idempotent errors: is in progress forsame resource pair -- this is fine. + IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok(Some((collection, resource))), + IpAttachState::Attached => return Err(Error::invalid_request( + "floating IP cannot be attached to one \ + instance while still attached to another" + )), + // User can reattempt depending on how the current saga unfolds. + IpAttachState::Attaching | IpAttachState::Detaching => return Err(Error::unavail( + "tried to attach floating IP mid-attach/detach" + )), + + IpAttachState::Detached => {}, + } + + Err(match &collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( + "tried to attach floating IP while instance was changing state" + ), + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + if attached_count >= MAX_EXTERNAL_IPS_PLUS_SNAT as i64 { + Error::invalid_request(&format!( + "an instance may not have more than {} external IP addresses", + MAX_EXTERNAL_IPS_PER_INSTANCE, + )) + } else { + eprintln!("{resource:?}, {collection:?}"); + Error::internal_error("failed to attach ephemeral IP") + } + }, + state => Error::invalid_request(&format!("cannot attach floating IP to instance in {state} state")), + }) + }, + // This case occurs for both currently attaching and attached IPs: + AttachError::DatabaseError(diesel::result::Error::DatabaseError(UniqueViolation, ..)) => { + Ok(None) + }, + AttachError::DatabaseError(e) => { + Err(public_error_from_diesel(e, ErrorHandler::Server)) + }, + }); + + // if completed (!do_saga), we need to attempt + + match result { + Err(e) => { + self.deallocate_external_ip(opctx, temp_ip.id).await?; + Err(e) + } + // Idempotent cases: + Ok(Some((_, eip))) if eip.id != temp_ip.id => { + // Is this even possible? + eprintln!("mismatch?"); + self.deallocate_external_ip(opctx, temp_ip.id).await?; + Ok((eip, true)) + } + Ok(None) => { + self.deallocate_external_ip(opctx, temp_ip.id).await?; + let eip = self + .instance_lookup_external_ips(opctx, instance_id) + .await? + .into_iter() + .find(|v| v.kind == IpKind::Ephemeral) + .ok_or_else(|| Error::internal_error("hmm"))?; + Ok((eip, false)) + } + Ok(Some((_, eip))) => { + eprintln!(""); + Ok((eip, true)) + } + } } /// Allocates an IP address for internal service usage. @@ -259,8 +386,8 @@ impl DataStore { ) } } - // Floating IP: name conflict - DatabaseError(UniqueViolation, ..) if name.is_some() => { + // Floating IP: name conflict + DatabaseError(UniqueViolation, ..) => { TransactionError::CustomError(public_error_from_diesel( e, ErrorHandler::Conflict( @@ -271,12 +398,6 @@ impl DataStore { ), )) } - // Ephemeral IP: violated one-per-instance rule. - DatabaseError(UniqueViolation, ..) => { - TransactionError::CustomError(Error::invalid_request( - "instance/service cannot have more than one ephemeral IP" - )) - } _ => { if retryable(&e) { return TransactionError::Database(e); @@ -367,12 +488,17 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Moves an instance's ephemeral IP from 'Attached' to 'Detaching'. + /// + /// To support idempotency, this method will succeed if the instance + /// has no ephemeral IP or one is actively being removed. As a result, + /// information on an actual ExternalIp is best-effort. pub async fn begin_deallocate_ephemeral_ip( &self, opctx: &OpContext, ip_id: Uuid, instance_id: Uuid, - ) -> Result { + ) -> Result, Error> { use db::schema::external_ip::dsl; use db::schema::external_ip::table; use db::schema::instance::dsl as inst_dsl; @@ -401,46 +527,48 @@ impl DataStore { let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await - .map_err(|e: DetachError| match e { - DetachError::CollectionNotFound => { - Error::not_found_by_id( - ResourceType::Instance, - &instance_id, - ) - }, - DetachError::ResourceNotFound => { - Error::invalid_request("instance has no ephemeral IP to detach") - }, - DetachError::NoUpdate { resource, collection } => { - match resource.state { - IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Error::internal_error( - "Ephemeral IP is not attached to the target instance", - ), - // User can reattempt depending on how the current saga unfolds. - IpAttachState::Attaching | IpAttachState::Detaching => return Error::unavail( - "tried to detach ephemeral IP mid-attach/detach" - ), - IpAttachState::Attached => {}, - IpAttachState::Detached => return Error::internal_error( - "Ephemeral IP cannot exist in 'detached' state", - ), - } - - match collection.runtime_state.nexus_state { - state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( - "tried to detach ephemeral IP while instance was changing state" - ), - state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { - Error::internal_error("failed to detach ephemeral IP") + .map(Some) + .or_else(|e: DetachError| { + Err(match e { + DetachError::CollectionNotFound => { + Error::not_found_by_id( + ResourceType::Instance, + &instance_id, + ) }, - state => Error::invalid_request(&format!("cannot attach ephemeral IP to instance in {state} state")), - } - }, - DetachError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - }, - - })?; + DetachError::ResourceNotFound => { + return Ok(None); + }, + DetachError::NoUpdate { resource, collection } => { + match resource.state { + // XXX: internal error? + IpAttachState::Attached if resource.parent_id != Some(instance_id) => return Err(Error::internal_error( + "Ephemeral IP is not attached to the target instance", + )), + IpAttachState::Detaching => return Ok(Some(resource)), + // User can reattempt depending on how the current saga unfolds. + IpAttachState::Attaching => return Err(Error::unavail( + "tried to detach ephemeral IP mid-attach/detach" + )), + IpAttachState::Attached => {}, + IpAttachState::Detached => return Err(Error::internal_error( + "Ephemeral IP cannot exist in 'detached' state", + )), + } + match collection.runtime_state.nexus_state { + state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( + "tried to detach ephemeral IP while instance was changing state" + ), + state if SAFE_TO_ATTACH_INSTANCE_STATES.contains(&state) => { + Error::internal_error("failed to detach ephemeral IP") + }, + state => Error::invalid_request(&format!("cannot attach ephemeral IP to instance in {state} state")), + } + }, + DetachError::DatabaseError(e) => { + public_error_from_diesel(e, ErrorHandler::Server) + }, + })})?; Ok(eip) } @@ -588,13 +716,18 @@ impl DataStore { /// This moves a floating IP into the 'attaching' state. Callers are /// responsible for calling `external_ip_complete_op` to finalise the /// IP in 'attached' state at saga completion. + /// + /// To better handle idempotent attachment, this method returns an + /// additional bool: + /// - true: EIP was detached or attaching. proceed with saga. + /// - false: EIP was attached. No-op for remainder of saga. pub async fn floating_ip_begin_attach( &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, creating_instance: bool, - ) -> UpdateResult { + ) -> UpdateResult<(ExternalIp, bool)> { use db::schema::external_ip::dsl; use db::schema::external_ip::table; use db::schema::instance::dsl as inst_dsl; @@ -633,6 +766,7 @@ impl DataStore { )), ); + let mut do_saga = true; let (_, eip) = query.attach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await .or_else(|e: AttachError| match e { @@ -650,8 +784,12 @@ impl DataStore { }, AttachError::NoUpdate { attached_count, resource, collection } => { match resource.state { - // Idempotent errors: is in progress forsame resource pair -- this is fine. + // Idempotent errors: is in progress or complete for same resource pair -- this is fine. IpAttachState::Attaching if resource.parent_id == Some(instance_id) => return Ok((collection, resource)), + IpAttachState::Attached if resource.parent_id == Some(instance_id) => { + do_saga = false; + return Ok((collection, resource)) + }, IpAttachState::Attached => return Err(Error::invalid_request( "floating IP cannot be attached to one \ instance while still attached to another" @@ -664,7 +802,7 @@ impl DataStore { IpAttachState::Detached => {}, } - Err(match collection.runtime_state.nexus_state { + Err(match &collection.runtime_state.nexus_state { state if SAFE_TRANSIENT_INSTANCE_STATES.contains(&state) => Error::unavail( "tried to attach floating IP while instance was changing state" ), @@ -675,6 +813,7 @@ impl DataStore { MAX_EXTERNAL_IPS_PER_INSTANCE, )) } else { + eprintln!("{resource:?}, {collection:?}"); Error::internal_error("failed to attach floating IP") } }, @@ -687,7 +826,7 @@ impl DataStore { })?; - Ok(eip) + Ok((eip, do_saga)) } /// Detaches a Floating IP address from an instance. @@ -695,13 +834,18 @@ impl DataStore { /// This moves a floating IP into the 'detaching' state. Callers are /// responsible for calling `external_ip_complete_op` to finalise the /// IP in 'detached' state at saga completion. + /// + /// To better handle idempotent detachment, this method returns an + /// additional bool: + /// - true: EIP was attached or detaching. proceed with saga. + /// - false: EIP was detached. No-op for remainder of saga. pub async fn floating_ip_begin_detach( &self, opctx: &OpContext, authz_fip: &authz::FloatingIp, instance_id: Uuid, creating_instance: bool, - ) -> UpdateResult { + ) -> UpdateResult<(ExternalIp, bool)> { use db::schema::external_ip::dsl; use db::schema::external_ip::table; use db::schema::instance::dsl as inst_dsl; @@ -737,6 +881,7 @@ impl DataStore { )), ); + let mut do_saga = true; let eip = query.detach_and_get_result_async(&*self.pool_connection_authorized(opctx).await?) .await .or_else(|e: DetachError| Err(match e { @@ -756,7 +901,10 @@ impl DataStore { let parent_match = resource.parent_id == Some(instance_id); match resource.state { // Idempotent cases: already detached OR detaching from same instance. - IpAttachState::Detached => return Ok(resource), + IpAttachState::Detached => { + do_saga = false; + return Ok(resource) + }, IpAttachState::Detaching if parent_match => return Ok(resource), IpAttachState::Attached if !parent_match => return Err(Error::invalid_request( "Floating IP is not attached to the target instance", @@ -784,7 +932,7 @@ impl DataStore { }))?; - Ok(eip) + Ok((eip, do_saga)) } /// Move an external IP from a transitional state (attaching, detaching) diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 415750165b..0b5eb7c071 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -1065,9 +1065,11 @@ mod tests { Uuid::new_v4(), instance_id, /* pool_name = */ None, + true, ) .await - .expect("Failed to allocate Ephemeral IP when there is space"); + .expect("Failed to allocate Ephemeral IP when there is space") + .0; assert_eq!(ephemeral_ip.ip.ip(), range.last_address()); assert_eq!(ephemeral_ip.first_port.0, 0); assert_eq!(ephemeral_ip.last_port.0, super::MAX_PORT); @@ -1105,6 +1107,7 @@ mod tests { Uuid::new_v4(), instance_id, /* pool_name = */ None, + true, ) .await; assert!( @@ -1250,9 +1253,11 @@ mod tests { id, instance_id, pool_name, + true, ) .await - .expect("Failed to allocate instance ephemeral IP address"); + .expect("Failed to allocate instance ephemeral IP address") + .0; assert_eq!(ip.kind, IpKind::Ephemeral); assert_eq!(ip.ip.ip(), range.first_address()); assert_eq!(ip.first_port.0, 0); @@ -1780,9 +1785,11 @@ mod tests { id, instance_id, pool_name, + true, ) .await - .expect("Failed to allocate instance ephemeral IP address"); + .expect("Failed to allocate instance ephemeral IP address") + .0; assert_eq!(ip.kind, IpKind::Ephemeral); assert_eq!(ip.ip.ip(), second_range.first_address()); assert_eq!(ip.first_port.0, 0); @@ -1823,9 +1830,11 @@ mod tests { Uuid::new_v4(), instance_id, pool_name.clone(), + true, ) .await - .expect("Failed to allocate instance ephemeral IP address"); + .expect("Failed to allocate instance ephemeral IP address") + .0; println!("{ip:#?}"); if let IpAddr::V4(addr) = ip.ip.ip() { assert_eq!(addr.octets()[3], octet); @@ -1842,6 +1851,7 @@ mod tests { Uuid::new_v4(), Uuid::new_v4(), pool_name, + true, ) .await .expect_err("Should not use IP addresses from a different pool"); diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 932e10468e..28aa0dcca0 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1942,7 +1942,7 @@ impl super::Nexus { opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, ext_ip: ¶ms::ExternalIpDelete, - ) -> UpdateResult { + ) -> UpdateResult> { let (.., authz_project, authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; @@ -1960,7 +1960,7 @@ impl super::Nexus { .await?; saga_outputs - .lookup_node_output::("output") + .lookup_node_output::>("output") .map_err(|e| Error::internal_error(&format!("{:#}", &e))) .internal_context("looking up output from ip attach saga") } diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index ca45025b5e..663d112673 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -342,18 +342,19 @@ impl super::Nexus { .instance_lookup_external_ips(&opctx, instance_id) .await?; - let (ips_of_interest, must_all_be_attached) = - if let Some(wanted_id) = ip_filter { - if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { - (std::slice::from_ref(ip), false) - } else { - return Err(Error::internal_error(&format!( - "failed to find external ip address with id: {wanted_id}", - ))); - } + let (ips_of_interest, must_all_be_attached) = if let Some(wanted_id) = + ip_filter + { + if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { + (std::slice::from_ref(ip), false) } else { - (&ips[..], true) - }; + return Err(Error::internal_error(&format!( + "failed to find external ip address with id: {wanted_id}, saw {ips:?}", + ))); + } + } else { + (&ips[..], true) + }; // This is performed so that an IP attach/detach will block the // instance_start saga. Return service unavailable to indicate @@ -482,7 +483,7 @@ impl super::Nexus { std::slice::from_ref(ip) } else { return Err(Error::internal_error(&format!( - "failed to find external ip address with id: {wanted_id}", + "failed to find external ip address with id: {wanted_id}, saw {external_ips:?}", ))); } } else { diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index c94aea8fb3..6fedd8d775 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -151,6 +151,19 @@ pub struct InstanceStateForIp { pub state: InstanceState, } +/// External IP state needed for IP attach/detachment. +/// +/// This holds a record of the mid-processing external IP, where possible. +/// there are cases where this might not be known (e.g., double detach of an +/// ephemeral IP). +/// In particular we need to explicitly no-op if not `do_saga`, to prevent +/// failures borne from instance state changes from knocking out a valid IP binding. +#[derive(Debug, Deserialize, Serialize)] +pub struct ModifyStateForExternalIp { + pub external_ip: Option, + pub do_saga: bool, +} + /// Move an external IP from one state to another as a saga operation, /// returning `Ok(true)` if the record was successfully moved and `Ok(false)` /// if the record was lost. @@ -168,7 +181,16 @@ pub async fn instance_ip_move_state( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let new_ip = sagactx.lookup::("target_ip")?; + let new_ip = sagactx.lookup::("target_ip")?; + + if !new_ip.do_saga { + return Ok(true); + } + let Some(new_ip) = new_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; match datastore .external_ip_complete_op(&opctx, new_ip.id, new_ip.kind, from, to) @@ -265,7 +287,15 @@ pub async fn instance_ip_add_nat( return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; // Querying sleds requires fleet access; use the instance allocator context // for this. @@ -305,7 +335,15 @@ pub async fn instance_ip_remove_nat( return Ok(()); }; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; osagactx .nexus() @@ -329,9 +367,18 @@ pub async fn instance_ip_add_opte( return Ok(()); }; - let new_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; + let sled_agent_body = - new_ip.try_into().map_err(ActionError::action_failed)?; + target_ip.try_into().map_err(ActionError::action_failed)?; osagactx .nexus() @@ -369,9 +416,18 @@ pub async fn instance_ip_remove_opte( return Ok(()); }; - let new_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; + if !target_ip.do_saga { + return Ok(()); + } + let Some(target_ip) = target_ip.external_ip else { + return Err(ActionError::action_failed(Error::internal_error( + "tried to `do_saga` without valid external IP", + ))); + }; + let sled_agent_body = - new_ip.try_into().map_err(ActionError::action_failed)?; + target_ip.try_into().map_err(ActionError::action_failed)?; osagactx .nexus() diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index d921275402..53169308c7 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -634,9 +634,11 @@ async fn sic_allocate_instance_external_ip( ip_id, instance_id, pool_name, + true, ) .await .map_err(ActionError::action_failed)? + .0 } // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpCreate::Floating { ref floating_ip_name } => { @@ -652,6 +654,7 @@ async fn sic_allocate_instance_external_ip( .floating_ip_begin_attach(&opctx, &authz_fip, instance_id, true) .await .map_err(ActionError::action_failed)? + .0 } }; diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 83fb8184dc..3df445b95e 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -5,15 +5,16 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - InstanceStateForIp, + InstanceStateForIp, ModifyStateForExternalIp, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_model::{ExternalIp, IpAttachState}; +use nexus_db_model::IpAttachState; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; +use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -80,7 +81,7 @@ pub struct Params { async fn siia_begin_attach_ip( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; @@ -100,9 +101,14 @@ async fn siia_begin_attach_ip( Uuid::new_v4(), params.authz_instance.id(), pool_name, + false, ) .await .map_err(ActionError::action_failed) + .map(|(external_ip, do_saga)| ModifyStateForExternalIp { + external_ip: Some(external_ip), + do_saga, + }) } // Set the parent of an existing floating IP to the new instance's ID. params::ExternalIpCreate::Floating { ref floating_ip_name } => { @@ -123,6 +129,10 @@ async fn siia_begin_attach_ip( ) .await .map_err(ActionError::action_failed) + .map(|(external_ip, do_saga)| ModifyStateForExternalIp { + external_ip: Some(external_ip), + do_saga, + }) } } } @@ -213,7 +223,7 @@ async fn siia_complete_attach( ) -> Result { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; if !instance_ip_move_state( &sagactx, @@ -229,7 +239,15 @@ async fn siia_complete_attach( ) } - target_ip.try_into().map_err(ActionError::action_failed) + target_ip + .external_ip + .ok_or_else(|| { + Error::internal_error( + "must always have a defined external IP during instance attach", + ) + }) + .and_then(TryInto::try_into) + .map_err(ActionError::action_failed) } #[derive(Debug)] @@ -264,7 +282,7 @@ pub(crate) mod test { ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; use dropshot::test_util::ClientTestContext; - use nexus_db_model::{IpKind, Name}; + use nexus_db_model::{ExternalIp, IpKind, Name}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::{ create_floating_ip, create_instance, create_project, populate_ip_pool, diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 0545bd3c71..d460075f95 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -5,16 +5,15 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - InstanceStateForIp, + InstanceStateForIp, ModifyStateForExternalIp, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; use crate::app::{authn, authz, db}; use crate::external_api::params; -use nexus_db_model::{ExternalIp, IpAttachState, IpKind}; +use nexus_db_model::{IpAttachState, IpKind}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -61,7 +60,7 @@ pub struct Params { async fn siid_begin_detach_ip( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); let datastore = osagactx.datastore(); let params = sagactx.saga_params::()?; @@ -80,23 +79,28 @@ async fn siid_begin_detach_ip( .await .map_err(ActionError::action_failed)?; - let eph_ip = eips - .iter() - .find(|e| e.kind == IpKind::Ephemeral) - .ok_or_else(|| { - ActionError::action_failed(Error::invalid_request( - "instance does not have an attached ephemeral IP address" - )) - })?; - - datastore - .begin_deallocate_ephemeral_ip( - &opctx, - eph_ip.id, - params.authz_instance.id(), - ) - .await - .map_err(ActionError::action_failed) + // XXX: cleanup. + if let Some(eph_ip) = + eips.iter().find(|e| e.kind == IpKind::Ephemeral) + { + datastore + .begin_deallocate_ephemeral_ip( + &opctx, + eph_ip.id, + params.authz_instance.id(), + ) + .await + .map_err(ActionError::action_failed) + .map(|external_ip| ModifyStateForExternalIp { + do_saga: external_ip.is_some(), + external_ip, + }) + } else { + Ok(ModifyStateForExternalIp { + do_saga: false, + external_ip: None, + }) + } } params::ExternalIpDelete::Floating { ref floating_ip_name } => { let floating_ip_name = db::model::Name(floating_ip_name.clone()); @@ -116,6 +120,10 @@ async fn siid_begin_detach_ip( ) .await .map_err(ActionError::action_failed) + .map(|(external_ip, do_saga)| ModifyStateForExternalIp { + external_ip: Some(external_ip), + do_saga, + }) } } } @@ -202,10 +210,10 @@ async fn siid_update_opte_undo( async fn siid_complete_detach( sagactx: NexusActionContext, -) -> Result { +) -> Result, ActionError> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let target_ip = sagactx.lookup::("target_ip")?; + let target_ip = sagactx.lookup::("target_ip")?; if !instance_ip_move_state( &sagactx, @@ -217,11 +225,15 @@ async fn siid_complete_detach( { warn!( log, - "siid_complete_attach: external IP was deleted or call was idempotent" + "siid_complete_detach: external IP was deleted or call was idempotent" ) } - target_ip.try_into().map_err(ActionError::action_failed) + target_ip + .external_ip + .map(TryInto::try_into) + .transpose() + .map_err(ActionError::action_failed) } #[derive(Debug)] @@ -264,7 +276,7 @@ pub(crate) mod test { use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; - use nexus_db_model::Name; + use nexus_db_model::{ExternalIp, Name}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::create_instance; use nexus_test_utils_macros::nexus_test; diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index ea8bbc39c8..8c2d220203 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -3776,7 +3776,7 @@ async fn instance_external_ip_detach( path_params: Path, query_params: Query, ip_to_detach: TypedBody, -) -> Result, HttpError> { +) -> Result>, HttpError> { let apictx = rqctx.context(); let handler = async { let opctx = crate::context::op_context_for_external_api(&rqctx).await?; diff --git a/schema/crdb/22.0.0/up08.sql b/schema/crdb/22.0.0/up08.sql new file mode 100644 index 0000000000..3d85aaad05 --- /dev/null +++ b/schema/crdb/22.0.0/up08.sql @@ -0,0 +1,2 @@ +ALTER TABLE IF EXISTS omicron.public.external_ip +DROP CONSTRAINT IF EXISTS null_non_fip_parent_id; diff --git a/schema/crdb/22.0.0/up09.sql b/schema/crdb/22.0.0/up09.sql new file mode 100644 index 0000000000..bac963cce5 --- /dev/null +++ b/schema/crdb/22.0.0/up09.sql @@ -0,0 +1,4 @@ +ALTER TABLE IF EXISTS omicron.public.external_ip +ADD CONSTRAINT IF NOT EXISTS null_snat_parent_id CHECK ( + (kind != 'snat') OR (parent_id IS NOT NULL) +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 6580d39cf7..a372403b28 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1733,11 +1733,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.external_ip ( ), /* - * Only nullable if this is a floating IP, which may exist not - * attached to any instance or service yet. + * Only nullable if this is a floating/ephemeral IP, which may exist not + * attached to any instance or service yet. Ephemeral IPs should not exist + * without parent instances/services, but need to temporarily exist in this + * state for live attachment. */ - CONSTRAINT null_non_fip_parent_id CHECK ( - (kind != 'floating' AND parent_id is NOT NULL) OR (kind = 'floating') + CONSTRAINT null_snat_parent_id CHECK ( + (kind != 'snat') OR (parent_id IS NOT NULL) ), /* Ephemeral IPs are not supported for services. */