diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index 2e877cc065..83fb8184dc 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -34,6 +34,13 @@ use uuid::Uuid; // sled-agent as temporary errors and unwinding. For the delete case, we // allow the attach/detach completion to have a missing record. // See `instance_common::instance_ip_get_instance_state` for more info. +// +// One more consequence of sled state being able to change beneath us +// is that the central undo actions (DPD/OPTE state) *must* be best-effort. +// This is not bad per-se: instance stop does not itself remove NAT routing +// rules. The only reason these should fail is because an instance has stopped, +// at which point there's no good in e.g. adding another entry to a non-existent +// sled-agent regardless. declare_saga_actions! { instance_ip_attach; @@ -71,9 +78,6 @@ pub struct Params { pub serialized_authn: authn::saga::Serialized, } -// TODO: factor this out for attach, detach, and instance create -// to share an impl. - async fn siia_begin_attach_ip( sagactx: NexusActionContext, ) -> Result { @@ -169,13 +173,17 @@ async fn siia_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { async fn siia_nat_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - instance_ip_remove_nat( + if let Err(e) = instance_ip_remove_nat( &sagactx, ¶ms.serialized_authn, ¶ms.authz_instance, ) - .await?; + .await + { + error!(log, "siia_nat_undo: failed to notify DPD: {e}"); + } Ok(()) } @@ -190,8 +198,13 @@ async fn siia_update_opte( async fn siia_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - instance_ip_remove_opte(&sagactx, ¶ms.authz_instance).await?; + if let Err(e) = + instance_ip_remove_opte(&sagactx, ¶ms.authz_instance).await + { + error!(log, "siia_update_opte_undo: failed to notify sled-agent: {e}"); + } Ok(()) } @@ -219,8 +232,6 @@ async fn siia_complete_attach( target_ip.try_into().map_err(ActionError::action_failed) } -// TODO: backout changes if run state changed illegally? - #[derive(Debug)] pub struct SagaInstanceIpAttach; impl NexusSaga for SagaInstanceIpAttach { @@ -247,11 +258,8 @@ impl NexusSaga for SagaInstanceIpAttach { #[cfg(test)] pub(crate) mod test { use super::*; - use crate::app::{ - saga::create_saga_dag, - sagas::test_helpers::{self, instance_simulate}, - }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use crate::app::{saga::create_saga_dag, sagas::test_helpers}; + use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -259,15 +267,10 @@ pub(crate) mod test { use nexus_db_model::{IpKind, Name}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::{ - create_disk, create_floating_ip, create_instance, create_project, - object_create, populate_ip_pool, + create_floating_ip, create_instance, create_project, populate_ip_pool, }; use nexus_test_utils_macros::nexus_test; - use nexus_types::external_api::params::ExternalIpCreate; - use omicron_common::api::external::{ - ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, - SimpleIdentity, - }; + use omicron_common::api::external::SimpleIdentity; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -327,11 +330,12 @@ pub(crate) mod test { let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; + let sled_agent = &cptestctx.sled_agent.sled_agent; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; - let _instance = + let _project_id = ip_manip_test_setup(&client).await; + let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; for use_float in [false, true] { @@ -340,9 +344,31 @@ pub(crate) mod test { let dag = create_saga_dag::(params).unwrap(); let saga = nexus.create_runnable_saga(dag).await.unwrap(); nexus.run_saga(saga).await.expect("Attach saga should succeed"); - - // TODO: assert sled agent, dpd happy, ... } + + let instance_id = instance.id(); + + // Sled agent has a record of the new external IPs. + let mut eips = sled_agent.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + assert!(my_eips.iter().any(|v| matches!( + v, + omicron_sled_agent::params::InstanceExternalIpBody::Floating(_) + ))); + assert!(my_eips.iter().any(|v| matches!( + v, + omicron_sled_agent::params::InstanceExternalIpBody::Ephemeral(_) + ))); + + // DB has records for SNAT plus the new IPs. + let db_eips = datastore + .instance_lookup_external_ips(&opctx, instance_id) + .await + .unwrap(); + assert_eq!(db_eips.len(), 3); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Ephemeral)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Floating)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); } pub(crate) async fn verify_clean_slate( @@ -381,10 +407,9 @@ pub(crate) mod test { .is_none()); // No IP bindings remain on sled-agent. - let eips = &*sled_agent.external_ips.lock().await; - for (_nic_id, eip_set) in eips { - assert!(eip_set.is_empty()); - } + let mut eips = sled_agent.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + assert!(my_eips.is_empty()); } #[nexus_test(server = crate::Server)] @@ -398,7 +423,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -424,7 +449,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -447,14 +472,13 @@ pub(crate) mod test { async fn test_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let log = &cptestctx.logctx.log; let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let _instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index 46edca1d0c..0545bd3c71 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -14,25 +14,14 @@ use crate::external_api::params; use nexus_db_model::{ExternalIp, IpAttachState, IpKind}; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; -use omicron_common::api::external::{Error, InstanceState}; +use omicron_common::api::external::Error; use serde::Deserialize; use serde::Serialize; use steno::ActionError; use uuid::Uuid; -// rough sequence of evts: -// - take temp ownership of instance while interacting w/ sled agent -// -> mark instance migration id as Some(0) if None -// - Withdraw routes -// -> ensure_dpd... (?) Do we actually need to? -// -> must precede OPTE: host may change its sending -// behaviour prematurely -// - Deregister addr in OPTE -// -> Put addr in sled-agent endpoint -// - Detach and Delete EIP iff. Ephemeral -// -> why so late? Risk that we can't recover our IP in an unwind. -// - free up migration_id of instance. -// -> mark instance migration id as None +// This runs on similar logic to instance IP attach: see its head +// comment for an explanation of the structure wrt. other sagas. declare_saga_actions! { instance_ip_detach; @@ -177,13 +166,17 @@ async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { async fn siid_nat_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - instance_ip_add_nat( + if let Err(e) = instance_ip_add_nat( &sagactx, ¶ms.serialized_authn, ¶ms.authz_instance, ) - .await?; + .await + { + error!(log, "siid_nat_undo: failed to notify DPD: {e}"); + } Ok(()) } @@ -198,8 +191,12 @@ async fn siid_update_opte( async fn siid_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { + let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - instance_ip_add_opte(&sagactx, ¶ms.authz_instance).await?; + if let Err(e) = instance_ip_add_opte(&sagactx, ¶ms.authz_instance).await + { + error!(log, "siid_update_opte_undo: failed to notify sled-agent: {e}"); + } Ok(()) } @@ -220,7 +217,7 @@ async fn siid_complete_detach( { warn!( log, - "siia_complete_attach: external IP was deleted or call was idempotent" + "siid_complete_attach: external IP was deleted or call was idempotent" ) } @@ -263,7 +260,7 @@ pub(crate) mod test { }, Nexus, }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -342,11 +339,12 @@ pub(crate) mod test { let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; + let sled_agent = &cptestctx.sled_agent.sled_agent; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; let _ = ip_manip_test_setup(&client).await; - let _instance = + let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; attach_instance_ips(nexus, &opctx).await; @@ -358,6 +356,21 @@ pub(crate) mod test { let saga = nexus.create_runnable_saga(dag).await.unwrap(); nexus.run_saga(saga).await.expect("Detach saga should succeed"); } + + let instance_id = instance.id(); + + // Sled agent has removed its records of the external IPs. + let mut eips = sled_agent.external_ips.lock().await; + let my_eips = eips.entry(instance_id).or_default(); + assert!(my_eips.is_empty()); + + // DB only has record for SNAT. + let db_eips = datastore + .instance_lookup_external_ips(&opctx, instance_id) + .await + .unwrap(); + assert_eq!(db_eips.len(), 1); + assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); } pub(crate) async fn verify_clean_slate( @@ -401,9 +414,9 @@ pub(crate) mod test { .await .unwrap(); assert_eq!(db_eips.len(), 3); - assert!(db_eips.iter().find(|v| v.kind == IpKind::Ephemeral).is_some()); - assert!(db_eips.iter().find(|v| v.kind == IpKind::Floating).is_some()); - assert!(db_eips.iter().find(|v| v.kind == IpKind::SNat).is_some()); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Ephemeral)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::Floating)); + assert!(db_eips.iter().any(|v| v.kind == IpKind::SNat)); // No IP bindings remain on sled-agent. let eips = &*sled_agent.external_ips.lock().await; @@ -423,7 +436,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -451,7 +464,7 @@ pub(crate) mod test { let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; @@ -476,14 +489,13 @@ pub(crate) mod test { async fn test_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let log = &cptestctx.logctx.log; let client = &cptestctx.external_client; let apictx = &cptestctx.server.apictx(); let nexus = &apictx.nexus; let opctx = test_helpers::test_opctx(cptestctx); let datastore = &nexus.db_datastore; - let project_id = ip_manip_test_setup(&client).await; + let _project_id = ip_manip_test_setup(&client).await; let _instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 3a5633c6c3..21a9d39220 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -640,17 +640,13 @@ impl SledAgent { // High-level behaviour: this should always succeed UNLESS // trying to add a double ephemeral. if let InstanceExternalIpBody::Ephemeral(curr_ip) = &body_args { - if my_eips - .iter() - .find(|v| { - if let InstanceExternalIpBody::Ephemeral(other_ip) = v { - curr_ip != other_ip - } else { - false - } - }) - .is_some() - { + if my_eips.iter().any(|v| { + if let InstanceExternalIpBody::Ephemeral(other_ip) = v { + curr_ip != other_ip + } else { + false + } + }) { return Err(Error::invalid_request("cannot replace exisitng ephemeral IP without explicit removal")); } }