From 79fee6a2c9f0099fb9cca6effbb862fd75c0a4e4 Mon Sep 17 00:00:00 2001
From: Greg Colombo
Date: Thu, 21 Sep 2023 09:24:04 -0700
Subject: [PATCH] Move instance networking functions into their own module
 (#4123)

`nexus/src/app/instance.rs` and `sled.rs` implement several
networking-related functions that are less directly concerned with
instance or sled management than their sibling routines. Tidy things up
a bit by creating an `instance_network` module and moving instance V2P
and NAT management functions there. Also, move instance NAT entry
deletion logic into its own function that the instance delete saga
calls, rather than implementing that logic inline in the saga.

These changes aim to reduce clutter in `instance.rs` and to move NAT
entry deletion into a function that can be reused by subsequent changes
to the way Nexus handles instance stop. (A sketch of how a caller might
compose the relocated helpers appears after the diff.)

Except for some minor edits to error handling in
`instance_delete_dpd_config` (needed because this function no longer
returns a `steno::ActionError`), this PR only rearranges existing code
and has no functional changes.
---
 nexus/src/app/instance.rs              | 144 -------
 nexus/src/app/instance_network.rs      | 497 +++++++++++++++++++++++++
 nexus/src/app/mod.rs                   |   1 +
 nexus/src/app/sagas/instance_delete.rs |  67 +---
 nexus/src/app/sled.rs                  | 251 -------------
 5 files changed, 503 insertions(+), 457 deletions(-)
 create mode 100644 nexus/src/app/instance_network.rs

diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 1ac63d19f6..f07ceae4a0 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -11,7 +11,6 @@ use super::MAX_NICS_PER_INSTANCE;
 use super::MAX_VCPU_PER_INSTANCE;
 use super::MIN_MEMORY_BYTES_PER_INSTANCE;
 use crate::app::sagas;
-use crate::app::sagas::retry_until_known_result;
 use crate::cidata::InstanceCiData;
 use crate::external_api::params;
 use cancel_safe_futures::prelude::*;
@@ -41,7 +40,6 @@ use omicron_common::api::external::NameOrId;
 use omicron_common::api::external::UpdateResult;
 use omicron_common::api::external::Vni;
 use omicron_common::api::internal::nexus;
-use omicron_common::api::internal::shared::SwitchLocation;
 use propolis_client::support::tungstenite::protocol::frame::coding::CloseCode;
 use propolis_client::support::tungstenite::protocol::CloseFrame;
 use propolis_client::support::tungstenite::Message as WebSocketMessage;
@@ -54,9 +52,7 @@ use sled_agent_client::types::InstancePutStateBody;
 use sled_agent_client::types::InstanceStateRequested;
 use sled_agent_client::types::SourceNatConfig;
 use sled_agent_client::Client as SledAgentClient;
-use std::collections::HashSet;
 use std::net::SocketAddr;
-use std::str::FromStr;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use uuid::Uuid;
@@ -1232,146 +1228,6 @@ impl super::Nexus {
         Ok(())
     }
 
-    // Switches with uplinks configured and boundary services enabled
-    pub(crate) async fn boundary_switches(
-        &self,
-        opctx: &OpContext,
-    ) -> Result<HashSet<SwitchLocation>, Error> {
-        let mut boundary_switches: HashSet<SwitchLocation> = HashSet::new();
-        let uplinks = self.list_switch_ports_with_uplinks(opctx).await?;
-        for uplink in &uplinks {
-            let location: SwitchLocation =
-                uplink.switch_location.parse().map_err(|_| {
-                    Error::internal_error(&format!(
-                        "invalid switch location in uplink config: {}",
-                        uplink.switch_location
-                    ))
-                })?;
-            boundary_switches.insert(location);
-        }
-        Ok(boundary_switches)
-    }
-
-    /// Ensures that the Dendrite configuration for the supplied instance is
-    /// up-to-date.
-    ///
-    /// # Parameters
-    ///
-    /// - `opctx`: An operation context that grants read and list-children
-    ///   permissions on the identified instance.
-    /// - `instance_id`: The ID of the instance to act on.
-    /// - `sled_ip_address`: The internal IP address assigned to the sled's
-    ///   sled agent.
-    /// - `ip_index_filter`: An optional filter on the index into the instance's
-    ///   external IP array.
-    ///   - If this is `Some(n)`, this routine configures DPD state for only the
-    ///     Nth external IP in the collection returned from CRDB. The caller is
-    ///     responsible for ensuring that the IP collection has stable indices
-    ///     when making this call.
-    ///   - If this is `None`, this routine configures DPD for all external
-    ///     IPs.
-    pub(crate) async fn instance_ensure_dpd_config(
-        &self,
-        opctx: &OpContext,
-        instance_id: Uuid,
-        sled_ip_address: &std::net::SocketAddrV6,
-        ip_index_filter: Option<usize>,
-        dpd_client: &Arc<dpd_client::Client>,
-    ) -> Result<(), Error> {
-        let log = &self.log;
-
-        info!(log, "looking up instance's primary network interface";
-              "instance_id" => %instance_id);
-
-        let (.., authz_instance) = LookupPath::new(opctx, &self.db_datastore)
-            .instance_id(instance_id)
-            .lookup_for(authz::Action::ListChildren)
-            .await?;
-
-        // All external IPs map to the primary network interface, so find that
-        // interface. If there is no such interface, there's no way to route
-        // traffic destined to those IPs, so there's nothing to configure and
-        // it's safe to return early.
-        let network_interface = match self
-            .db_datastore
-            .derive_guest_network_interface_info(&opctx, &authz_instance)
-            .await?
-            .into_iter()
-            .find(|interface| interface.primary)
-        {
-            Some(interface) => interface,
-            None => {
-                info!(log, "Instance has no primary network interface";
-                      "instance_id" => %instance_id);
-                return Ok(());
-            }
-        };
-
-        let mac_address =
-            macaddr::MacAddr6::from_str(&network_interface.mac.to_string())
-                .map_err(|e| {
-                    Error::internal_error(&format!(
-                        "failed to convert mac address: {e}"
-                    ))
-                })?;
-
-        let vni: u32 = network_interface.vni.into();
-
-        info!(log, "looking up instance's external IPs";
-              "instance_id" => %instance_id);
-
-        let ips = self
-            .db_datastore
-            .instance_lookup_external_ips(&opctx, instance_id)
-            .await?;
-
-        if let Some(wanted_index) = ip_index_filter {
-            if let None = ips.get(wanted_index) {
-                return Err(Error::internal_error(&format!(
-                    "failed to find external ip address at index: {}",
-                    wanted_index
-                )));
-            }
-        }
-
-        for target_ip in ips
-            .iter()
-            .enumerate()
-            .filter(|(index, _)| {
-                if let Some(wanted_index) = ip_index_filter {
-                    *index == wanted_index
-                } else {
-                    true
-                }
-            })
-            .map(|(_, ip)| ip)
-        {
-            retry_until_known_result(log, || async {
-                dpd_client
-                    .ensure_nat_entry(
-                        &log,
-                        target_ip.ip,
-                        dpd_client::types::MacAddr {
-                            a: mac_address.into_array(),
-                        },
-                        *target_ip.first_port,
-                        *target_ip.last_port,
-                        vni,
-                        sled_ip_address.ip(),
-                    )
-                    .await
-            })
-            .await
-            .map_err(|e| {
-                Error::internal_error(&format!(
-                    "failed to ensure dpd entry: {e}"
-                ))
-            })?;
-        }
-
-        Ok(())
-    }
-
     /// Returns the requested range of serial console output bytes,
     /// provided they are still in the propolis-server's cache.
     pub(crate) async fn instance_serial_console_data(
diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs
new file mode 100644
index 0000000000..6b819a9e11
--- /dev/null
+++ b/nexus/src/app/instance_network.rs
@@ -0,0 +1,497 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Routines that manage instance-related networking state.
+
+use crate::app::sagas::retry_until_known_result;
+use nexus_db_queries::authz;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::identity::Asset;
+use nexus_db_queries::db::lookup::LookupPath;
+use omicron_common::api::external::DataPageParams;
+use omicron_common::api::external::Error;
+use omicron_common::api::internal::shared::SwitchLocation;
+use sled_agent_client::types::SetVirtualNetworkInterfaceHost;
+use std::collections::HashSet;
+use std::str::FromStr;
+use std::sync::Arc;
+use uuid::Uuid;
+
+impl super::Nexus {
+    /// Returns the set of switches with uplinks configured and boundary
+    /// services enabled.
+    pub(crate) async fn boundary_switches(
+        &self,
+        opctx: &OpContext,
+    ) -> Result<HashSet<SwitchLocation>, Error> {
+        let mut boundary_switches: HashSet<SwitchLocation> = HashSet::new();
+        let uplinks = self.list_switch_ports_with_uplinks(opctx).await?;
+        for uplink in &uplinks {
+            let location: SwitchLocation =
+                uplink.switch_location.parse().map_err(|_| {
+                    Error::internal_error(&format!(
+                        "invalid switch location in uplink config: {}",
+                        uplink.switch_location
+                    ))
+                })?;
+            boundary_switches.insert(location);
+        }
+        Ok(boundary_switches)
+    }
+
+    /// Ensures that V2P mappings exist that indicate that the instance with ID
+    /// `instance_id` is resident on the sled with ID `sled_id`.
+    pub(crate) async fn create_instance_v2p_mappings(
+        &self,
+        opctx: &OpContext,
+        instance_id: Uuid,
+        sled_id: Uuid,
+    ) -> Result<(), Error> {
+        info!(&self.log, "creating V2P mappings for instance";
+              "instance_id" => %instance_id,
+              "sled_id" => %sled_id);
+
+        // For every sled that isn't the sled this instance was allocated to,
+        // create a virtual-to-physical mapping for each of this instance's
+        // NICs.
+        //
+        // For the mappings to be correct, a few invariants must hold:
+        //
+        // - mappings must be set whenever an instance's sled changes (e.g.
+        //   during instance creation, migration, stop + start)
+        //
+        // - an instance's sled must not change while its corresponding
+        //   mappings are being created
+        //
+        // - the same mapping creation must be broadcast to all sleds
+        //
+        // A more targeted approach would be to see what other instances share
+        // the VPC this instance is in (or more generally, what instances
+        // should have connectivity to this one), see what sleds those are
+        // allocated to, and only create V2P mappings for those sleds.
+        //
+        // There's additional work with this approach:
+        //
+        // - it means that delete calls are required as well as set calls,
+        //   meaning that now the ordering of those matters (this may also
+        //   necessitate a generation number for V2P mappings)
+        //
+        // - V2P mappings have to be bidirectional in order for both
+        //   instances' packets to make a round trip. This isn't a problem
+        //   with the broadcast approach because one of the sides will exist
+        //   already, but it is something to orchestrate with a more targeted
+        //   approach.
+        //
+        // TODO-correctness Default firewall rules currently will block
+        // instances in different VPCs from connecting to each other. If it
+        // ever stops doing this, the broadcast approach will create V2P
+        // mappings that shouldn't exist.
+        let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore)
+            .instance_id(instance_id)
+            .lookup_for(authz::Action::Read)
+            .await?;
+
+        let instance_nics = self
+            .db_datastore
+            .derive_guest_network_interface_info(&opctx, &authz_instance)
+            .await?;
+
+        // Look up the supplied sled's physical host IP.
+        let physical_host_ip =
+            *self.sled_lookup(&self.opctx_alloc, &sled_id)?.fetch().await?.1.ip;
+
+        let mut last_sled_id: Option<Uuid> = None;
+        loop {
+            let pagparams = DataPageParams {
+                marker: last_sled_id.as_ref(),
+                direction: dropshot::PaginationOrder::Ascending,
+                limit: std::num::NonZeroU32::new(10).unwrap(),
+            };
+
+            let sleds_page =
+                self.sled_list(&self.opctx_alloc, &pagparams).await?;
+            let mut join_handles =
+                Vec::with_capacity(sleds_page.len() * instance_nics.len());
+
+            for sled in &sleds_page {
+                // set_v2p is not required for the sled the instance was
+                // allocated to; OPTE currently does that automatically.
+                //
+                // TODO(#3107): Remove this when XDE stops creating mappings
+                // implicitly.
+                if sled.id() == sled_id {
+                    continue;
+                }
+
+                for nic in &instance_nics {
+                    let client = self.sled_client(&sled.id()).await?;
+                    let nic_id = nic.id;
+                    let mapping = SetVirtualNetworkInterfaceHost {
+                        virtual_ip: nic.ip,
+                        virtual_mac: nic.mac.clone(),
+                        physical_host_ip,
+                        vni: nic.vni.clone(),
+                    };
+
+                    let log = self.log.clone();
+
+                    // This function is idempotent: calling the set_v2p ioctl
+                    // with the same information is a no-op.
+                    join_handles.push(tokio::spawn(futures::future::lazy(
+                        move |_ctx| async move {
+                            retry_until_known_result(&log, || async {
+                                client.set_v2p(&nic_id, &mapping).await
+                            })
+                            .await
+                        },
+                    )));
+                }
+            }
+
+            // Concurrently run each future to completion, but return the last
+            // error seen.
+            let mut error = None;
+            for join_handle in join_handles {
+                let result = join_handle
+                    .await
+                    .map_err(|e| Error::internal_error(&e.to_string()))?
+                    .await;
+
+                if result.is_err() {
+                    error!(self.log, "{:?}", result);
+                    error = Some(result);
+                }
+            }
+            if let Some(e) = error {
+                return e.map(|_| ()).map_err(|e| e.into());
+            }
+
+            if sleds_page.len() < 10 {
+                break;
+            }
+
+            if let Some(last) = sleds_page.last() {
+                last_sled_id = Some(last.id());
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Ensures that the necessary V2P mappings for an instance are deleted.
+    pub(crate) async fn delete_instance_v2p_mappings(
+        &self,
+        opctx: &OpContext,
+        instance_id: Uuid,
+    ) -> Result<(), Error> {
+        // For every sled that isn't the sled this instance was allocated to,
+        // delete the virtual-to-physical mapping for each of this instance's
+        // NICs. If there isn't a V2P mapping, del_v2p should be a no-op.
+        let (.., authz_instance, db_instance) =
+            LookupPath::new(&opctx, &self.db_datastore)
+                .instance_id(instance_id)
+                .fetch_for(authz::Action::Read)
+                .await?;
+
+        let instance_nics = self
+            .db_datastore
+            .derive_guest_network_interface_info(&opctx, &authz_instance)
+            .await?;
+
+        // Look up the physical host IP of the sled hosting this instance.
+        let instance_sled_id = db_instance.runtime().sled_id;
+        let physical_host_ip = *self
+            .sled_lookup(&self.opctx_alloc, &instance_sled_id)?
+            .fetch()
+            .await?
+            .1
+            .ip;
+
+        let mut last_sled_id: Option<Uuid> = None;
+
+        loop {
+            let pagparams = DataPageParams {
+                marker: last_sled_id.as_ref(),
+                direction: dropshot::PaginationOrder::Ascending,
+                limit: std::num::NonZeroU32::new(10).unwrap(),
+            };
+
+            let sleds_page =
+                self.sled_list(&self.opctx_alloc, &pagparams).await?;
+            let mut join_handles =
+                Vec::with_capacity(sleds_page.len() * instance_nics.len());
+
+            for sled in &sleds_page {
+                // del_v2p is not required for the sled the instance was
+                // allocated to; OPTE currently does that automatically.
+                //
+                // TODO(#3107): Remove this when XDE stops deleting mappings
+                // implicitly.
+                if sled.id() == instance_sled_id {
+                    continue;
+                }
+
+                for nic in &instance_nics {
+                    let client = self.sled_client(&sled.id()).await?;
+                    let nic_id = nic.id;
+                    let mapping = SetVirtualNetworkInterfaceHost {
+                        virtual_ip: nic.ip,
+                        virtual_mac: nic.mac.clone(),
+                        physical_host_ip,
+                        vni: nic.vni.clone(),
+                    };
+
+                    let log = self.log.clone();
+
+                    // This function is idempotent: calling the del_v2p ioctl
+                    // with the same information is a no-op.
+                    join_handles.push(tokio::spawn(futures::future::lazy(
+                        move |_ctx| async move {
+                            retry_until_known_result(&log, || async {
+                                client.del_v2p(&nic_id, &mapping).await
+                            })
+                            .await
+                        },
+                    )));
+                }
+            }
+
+            // Concurrently run each future to completion, but return the last
+            // error seen.
+            let mut error = None;
+            for join_handle in join_handles {
+                let result = join_handle
+                    .await
+                    .map_err(|e| Error::internal_error(&e.to_string()))?
+                    .await;
+
+                if result.is_err() {
+                    error!(self.log, "{:?}", result);
+                    error = Some(result);
+                }
+            }
+            if let Some(e) = error {
+                return e.map(|_| ()).map_err(|e| e.into());
+            }
+
+            if sleds_page.len() < 10 {
+                break;
+            }
+
+            if let Some(last) = sleds_page.last() {
+                last_sled_id = Some(last.id());
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Ensures that the Dendrite configuration for the supplied instance is
+    /// up-to-date.
+    ///
+    /// # Parameters
+    ///
+    /// - `opctx`: An operation context that grants read and list-children
+    ///   permissions on the identified instance.
+    /// - `instance_id`: The ID of the instance to act on.
+    /// - `sled_ip_address`: The internal IP address assigned to the sled's
+    ///   sled agent.
+    /// - `ip_index_filter`: An optional filter on the index into the instance's
+    ///   external IP array.
+    ///   - If this is `Some(n)`, this routine configures DPD state for only the
+    ///     Nth external IP in the collection returned from CRDB. The caller is
+    ///     responsible for ensuring that the IP collection has stable indices
+    ///     when making this call.
+    ///   - If this is `None`, this routine configures DPD for all external
+    ///     IPs.
+    pub(crate) async fn instance_ensure_dpd_config(
+        &self,
+        opctx: &OpContext,
+        instance_id: Uuid,
+        sled_ip_address: &std::net::SocketAddrV6,
+        ip_index_filter: Option<usize>,
+        dpd_client: &Arc<dpd_client::Client>,
+    ) -> Result<(), Error> {
+        let log = &self.log;
+
+        info!(log, "looking up instance's primary network interface";
+              "instance_id" => %instance_id);
+
+        let (.., authz_instance) = LookupPath::new(opctx, &self.db_datastore)
+            .instance_id(instance_id)
+            .lookup_for(authz::Action::ListChildren)
+            .await?;
+
+        // All external IPs map to the primary network interface, so find that
+        // interface. If there is no such interface, there's no way to route
+        // traffic destined to those IPs, so there's nothing to configure and
+        // it's safe to return early.
+        let network_interface = match self
+            .db_datastore
+            .derive_guest_network_interface_info(&opctx, &authz_instance)
+            .await?
+            .into_iter()
+            .find(|interface| interface.primary)
+        {
+            Some(interface) => interface,
+            None => {
+                info!(log, "Instance has no primary network interface";
+                      "instance_id" => %instance_id);
+                return Ok(());
+            }
+        };
+
+        let mac_address =
+            macaddr::MacAddr6::from_str(&network_interface.mac.to_string())
+                .map_err(|e| {
+                    Error::internal_error(&format!(
+                        "failed to convert mac address: {e}"
+                    ))
+                })?;
+
+        let vni: u32 = network_interface.vni.into();
+
+        info!(log, "looking up instance's external IPs";
+              "instance_id" => %instance_id);
+
+        let ips = self
+            .db_datastore
+            .instance_lookup_external_ips(&opctx, instance_id)
+            .await?;
+
+        if let Some(wanted_index) = ip_index_filter {
+            if ips.get(wanted_index).is_none() {
+                return Err(Error::internal_error(&format!(
+                    "failed to find external ip address at index: {}",
+                    wanted_index
+                )));
+            }
+        }
+
+        for target_ip in ips
+            .iter()
+            .enumerate()
+            .filter(|(index, _)| {
+                if let Some(wanted_index) = ip_index_filter {
+                    *index == wanted_index
+                } else {
+                    true
+                }
+            })
+            .map(|(_, ip)| ip)
+        {
+            retry_until_known_result(log, || async {
+                dpd_client
+                    .ensure_nat_entry(
+                        &log,
+                        target_ip.ip,
+                        dpd_client::types::MacAddr {
+                            a: mac_address.into_array(),
+                        },
+                        *target_ip.first_port,
+                        *target_ip.last_port,
+                        vni,
+                        sled_ip_address.ip(),
+                    )
+                    .await
+            })
+            .await
+            .map_err(|e| {
+                Error::internal_error(&format!(
+                    "failed to ensure dpd entry: {e}"
+                ))
+            })?;
+        }
+
+        Ok(())
+    }
+
+    /// Attempts to delete all of the Dendrite NAT configuration for the
+    /// instance identified by `authz_instance`.
+    ///
+    /// # Return value
+    ///
+    /// - `Ok(())` if all NAT entries were successfully deleted.
+    /// - If an operation fails before this routine begins to walk and delete
+    ///   individual NAT entries, this routine returns `Err` and reports that
+    ///   error.
+    /// - If an operation fails while this routine is walking NAT entries, it
+    ///   will continue trying to delete subsequent entries but will return
+    ///   the first error it encountered.
+    pub(crate) async fn instance_delete_dpd_config(
+        &self,
+        opctx: &OpContext,
+        authz_instance: &authz::Instance,
+    ) -> Result<(), Error> {
+        let log = &self.log;
+        let instance_id = authz_instance.id();
+
+        info!(log, "deleting instance dpd configuration";
+              "instance_id" => %instance_id);
+
+        let external_ips = self
+            .db_datastore
+            .instance_lookup_external_ips(opctx, instance_id)
+            .await?;
+
+        let boundary_switches = self.boundary_switches(opctx).await?;
+
+        let mut errors = vec![];
+        for entry in external_ips {
+            for switch in &boundary_switches {
+                debug!(log, "deleting instance nat mapping";
+                       "instance_id" => %instance_id,
+                       "switch" => switch.to_string(),
+                       "entry" => #?entry);
+
+                let client_result =
+                    self.dpd_clients.get(switch).ok_or_else(|| {
+                        Error::internal_error(&format!(
+                            "unable to find dendrite client for {switch}"
+                        ))
+                    });
+
+                let dpd_client = match client_result {
+                    Ok(client) => client,
+                    Err(new_error) => {
+                        errors.push(new_error);
+                        continue;
+                    }
+                };
+
+                let result = retry_until_known_result(log, || async {
+                    dpd_client
+                        .ensure_nat_entry_deleted(
+                            log,
+                            entry.ip,
+                            *entry.first_port,
+                        )
+                        .await
+                })
+                .await;
+
+                if let Err(e) = result {
+                    let e = Error::internal_error(&format!(
+                        "failed to delete nat entry via dpd: {e}"
+                    ));
+
+                    error!(log, "error deleting nat mapping: {e:#?}";
+                           "instance_id" => %instance_id,
+                           "switch" => switch.to_string(),
+                           "entry" => #?entry);
+                    errors.push(e);
+                } else {
+                    debug!(log, "deleting nat mapping successful";
+                           "instance_id" => %instance_id,
+                           "switch" => switch.to_string(),
+                           "entry" => #?entry);
+                }
+            }
+        }
+
+        if let Some(e) = errors.into_iter().next() {
+            return Err(e);
+        }
+
+        Ok(())
+    }
+}
diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs
index 4dd93f7707..5bab5e2820 100644
--- a/nexus/src/app/mod.rs
+++ b/nexus/src/app/mod.rs
@@ -42,6 +42,7 @@ mod external_ip;
 mod iam;
 mod image;
 mod instance;
+mod instance_network;
 mod ip_pool;
 mod metrics;
 mod network_interface;
diff --git a/nexus/src/app/sagas/instance_delete.rs b/nexus/src/app/sagas/instance_delete.rs
index 88a5823ad0..005e9724a6 100644
--- a/nexus/src/app/sagas/instance_delete.rs
+++ b/nexus/src/app/sagas/instance_delete.rs
@@ -8,7 +8,6 @@ use super::ActionRegistry;
 use super::NexusActionContext;
 use super::NexusSaga;
 use crate::app::sagas::declare_saga_actions;
-use crate::app::sagas::retry_until_known_result;
 use nexus_db_queries::db;
 use nexus_db_queries::db::lookup::LookupPath;
 use nexus_db_queries::{authn, authz};
@@ -151,70 +150,14 @@ async fn sid_delete_network_config(
         &sagactx,
         &params.serialized_authn,
     );
+    let authz_instance = &params.authz_instance;
     let osagactx = sagactx.user_data();
-    let datastore = &osagactx.datastore();
-    let log = sagactx.user_data().log();
-
-    debug!(log, "fetching external ip addresses");
-
-    let external_ips = &datastore
-        .instance_lookup_external_ips(&opctx, params.authz_instance.id())
+    osagactx
+        .nexus()
+        .instance_delete_dpd_config(&opctx, authz_instance)
         .await
-        .map_err(ActionError::action_failed)?;
-
-    let mut errors: Vec<ActionError> = vec![];
-
-    // Here we are attempting to delete every existing NAT entry while deferring
-    // any error handling. If we don't defer error handling, we might end up
-    // bailing out before we've attempted deletion of all entries.
-    for entry in external_ips {
-        for switch in &params.boundary_switches {
-            debug!(log, "deleting nat mapping"; "switch" => switch.to_string(), "entry" => #?entry);
-
-            let client_result =
-                osagactx.nexus().dpd_clients.get(switch).ok_or_else(|| {
-                    ActionError::action_failed(Error::internal_error(&format!(
-                        "unable to find dendrite client for {switch}"
-                    )))
-                });
-
-            let dpd_client = match client_result {
-                Ok(client) => client,
-                Err(new_error) => {
-                    errors.push(new_error);
-                    continue;
-                }
-            };
-
-            let result = retry_until_known_result(log, || async {
-                dpd_client
-                    .ensure_nat_entry_deleted(log, entry.ip, *entry.first_port)
-                    .await
-            })
-            .await;
-
-            match result {
-                Ok(_) => {
-                    debug!(log, "deleting nat mapping successful"; "switch" => switch.to_string(), "entry" => format!("{entry:#?}"));
-                }
-                Err(e) => {
-                    let new_error =
-                        ActionError::action_failed(Error::internal_error(
-                            &format!("failed to delete nat entry via dpd: {e}"),
-                        ));
-                    error!(log, "{new_error:#?}");
-                    errors.push(new_error);
-                }
-            }
-        }
-    }
-
-    if let Some(error) = errors.first() {
-        return Err(error.clone());
-    }
-
-    Ok(())
+        .map_err(ActionError::action_failed)
 }
 
 async fn sid_delete_instance_record(
diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs
index 56a330f526..da89e7e25a 100644
--- a/nexus/src/app/sled.rs
+++ b/nexus/src/app/sled.rs
@@ -4,15 +4,12 @@
 
 //! Sleds, and the hardware and services within them.
 
-use crate::app::sagas::retry_until_known_result;
 use crate::internal_api::params::{
     PhysicalDiskDeleteRequest, PhysicalDiskPutRequest, SledAgentStartupInfo,
     SledRole, ZpoolPutRequest,
 };
-use nexus_db_queries::authz;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db;
-use nexus_db_queries::db::identity::Asset;
 use nexus_db_queries::db::lookup;
 use nexus_db_queries::db::lookup::LookupPath;
 use nexus_db_queries::db::model::DatasetKind;
@@ -20,7 +17,6 @@ use omicron_common::api::external::DataPageParams;
 use omicron_common::api::external::Error;
 use omicron_common::api::external::ListResultVec;
 use omicron_common::api::external::LookupResult;
-use sled_agent_client::types::SetVirtualNetworkInterfaceHost;
 use sled_agent_client::Client as SledAgentClient;
 use std::net::SocketAddrV6;
 use std::sync::Arc;
@@ -317,251 +313,4 @@ impl super::Nexus {
         .await?;
         Ok(())
     }
-
-    // OPTE V2P mappings
-
-    /// Ensures that V2P mappings exist that indicate that the instance with ID
-    /// `instance_id` is resident on the sled with ID `sled_id`.
-    pub(crate) async fn create_instance_v2p_mappings(
-        &self,
-        opctx: &OpContext,
-        instance_id: Uuid,
-        sled_id: Uuid,
-    ) -> Result<(), Error> {
-        info!(&self.log, "creating V2P mappings for instance";
-              "instance_id" => %instance_id,
-              "sled_id" => %sled_id);
-
-        // For every sled that isn't the sled this instance was allocated to, create
-        // a virtual to physical mapping for each of this instance's NICs.
-        //
-        // For the mappings to be correct, a few invariants must hold:
-        //
-        // - mappings must be set whenever an instance's sled changes (eg.
-        //   during instance creation, migration, stop + start)
-        //
-        // - an instances' sled must not change while its corresponding mappings
-        //   are being created
-        //
-        // - the same mapping creation must be broadcast to all sleds
-        //
-        // A more targeted approach would be to see what other instances share
-        // the VPC this instance is in (or more generally, what instances should
-        // have connectivity to this one), see what sleds those are allocated
-        // to, and only create V2P mappings for those sleds.
-        //
-        // There's additional work with this approach:
-        //
-        // - it means that delete calls are required as well as set calls,
-        //   meaning that now the ordering of those matters (this may also
-        //   necessitate a generation number for V2P mappings)
-        //
-        // - V2P mappings have to be bidirectional in order for both instances's
-        //   packets to make a round trip. This isn't a problem with the
-        //   broadcast approach because one of the sides will exist already, but
-        //   it is something to orchestrate with a more targeted approach.
-        //
-        // TODO-correctness Default firewall rules currently will block
-        // instances in different VPCs from connecting to each other. If it ever
-        // stops doing this, the broadcast approach will create V2P mappings
-        // that shouldn't exist.
-        let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore)
-            .instance_id(instance_id)
-            .lookup_for(authz::Action::Read)
-            .await?;
-
-        let instance_nics = self
-            .db_datastore
-            .derive_guest_network_interface_info(&opctx, &authz_instance)
-            .await?;
-
-        // Look up the supplied sled's physical host IP.
-        let physical_host_ip =
-            *self.sled_lookup(&self.opctx_alloc, &sled_id)?.fetch().await?.1.ip;
-
-        let mut last_sled_id: Option<Uuid> = None;
-        loop {
-            let pagparams = DataPageParams {
-                marker: last_sled_id.as_ref(),
-                direction: dropshot::PaginationOrder::Ascending,
-                limit: std::num::NonZeroU32::new(10).unwrap(),
-            };
-
-            let sleds_page =
-                self.sled_list(&self.opctx_alloc, &pagparams).await?;
-            let mut join_handles =
-                Vec::with_capacity(sleds_page.len() * instance_nics.len());
-
-            for sled in &sleds_page {
-                // set_v2p not required for sled instance was allocated to, OPTE
-                // currently does that automatically
-                //
-                // TODO(#3107): Remove this when XDE stops creating mappings
-                // implicitly.
-                if sled.id() == sled_id {
-                    continue;
-                }
-
-                for nic in &instance_nics {
-                    let client = self.sled_client(&sled.id()).await?;
-                    let nic_id = nic.id;
-                    let mapping = SetVirtualNetworkInterfaceHost {
-                        virtual_ip: nic.ip,
-                        virtual_mac: nic.mac.clone(),
-                        physical_host_ip,
-                        vni: nic.vni.clone(),
-                    };
-
-                    let log = self.log.clone();
-
-                    // This function is idempotent: calling the set_v2p ioctl with
-                    // the same information is a no-op.
-                    join_handles.push(tokio::spawn(futures::future::lazy(
-                        move |_ctx| async move {
-                            retry_until_known_result(&log, || async {
-                                client.set_v2p(&nic_id, &mapping).await
-                            })
-                            .await
-                        },
-                    )));
-                }
-            }
-
-            // Concurrently run each future to completion, but return the last
-            // error seen.
-            let mut error = None;
-            for join_handle in join_handles {
-                let result = join_handle
-                    .await
-                    .map_err(|e| Error::internal_error(&e.to_string()))?
-                    .await;
-
-                if result.is_err() {
-                    error!(self.log, "{:?}", result);
-                    error = Some(result);
-                }
-            }
-            if let Some(e) = error {
-                return e.map(|_| ()).map_err(|e| e.into());
-            }
-
-            if sleds_page.len() < 10 {
-                break;
-            }
-
-            if let Some(last) = sleds_page.last() {
-                last_sled_id = Some(last.id());
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Ensure that the necessary v2p mappings for an instance are deleted
-    pub(crate) async fn delete_instance_v2p_mappings(
-        &self,
-        opctx: &OpContext,
-        instance_id: Uuid,
-    ) -> Result<(), Error> {
-        // For every sled that isn't the sled this instance was allocated to, delete
-        // the virtual to physical mapping for each of this instance's NICs. If
-        // there isn't a V2P mapping, del_v2p should be a no-op.
-        let (.., authz_instance, db_instance) =
-            LookupPath::new(&opctx, &self.db_datastore)
-                .instance_id(instance_id)
-                .fetch_for(authz::Action::Read)
-                .await?;
-
-        let instance_nics = self
-            .db_datastore
-            .derive_guest_network_interface_info(&opctx, &authz_instance)
-            .await?;
-
-        // Lookup the physical host IP of the sled hosting this instance
-        let instance_sled_id = db_instance.runtime().sled_id;
-        let physical_host_ip = *self
-            .sled_lookup(&self.opctx_alloc, &instance_sled_id)?
-            .fetch()
-            .await?
-            .1
-            .ip;
-
-        let mut last_sled_id: Option<Uuid> = None;
-
-        loop {
-            let pagparams = DataPageParams {
-                marker: last_sled_id.as_ref(),
-                direction: dropshot::PaginationOrder::Ascending,
-                limit: std::num::NonZeroU32::new(10).unwrap(),
-            };
-
-            let sleds_page =
-                self.sled_list(&self.opctx_alloc, &pagparams).await?;
-            let mut join_handles =
-                Vec::with_capacity(sleds_page.len() * instance_nics.len());
-
-            for sled in &sleds_page {
-                // del_v2p not required for sled instance was allocated to, OPTE
-                // currently does that automatically
-                //
-                // TODO(#3107): Remove this when XDE stops deleting mappings
-                // implicitly.
-                if sled.id() == instance_sled_id {
-                    continue;
-                }
-
-                for nic in &instance_nics {
-                    let client = self.sled_client(&sled.id()).await?;
-                    let nic_id = nic.id;
-                    let mapping = SetVirtualNetworkInterfaceHost {
-                        virtual_ip: nic.ip,
-                        virtual_mac: nic.mac.clone(),
-                        physical_host_ip,
-                        vni: nic.vni.clone(),
-                    };
-
-                    let log = self.log.clone();
-
-                    // This function is idempotent: calling the set_v2p ioctl with
-                    // the same information is a no-op.
-                    join_handles.push(tokio::spawn(futures::future::lazy(
-                        move |_ctx| async move {
-                            retry_until_known_result(&log, || async {
-                                client.del_v2p(&nic_id, &mapping).await
-                            })
-                            .await
-                        },
-                    )));
-                }
-            }
-
-            // Concurrently run each future to completion, but return the last
-            // error seen.
-            let mut error = None;
-            for join_handle in join_handles {
-                let result = join_handle
-                    .await
-                    .map_err(|e| Error::internal_error(&e.to_string()))?
-                    .await;
-
-                if result.is_err() {
-                    error!(self.log, "{:?}", result);
-                    error = Some(result);
-                }
-            }
-            if let Some(e) = error {
-                return e.map(|_| ()).map_err(|e| e.into());
-            }
-
-            if sleds_page.len() < 10 {
-                break;
-            }
-
-            if let Some(last) = sleds_page.last() {
-                last_sled_id = Some(last.id());
-            }
-        }
-
-        Ok(())
-    }
 }
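
Reviewer note: below is a minimal sketch of how a caller — for example, the
reworked instance-stop path the commit message anticipates — might compose the
helpers that now live in `nexus/src/app/instance_network.rs`. The wrapper and
its name are hypothetical and are not part of this patch; the sketch assumes
the module context and imports of `instance_network.rs` above, in which
`instance_delete_dpd_config`, `delete_instance_v2p_mappings`, and
`authz_instance.id()` resolve exactly as they do in the diff.

```rust
impl super::Nexus {
    /// Hypothetical wrapper (illustrative only, not in this patch): tears
    /// down an instance's external networking state using the helpers that
    /// this patch relocates into instance_network.rs.
    pub(crate) async fn clear_instance_networking(
        &self,
        opctx: &OpContext,
        authz_instance: &authz::Instance,
    ) -> Result<(), Error> {
        // Delete the instance's NAT entries on every boundary switch. Per
        // its doc comment, this keeps walking entries after a failure and
        // returns the first error it encountered.
        self.instance_delete_dpd_config(opctx, authz_instance).await?;

        // Then delete the V2P mappings that were broadcast to the other
        // sleds. del_v2p is a no-op when no mapping exists, so repeating
        // this step is harmless.
        self.delete_instance_v2p_mappings(opctx, authz_instance.id()).await
    }
}
```

Both helpers drive their Dendrite and sled-agent calls through
`retry_until_known_result`, so transient communication failures are retried
internally, and the underlying operations (`ensure_nat_entry_deleted`,
`del_v2p`) are safe to repeat — which is what makes them suitable building
blocks for saga nodes like `sid_delete_network_config` above.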