From e1c3dd75b818147b545f9a46acfcf3fc19291473 Mon Sep 17 00:00:00 2001
From: Eliza Weisman
Date: Mon, 12 Feb 2024 13:25:54 -0800
Subject: [PATCH 1/5] [nexus] put DNS servers in DNS, so you can DNS while you DNS (#5033)

Currently, the DNS propagation background task in Nexus uses the `services` table to enumerate the list of DNS servers that it's responsible for keeping up to date. However, we'd really like to get rid of the `services` table (see #4947), and the DNS propagation code is its only remaining user. Therefore, this branch changes how DNS servers are discovered for DNS propagation. Rather than discovering DNS server addresses from the `services` table, the `DnsWatcher` background task now discovers DNS servers...using internal DNS.

As described in #4889, this may _seem_ like a cyclical dependency, but it isn't: the initial set of internal DNS servers operate at known addresses -- by design -- so that they can always be discovered. They also have to be up and discoverable for Nexus to even come up and find CockroachDB, so internal DNS can safely be assumed to be up if Nexus has come up at all. The `services` table is now no longer used for DNS propagation.

This change breaks the existing tests `nexus::app::background::init::test_dns_propagation_basic` and `nexus::app::background::dns_servers::test_basic`. I've rewritten the `test_dns_propagation_basic` test to test the new expected behavior:

- creating a new internal DNS server and adding a DNS record for it to the database's DNS config table results in that server's DNS records being propagated to the existing DNS server
- the `DnsWatcher` background task then picks up the DNS records for the new DNS server by querying the existing known DNS server
- the current DNS config generation is then propagated to the new DNS server
- a subsequent generation is propagated to both the initial and new DNS servers

The `dns_servers::test_basic` test tested the discovery of DNS server addresses from the database. Because these addresses no longer come from the db, and now come from internal DNS, this test would now end up exercising most of the functionality tested in `test_dns_propagation_basic`. I didn't think it was necessary to have two tests for this, so I made the judgment call to delete `dns_servers::test_basic`. If we decide it's worth having a more isolated test that exercises only the DNS watcher task and not the DNS propagation task, we could put this back and create records on the DNS server by manually hitting its API with new configs, but I didn't think this was really worth the effort.

I've also removed the `Datastore::upsert_service` method, which was used only by test code and is now dead.

I considered deleting all code related to querying the `services` table in this branch as well. However, I noticed that it's still populated when initializing the rack, and that `omdb` has commands for querying that table. I wasn't sure if there were alternative data sources for the `omdb` debugging commands yet, so I didn't remove them. If the data provided by those commands is available elsewhere, or if their only real purpose is _just_ to print the state of this table, I'm happy to delete them in this branch, as well.
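For readers skimming the diff below, here is a minimal sketch of the new discovery flow, condensed from the `DnsServersWatcher` changes in this patch. The free function `discover_dns_servers` and the `anyhow::Result` return type are illustrative conveniences, not code from the tree; the `Resolver`, `ServiceName`, and `DnsGroup` types and the `lookup_all_socket_v6` call are the ones the patch actually uses.

```rust
use internal_dns::names::ServiceName;
use internal_dns::resolver::Resolver;
use nexus_db_model::DnsGroup;
use std::net::SocketAddr;

// Sketch: find the DNS servers for a DNS group by asking internal DNS
// itself, rather than paginating the `services` table in CockroachDB.
async fn discover_dns_servers(
    resolver: &Resolver,
    dns_group: DnsGroup,
) -> anyhow::Result<Vec<SocketAddr>> {
    // Map the DNS group being propagated to the internal DNS service name
    // under which its servers are registered.
    let service_name = match dns_group {
        DnsGroup::Internal => ServiceName::InternalDns,
        DnsGroup::External => ServiceName::ExternalDns,
    };
    // Every instance of that service resolves to an IPv6 socket address;
    // these are the servers the propagation task must keep up to date.
    let addrs = resolver.lookup_all_socket_v6(service_name).await?;
    Ok(addrs.into_iter().map(SocketAddr::V6).collect())
}
```

This bootstraps cleanly because the initial internal DNS servers listen on well-known addresses, so the resolver can always reach at least one of them.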
Closes #4889 ![image](https://github.com/oxidecomputer/omicron/assets/2796466/c37a0d31-26f7-4a5d-9748-ef7212cde9a9) --- dev-tools/omdb/tests/env.out | 12 +- dev-tools/omdb/tests/successes.out | 8 +- internal-dns/src/names.rs | 2 +- nexus/src/app/background/dns_config.rs | 5 +- nexus/src/app/background/dns_servers.rs | 214 ++++------------------- nexus/src/app/background/init.rs | 223 +++++++++++++----------- nexus/src/app/sled.rs | 38 ---- 7 files changed, 168 insertions(+), 334 deletions(-) diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 0600945194..3e6e89d508 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -57,11 +57,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -147,11 +147,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -224,11 +224,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 1cd85262f6..3086c98f32 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -251,11 +251,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -313,7 +313,7 @@ task: "dns_servers_internal" task: "dns_propagation_internal" configured period: every 1m currently executing: no - last completed activation: iter 5, triggered by a dependent task completing + last completed activation: iter 4, triggered by a dependent task completing started at (s ago) and ran for ms attempt to propagate generation: 1 @@ -341,7 +341,7 @@ task: "dns_servers_external" task: "dns_propagation_external" configured period: every 1m currently executing: no - last completed activation: iter 5, triggered by a dependent task completing + last completed activation: iter 4, triggered by a dependent task completing started at (s ago) and ran for ms attempt to propagate generation: 2 diff --git a/internal-dns/src/names.rs b/internal-dns/src/names.rs index e0c9b79555..bffe6e829a 100644 --- a/internal-dns/src/names.rs +++ b/internal-dns/src/names.rs @@ -65,7 +65,7 @@ impl ServiceName { /// Returns the DNS name for this service, ignoring the zone part of the DNS /// name - pub(crate) fn dns_name(&self) -> String { + pub fn dns_name(&self) -> String { match self { ServiceName::Clickhouse | 
ServiceName::ClickhouseKeeper diff --git a/nexus/src/app/background/dns_config.rs b/nexus/src/app/background/dns_config.rs index 959cf1843e..be18ac3612 100644 --- a/nexus/src/app/background/dns_config.rs +++ b/nexus/src/app/background/dns_config.rs @@ -159,7 +159,6 @@ impl BackgroundTask for DnsConfigWatcher { mod test { use crate::app::background::common::BackgroundTask; use crate::app::background::dns_config::DnsConfigWatcher; - use crate::app::background::init::test::read_internal_dns_zone_id; use crate::app::background::init::test::write_test_dns_generation; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; @@ -197,9 +196,7 @@ mod test { // Now write generation 2, activate again, and verify that the update // was sent to the watcher. - let internal_dns_zone_id = - read_internal_dns_zone_id(&opctx, &datastore).await; - write_test_dns_generation(&datastore, internal_dns_zone_id).await; + write_test_dns_generation(&opctx, &datastore).await; assert_eq!(watcher.borrow().as_ref().unwrap().generation, 1); let value = task.activate(&opctx).await; assert_eq!(watcher.borrow().as_ref().unwrap().generation, 2); diff --git a/nexus/src/app/background/dns_servers.rs b/nexus/src/app/background/dns_servers.rs index 97fb3510b7..8f4cce4ee0 100644 --- a/nexus/src/app/background/dns_servers.rs +++ b/nexus/src/app/background/dns_servers.rs @@ -7,22 +7,15 @@ use super::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; +use internal_dns::names::ServiceName; +use internal_dns::resolver::Resolver; use nexus_db_model::DnsGroup; -use nexus_db_model::ServiceKind; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::DataStore; -use omicron_common::api::external::DataPageParams; use serde::Serialize; use serde_json::json; -use std::net::{SocketAddr, SocketAddrV6}; -use std::num::NonZeroU32; -use std::sync::Arc; +use std::net::SocketAddr; use tokio::sync::watch; -// This constraint could be relaxed by paginating through the list of servers, -// but we don't expect to have this many servers any time soon. -const MAX_DNS_SERVERS: usize = 10; - #[derive(Debug, Clone, Eq, PartialEq, Serialize)] pub struct DnsServersList { pub addresses: Vec, @@ -31,20 +24,17 @@ pub struct DnsServersList { /// Background task that keeps track of the latest list of DNS servers for a DNS /// group pub struct DnsServersWatcher { - datastore: Arc, dns_group: DnsGroup, + resolver: Resolver, last: Option, tx: watch::Sender>, rx: watch::Receiver>, } impl DnsServersWatcher { - pub fn new( - datastore: Arc, - dns_group: DnsGroup, - ) -> DnsServersWatcher { + pub fn new(dns_group: DnsGroup, resolver: Resolver) -> DnsServersWatcher { let (tx, rx) = watch::channel(None); - DnsServersWatcher { datastore, dns_group, last: None, tx, rx } + DnsServersWatcher { dns_group, last: None, tx, rx, resolver } } /// Exposes the latest list of DNS servers for this DNS group @@ -75,58 +65,38 @@ impl BackgroundTask for DnsServersWatcher { }; // Read the latest service configuration for this DNS group. 
- let service_kind = match self.dns_group { - DnsGroup::Internal => ServiceKind::InternalDns, - DnsGroup::External => ServiceKind::ExternalDns, + let service_name = match self.dns_group { + DnsGroup::Internal => ServiceName::InternalDns, + DnsGroup::External => ServiceName::ExternalDns, }; - let pagparams = DataPageParams { - marker: None, - limit: NonZeroU32::try_from( - u32::try_from(MAX_DNS_SERVERS).unwrap(), - ) - .unwrap(), - direction: dropshot::PaginationOrder::Ascending, + let result = self.resolver.lookup_all_socket_v6(service_name).await; + let addresses = match result { + Err(error) => { + warn!( + &log, + "failed to lookup DNS servers"; + "error" => format!("{:#}", error) + ); + return json!({ + "error": + format!( + "failed to read list of DNS servers: {:#}", + error + ) + }); + } + Ok(addresses) => { + // TODO(eliza): it would be nicer if `Resolver` had a method + // returning an iterator instead of a `Vec`, so we didn't + // have to drain the Vec and then collect it into a new + // one... + addresses.into_iter().map(SocketAddr::V6).collect() + } }; - let result = self - .datastore - .services_list_kind(opctx, service_kind, &pagparams) - .await; - - if let Err(error) = result { - warn!( - &log, - "failed to read list of DNS servers"; - "error" => format!("{:#}", error) - ); - return json!({ - "error": - format!( - "failed to read list of DNS servers: {:#}", - error - ) - }); - } - - let services = result.unwrap(); - if services.len() >= MAX_DNS_SERVERS { - warn!( - &log, - "found {} servers, which is more than MAX_DNS_SERVERS \ - ({}). There may be more that will not be used.", - services.len(), - MAX_DNS_SERVERS - ); - } - - let new_config = DnsServersList { - addresses: services - .into_iter() - .map(|s| SocketAddrV6::new(*s.ip, *s.port, 0, 0).into()) - .collect(), - }; - let new_addrs_dbg = format!("{:?}", new_config); + let new_config = DnsServersList { addresses }; + let new_addrs_dbg = format!("{new_config:?}"); let rv = serde_json::to_value(&new_config).unwrap_or_else(|error| { json!({ @@ -177,119 +147,3 @@ impl BackgroundTask for DnsServersWatcher { .boxed() } } - -#[cfg(test)] -mod test { - use crate::app::background::common::BackgroundTask; - use crate::app::background::dns_servers::DnsServersList; - use crate::app::background::dns_servers::DnsServersWatcher; - use crate::app::background::dns_servers::MAX_DNS_SERVERS; - use assert_matches::assert_matches; - use async_bb8_diesel::AsyncRunQueryDsl; - use diesel::ExpressionMethods; - use diesel::QueryDsl; - use nexus_db_model::DnsGroup; - use nexus_db_queries::context::OpContext; - use nexus_db_queries::db::model::Service; - use nexus_db_queries::db::model::ServiceKind; - use nexus_test_utils_macros::nexus_test; - use std::net::Ipv6Addr; - use std::net::SocketAddrV6; - use uuid::Uuid; - - type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext; - - #[nexus_test(server = crate::Server)] - async fn test_basic(cptestctx: &ControlPlaneTestContext) { - let nexus = &cptestctx.server.apictx().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - // Verify the initial state. - let mut task = - DnsServersWatcher::new(datastore.clone(), DnsGroup::Internal); - let watcher = task.watcher(); - assert_matches!(*watcher.borrow(), None); - - // The datastore from the ControlPlaneTestContext is initialized with - // one DNS server. 
- let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == 1); - - // If we add another server, we should see it. - { - use nexus_db_queries::db::schema::service::dsl; - diesel::insert_into(dsl::service) - .values(Service::new( - Uuid::new_v4(), - Uuid::new_v4(), - Some(Uuid::new_v4()), - SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0), - ServiceKind::InternalDns, - )) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == 2); - - // If we add MAX_DNS_SERVERS more servers, we should see - // MAX_DNS_SERVERS. - { - use nexus_db_queries::db::schema::service::dsl; - let new_services = (0..u16::try_from(MAX_DNS_SERVERS).unwrap()) - .map(|i| { - Service::new( - Uuid::new_v4(), - Uuid::new_v4(), - Some(Uuid::new_v4()), - SocketAddrV6::new(Ipv6Addr::LOCALHOST, i + 2, 0, 0), - ServiceKind::InternalDns, - ) - }) - .collect::>(); - - diesel::insert_into(dsl::service) - .values(new_services) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == MAX_DNS_SERVERS); - - // Now delete all the servers and try again. - { - use nexus_db_queries::db::schema::service::dsl; - diesel::delete( - dsl::service.filter(dsl::kind.eq(ServiceKind::InternalDns)), - ) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.is_empty()); - } -} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 9d078f10d0..27e58a298c 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -101,6 +101,7 @@ impl BackgroundTasks { opctx, datastore.clone(), DnsGroup::Internal, + resolver.clone(), &config.dns_internal, ); let (task_external_dns_config, task_external_dns_servers) = init_dns( @@ -108,6 +109,7 @@ impl BackgroundTasks { opctx, datastore.clone(), DnsGroup::External, + resolver.clone(), &config.dns_external, ); @@ -301,6 +303,7 @@ fn init_dns( opctx: &OpContext, datastore: Arc, dns_group: DnsGroup, + resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, ) -> (common::TaskHandle, common::TaskHandle) { let dns_group_name = dns_group.to_string(); @@ -321,13 +324,13 @@ fn init_dns( ); // Background task: DNS server list watcher - let dns_servers = dns_servers::DnsServersWatcher::new(datastore, dns_group); + let dns_servers = dns_servers::DnsServersWatcher::new(dns_group, resolver); let dns_servers_watcher = dns_servers.watcher(); let task_name_servers = format!("dns_servers_{}", dns_group); let task_servers = driver.register( task_name_servers.clone(), format!( - "watches list of {} DNS servers stored in CockroachDB", + "watches list of {} DNS servers stored in internal DNS", dns_group, ), config.period_secs_servers, @@ -361,22 +364,17 @@ fn init_dns( #[cfg(test)] pub mod test { - use async_bb8_diesel::AsyncRunQueryDsl; use dropshot::HandlerTaskMode; use nexus_db_model::DnsGroup; - use nexus_db_model::Generation; use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; 
use nexus_db_queries::db::DataStore; use nexus_test_utils_macros::nexus_test; use nexus_types::internal_api::params as nexus_params; - use nexus_types::internal_api::params::ServiceKind; - use omicron_common::api::external::DataPageParams; use omicron_test_utils::dev::poll; use std::net::SocketAddr; - use std::num::NonZeroU32; use std::time::Duration; use tempfile::TempDir; - use uuid::Uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -423,12 +421,23 @@ pub mod test { .expect("failed to get initial DNS server config"); assert_eq!(config.generation, 1); - // We'll need the id of the internal DNS zone. - let internal_dns_zone_id = - read_internal_dns_zone_id(&opctx, datastore).await; + let internal_dns_srv_name = + internal_dns::ServiceName::InternalDns.dns_name(); + + let initial_srv_record = { + let zone = + config.zones.get(0).expect("DNS config must have a zone"); + let Some(record) = zone.records.get(&internal_dns_srv_name) else { + panic!("zone must have a record for {internal_dns_srv_name}") + }; + match record.get(0) { + Some(dns_service_client::types::DnsRecord::Srv(srv)) => srv, + record => panic!("expected a SRV record for {internal_dns_srv_name}, found {record:?}"), + } + }; // Now spin up another DNS server, add it to the list of servers, and - // make sure that DNS gets propagated to it. Note that we shouldn't + // make sure that DNS gets propagated to it. Note that we shouldn't // have to explicitly activate the background task because inserting a // new service ought to do that for us. let log = &cptestctx.logctx.log; @@ -467,29 +476,76 @@ pub mod test { SocketAddr::V4(_) => panic!("expected v6 address"), SocketAddr::V6(a) => a, }; + + // In order to test that DNS gets propagated to a newly-added server, we + // first need to update the source of truth about DNS (the database). + // Then we need to wait for that to get propagated (by this same + // mechanism) to the existing DNS servers. Only then would we expect + // the mechanism to see the new DNS server and then propagate + // configuration to it. + let update = { + use nexus_params::{DnsRecord, Srv}; + + let target = "my-great-dns-server.host"; + + let mut update = test_dns_update_builder(); + update.remove_name(internal_dns_srv_name.clone()).unwrap(); + update + .add_name( + internal_dns_srv_name, + vec![ + DnsRecord::Srv(Srv { + prio: 0, + weight: 0, + port: new_dns_addr.port(), + target: format!( + "{target}.control-plane.oxide.internal" + ), + }), + DnsRecord::Srv(initial_srv_record.clone()), + ], + ) + .unwrap(); + update + .add_name( + target.to_string(), + vec![DnsRecord::Aaaa(*new_dns_addr.ip())], + ) + .unwrap(); + update + }; + write_dns_update(&opctx, datastore, update).await; + info!(&cptestctx.logctx.log, "updated new dns records"); + + // Activate the internal DNS propagation pipeline. nexus - .upsert_service( - &opctx, - Uuid::new_v4(), - cptestctx.sled_agent.sled_agent.id, - Some(Uuid::new_v4()), - new_dns_addr, - ServiceKind::InternalDns.into(), - ) - .await - .unwrap(); + .background_tasks + .activate(&nexus.background_tasks.task_internal_dns_config); + + wait_propagate_dns( + &cptestctx.logctx.log, + "initial", + initial_dns_dropshot_server.local_addr(), + 2, + ) + .await; + + // Discover the new internal DNS server from internal DNS. 
+ nexus + .background_tasks + .activate(&nexus.background_tasks.task_internal_dns_servers); wait_propagate_dns( &cptestctx.logctx.log, "new", new_dns_dropshot_server.local_addr(), - 1, + 2, ) .await; - // Now, write version 2 of the internal DNS configuration with one + // Now, write version 3 of the internal DNS configuration with one // additional record. - write_test_dns_generation(datastore, internal_dns_zone_id).await; + write_test_dns_generation(&opctx, datastore).await; // Activate the internal DNS propagation pipeline. nexus @@ -501,7 +557,7 @@ pub mod test { &cptestctx.logctx.log, "initial", initial_dns_dropshot_server.local_addr(), - 2, + 3, ) .await; @@ -509,7 +565,7 @@ pub mod test { &cptestctx.logctx.log, "new", new_dns_dropshot_server.local_addr(), - 2, + 3, ) .await; } @@ -522,15 +578,16 @@ pub mod test { generation: u64, ) { println!( - "waiting for propagation of generation {} to {} DNS server ({})", - generation, label, addr + "waiting for propagation of generation {generation} to {label} \ + DNS server ({addr})", ); let client = dns_service_client::Client::new( &format!("http://{}", addr), log.clone(), ); - poll::wait_for_condition( + let poll_max = Duration::from_secs(30); + let result = poll::wait_for_condition( || async { match client.dns_config_get().await { Err(error) => { @@ -548,87 +605,51 @@ pub mod test { } }, &Duration::from_millis(50), - &Duration::from_secs(30), + &poll_max, ) - .await - .expect("DNS config not propagated in expected time"); + .await; + if let Err(err) = result { + panic!( + "DNS generation {generation} not propagated to \ + {label} DNS server ({addr}) within {poll_max:?}: {err}" + ); + } else { + println!( + "DNS generation {generation} propagated to {label} \ + DNS server ({addr}) successfully." + ); + } } - pub(crate) async fn write_test_dns_generation( + pub(crate) async fn write_dns_update( + opctx: &OpContext, datastore: &DataStore, - internal_dns_zone_id: Uuid, + update: DnsVersionUpdateBuilder, ) { - { - let conn = datastore.pool_connection_for_tests().await.unwrap(); - let _: Result<(), _> = datastore - .transaction_retry_wrapper("write_test_dns_generation") - .transaction(&conn, |conn| async move { - { - use nexus_db_queries::db::model::DnsVersion; - use nexus_db_queries::db::schema::dns_version::dsl; - - diesel::insert_into(dsl::dns_version) - .values(DnsVersion { - dns_group: DnsGroup::Internal, - version: Generation(2u32.try_into().unwrap()), - time_created: chrono::Utc::now(), - creator: String::from("test suite"), - comment: String::from("test suite"), - }) - .execute_async(&conn) - .await - .unwrap(); - } - - { - use nexus_db_queries::db::model::DnsName; - use nexus_db_queries::db::schema::dns_name::dsl; - - diesel::insert_into(dsl::dns_name) - .values( - DnsName::new( - internal_dns_zone_id, - String::from("we-got-beets"), - Generation(2u32.try_into().unwrap()), - None, - vec![nexus_params::DnsRecord::Aaaa( - "fe80::3".parse().unwrap(), - )], - ) - .unwrap(), - ) - .execute_async(&conn) - .await - .unwrap(); - } - - Ok(()) - }) - .await; - } + let conn = datastore.pool_connection_for_tests().await.unwrap(); + info!(opctx.log, "writing DNS update..."); + datastore.dns_update(opctx, &conn, update).await.unwrap(); } - pub(crate) async fn read_internal_dns_zone_id( + pub(crate) async fn write_test_dns_generation( opctx: &OpContext, datastore: &DataStore, - ) -> Uuid { - let dns_zones = datastore - .dns_zones_list( - &opctx, - DnsGroup::Internal, - &DataPageParams { - marker: None, - direction: 
dropshot::PaginationOrder::Ascending, - limit: NonZeroU32::new(2).unwrap(), - }, + ) { + let mut update = test_dns_update_builder(); + update + .add_name( + "we-got-beets".to_string(), + vec![nexus_params::DnsRecord::Aaaa("fe80::3".parse().unwrap())], ) - .await .unwrap(); - assert_eq!( - dns_zones.len(), - 1, - "expected exactly one internal DNS zone" - ); - dns_zones[0].id + write_dns_update(opctx, datastore, update).await + } + + fn test_dns_update_builder() -> DnsVersionUpdateBuilder { + DnsVersionUpdateBuilder::new( + DnsGroup::Internal, + "test suite DNS update".to_string(), + "test suite".to_string(), + ) } } diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 943490ac04..738dae1d6d 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -23,9 +23,6 @@ use std::net::SocketAddrV6; use std::sync::Arc; use uuid::Uuid; -#[cfg(test)] -use nexus_db_queries::db::model::ServiceKind; - impl super::Nexus { // Sleds pub fn sled_lookup<'a>( @@ -276,41 +273,6 @@ impl super::Nexus { Ok(()) } - // Services - - /// Upserts a Service into the database, updating it if it already exists. - #[cfg(test)] - pub(crate) async fn upsert_service( - &self, - opctx: &OpContext, - id: Uuid, - sled_id: Uuid, - zone_id: Option, - address: SocketAddrV6, - kind: ServiceKind, - ) -> Result<(), Error> { - info!( - self.log, - "upserting service"; - "sled_id" => sled_id.to_string(), - "service_id" => id.to_string(), - "address" => address.to_string(), - ); - let service = - db::model::Service::new(id, sled_id, zone_id, address, kind); - self.db_datastore.service_upsert(opctx, service).await?; - - if kind == ServiceKind::ExternalDns { - self.background_tasks - .activate(&self.background_tasks.task_external_dns_servers); - } else if kind == ServiceKind::InternalDns { - self.background_tasks - .activate(&self.background_tasks.task_internal_dns_servers); - } - - Ok(()) - } - /// Ensure firewall rules for internal services get reflected on all the relevant sleds. pub(crate) async fn plumb_service_firewall_rules( &self, From f1eb93d488b4337a02b010d9f73a08a813d7f001 Mon Sep 17 00:00:00 2001 From: "oxide-reflector-bot[bot]" <130185838+oxide-reflector-bot[bot]@users.noreply.github.com> Date: Tue, 13 Feb 2024 03:44:26 +0000 Subject: [PATCH 2/5] Update dendrite to 3618dd6 (#5049) Updated dendrite to commit 3618dd6. Co-authored-by: reflector[bot] <130185838+reflector[bot]@users.noreply.github.com> --- package-manifest.toml | 12 ++++++------ tools/dendrite_openapi_version | 2 +- tools/dendrite_stub_checksums | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/package-manifest.toml b/package-manifest.toml index 9858fd980e..9b72dd7d18 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -497,8 +497,8 @@ only_for_targets.image = "standard" # 2. Copy dendrite.tar.gz from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "d58b59b2682183855cdf137ab86a298ecdaaf697" -source.sha256 = "d48408ac1c12332c0bf95834a9372f20d9ff53efa1d79513ea868f616a91cb97" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "eb98985871f321411f7875ef7751dba85ae0dd3034877b63ccb78cedcb96e6e7" output.type = "zone" output.intermediate_only = true @@ -522,8 +522,8 @@ only_for_targets.image = "standard" # 2. 
Copy the output zone image from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "d58b59b2682183855cdf137ab86a298ecdaaf697" -source.sha256 = "fb1b276c48d320a129376f8d574f61474734de20ed3e6f90408c56a3d205c597" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "cc0429f0d9ce6df94e834cea89cabbdf4d1fbfe623369dd3eb84c5b2677414be" output.type = "zone" output.intermediate_only = true @@ -540,8 +540,8 @@ only_for_targets.image = "standard" # 2. Copy dendrite.tar.gz from dendrite/out to omicron/out/dendrite-softnpu.tar.gz source.type = "prebuilt" source.repo = "dendrite" -source.commit = "d58b59b2682183855cdf137ab86a298ecdaaf697" -source.sha256 = "197ed4e1fde14062bf6ae037895c72571b028078ff5691591aec3ad063cad3ea" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "fa25585fb3aa1a888b76133af3060b859cbea8e53287bb1cc64e70889db37679" output.type = "zone" output.intermediate_only = true diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index 69283e142d..6895170e02 100644 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="d58b59b2682183855cdf137ab86a298ecdaaf697" +COMMIT="3618dd6017b363c5d34399273453cf50b9c9a43e" SHA2="aa670165e5b459fab4caba36ae4d382a09264ff5cf6a2dac0dae0a0db39a378e" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index 18e525c0d3..74b379f359 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="d48408ac1c12332c0bf95834a9372f20d9ff53efa1d79513ea868f616a91cb97" -CIDL_SHA256_LINUX_DPD="b6d45d713a879f1d03422eb0bbfa33767da776b05b0d5fbb855b602974ceba01" +CIDL_SHA256_ILLUMOS="eb98985871f321411f7875ef7751dba85ae0dd3034877b63ccb78cedcb96e6e7" +CIDL_SHA256_LINUX_DPD="cb9a1978d1fe3a3f2391757f80436d8cc87c0041161652ad2234e7cf83e9ae36" CIDL_SHA256_LINUX_SWADM="b7e737be56a8a815a95624f0b5c42ce1e339b07feeae7b3d7b9b4bc17c204245" From 5e9e59c312cd787ba50cf6776f220951917dfa14 Mon Sep 17 00:00:00 2001 From: "Andrew J. Stone" Date: Tue, 13 Feb 2024 12:04:22 -0500 Subject: [PATCH 3/5] Remove thing-flinger (#5038) Thing-flinger was used to deploy multi-node Omicron clusters running on commodity hardware for early testing, primarily experimentation with trust-quorum secret sharing and other preliminary multi-node work. It deployed via SSH and was never intended to be a permanent tool: it was a stop-gap that nobody has used in a while, now that we have real hardware with wicket and the a4x2 falcon testbed.
Fixes #5034 --- Cargo.lock | 39 - Cargo.toml | 2 - dev-tools/thing-flinger/.gitignore | 1 - dev-tools/thing-flinger/Cargo.toml | 21 - dev-tools/thing-flinger/README.adoc | 222 ---- .../src/bin/deployment-example.toml | 36 - .../thing-flinger/src/bin/thing-flinger.rs | 968 ------------------ 7 files changed, 1289 deletions(-) delete mode 100644 dev-tools/thing-flinger/.gitignore delete mode 100644 dev-tools/thing-flinger/Cargo.toml delete mode 100644 dev-tools/thing-flinger/README.adoc delete mode 100644 dev-tools/thing-flinger/src/bin/deployment-example.toml delete mode 100644 dev-tools/thing-flinger/src/bin/thing-flinger.rs diff --git a/Cargo.lock b/Cargo.lock index dcdc4b9832..caddb6d8ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1235,20 +1235,6 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7059fff8937831a9ae6f0fe4d658ffabf58f2ca96aa9dec1c889f936f705f216" -[[package]] -name = "crossbeam" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" -dependencies = [ - "cfg-if", - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1283,16 +1269,6 @@ dependencies = [ "scopeguard", ] -[[package]] -name = "crossbeam-queue" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.16" @@ -4915,21 +4891,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "omicron-deploy" -version = "0.1.0" -dependencies = [ - "anyhow", - "camino", - "clap 4.4.3", - "crossbeam", - "omicron-package", - "omicron-workspace-hack", - "serde", - "serde_derive", - "thiserror", -] - [[package]] name = "omicron-dev" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 16f064d64d..d33758fc06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,6 @@ members = [ "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/oxlog", - "dev-tools/thing-flinger", "dev-tools/xtask", "dns-server", "end-to-end-tests", @@ -96,7 +95,6 @@ default-members = [ "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/oxlog", - "dev-tools/thing-flinger", # Do not include xtask in the list of default members, because this causes # hakari to not work as well and build times to be longer. # See omicron#4392. 
diff --git a/dev-tools/thing-flinger/.gitignore b/dev-tools/thing-flinger/.gitignore deleted file mode 100644 index ea8c4bf7f3..0000000000 --- a/dev-tools/thing-flinger/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/target diff --git a/dev-tools/thing-flinger/Cargo.toml b/dev-tools/thing-flinger/Cargo.toml deleted file mode 100644 index a427685871..0000000000 --- a/dev-tools/thing-flinger/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "omicron-deploy" -description = "Tools for deploying Omicron software to target machines" -version = "0.1.0" -edition = "2021" -license = "MPL-2.0" - -[dependencies] -anyhow.workspace = true -camino.workspace = true -clap.workspace = true -crossbeam.workspace = true -omicron-package.workspace = true -serde.workspace = true -serde_derive.workspace = true -thiserror.workspace = true -omicron-workspace-hack.workspace = true - -[[bin]] -name = "thing-flinger" -doc = false diff --git a/dev-tools/thing-flinger/README.adoc b/dev-tools/thing-flinger/README.adoc deleted file mode 100644 index 9966a7b747..0000000000 --- a/dev-tools/thing-flinger/README.adoc +++ /dev/null @@ -1,222 +0,0 @@ -Omicron is a complex piece of software consisting of many build and install-time dependencies. It's -intended to run primarily on illumos based systems, and as such is built to use runtime facilities -of illumos, such as https://illumos.org/man/5/smf[SMF]. Furthermore, Omicron is fundamentally a -distributed system, with its components intended to run on multiple servers communicating over the -network. In order to secure the system, certain cryptographic primitives, such as asymmetric key -pairs and shared secrets are required. Due to the nature of these cryptographic primitives, there is -a requirement for the distribution or creation of files unique to a specific server, such that no -other server has access to those files. Examples of this are private keys, and threshold key -shares, although other non-cryptographic unique files may also become necessary over time. - -In order to satisfy the above requirements of building and deploying a complex distributed system -consisting of unique, private files, two CLI tools have been created: - - . link:src/bin/omicron-package.rs[omicron-package] - build, package, install on local machine - . link:src/bin/thing-flinger.rs[thing-flinger] - build, package, deploy to remote machines - - -If a user is working on their local illumos based machine, and only wants to run -omicron in single node mode, they should follow the install instruction in -the link:../README.adoc[Omicron README] and use `omicron-package`. If the user -wishes for a more complete workflow, where they can code on their local laptop, -use a remote build machine, and install to multiple machines for a more realistic -deployment, they should use `thing-flinger`. - -The remainder of this document will describe a typical workflow for using -thing-flinger, pointing out room for improvement. 
- -== Environment and Configuration - - - +------------------+ +------------------+ - | | | | - | | | | - | Client |----------------> Builder | - | | | | - | | | | - +------------------+ +------------------+ - | - | - | - | - +---------------------------+--------------------------+ - | | | - | | | - | | | - +--------v---------+ +---------v--------+ +---------v--------+ - | | | | | | - | | | | | | - | Deployed Server | | Deployed Server | | Deployed Server | - | | | | | | - | | | | | | - +------------------+ +------------------+ +------------------+ - - -`thing-flinger` defines three types of nodes: - - * Client - Where a user typically edits their code and runs thing-flinger. This can run any OS. - * Builder - A Helios box where Omicron is built and packaged - * Deployed Server - Helios machines where Omicron will be installed and run - -It's not at all necessary for these to be separate nodes. For example, a client and builder can be -the same machine, as long as it's a Helios box. Same goes for Builder and a deployment server. The -benefit of this separation though, is that it allows editing on something like a laptop, without -having to worry about setting up a development environment on an illumos based host. - -Machine topology is configured in a `TOML` file that is passed on the command line. All illumos -machines are listed under `servers`, and just the names are used for configuring a builder and -deployment servers. An link:src/bin/deployment-example.toml[example] is provided. - -Thing flinger works over SSH, and so the user must have the public key of their client configured -for their account on all servers. SSH agent forwarding is used to prevent the need for the keys of -the builder to also be on the other servers, thus minimizing needed server configuration. - -== Typical Workflow - -=== Prerequisites - -Ensure you have an account on all illumos boxes, with the client public key in -`~/.ssh/authorized_keys`. - -.The build machine must have Rust and cargo installed, as well as -all the dependencies for Omicron installed. Following the *prerequisites* in the -https://github.com/oxidecomputer/omicron/#build-and-run[Build and run] section of the main Omicron -README is probably a good idea. - -==== Update `config-rss.toml` - -Currently rack setup is driven by a configuration file that lives at -`smf/sled-agent/non-gimlet/config-rss.toml` in the root of this repository. The committed -configuration of that file contains a single `requests` entry (with many -services inside it), which means it will start services on only one sled. To -start services (e.g., nexus) on multiple sleds, add additional entries to that -configuration file before proceeding. - -=== Command Based Workflow - -==== sync -Copy your source code to the builder. - -`+cargo run --bin thing-flinger -- -c sync+` - -==== Install Prerequisites -Install necessary build and runtime dependencies (including downloading prebuilt -binaries like Clickhouse and CockroachDB) on the builder and all deployment -targets. This step only needs to be performed once, absent any changes to the -dependencies, but is idempotent so may be run multiple times. - -`+cargo run --bin thing-flinger -- -c install-prereqs+` - -==== check (optional) -Run `cargo check` on the builder against the copy of `omicron` that was sync'd -to it in the previous step. - -`+cargo run --bin thing-flinger -- -c build check+` - -==== package -Build and package omicron using `omicron-package` on the builder. 
- -`+cargo run --bin thing-flinger -- -c build package+` - -==== overlay -Create files that are unique to each deployment server. - -`+cargo run --bin thing-flinger -- -c overlay+` - -==== install -Install omicron to all machines, in parallel. This consists of copying the packaged omicron tarballs -along with overlay files, and omicron-package and its manifest to a `staging` directory on each -deployment server, and then running omicron-package, installing overlay files, and restarting -services. - -`+cargo run --bin thing-flinger -- -c deploy install+` - -==== uninstall -Uninstall omicron from all machines. - -`+cargo run --bin thing-flinger -- -c deploy uninstall+` - -=== Current Limitations - -`thing-flinger` is an early prototype. It has served so far to demonstrate that unique files, -specifically secret shares, can be created and distributed over ssh, and that omicron can be -installed remotely using `omicron-package`. It is not currently complete enough to fully test a -distributed omicron setup, as the underlying dependencies are not configured yet. Specifically, -`CockroachDB` and perhaps `Clickhouse`, need to be configured to run in multiple server mode. It's -anticipated that the `overlay` feature of `thing-flinger` can be used to generate and distribute -configs for this. - -=== Design rationale - -`thing-flinger` is a command line program written in rust. It was written this way to build upon -`omicron-package`, which is also in rust, as that is our default language of choice at Oxide. -`thing-flinger` is based around SSH, as that is the minimal viable requirement for a test tool such -as this. Additionally, it provides for the most straightforward implementation, and takes the least -effort to use securely. This particular implementation wraps the openssh ssh client via -`std::process::Command`, rather than using the `ssh2` crate, because ssh2, as a wrapper around -`libssh`, does not support agent-forwarding. - -== Notes on Using VMs as Deployed Servers on a Linux Host - -TODO: This section should be fleshed out more and potentially lifted to its own -document; for now this is a collection of rough notes. - ---- - -It's possible to use a Linux libvirt host running multiple helios VMs as the -builder/deployment server targets, but it requires some additional setup beyond -`https://github.com/oxidecomputer/helios-engvm[helios-engvm]`. - -`thing-flinger` does not have any support for running the -`tools/create_virtual_hardware.sh` script; this will need to be done by hand on -each VM. - ---- - -To enable communication between the VMs over their IPv6 bootstrap networks: - -1. Enable IPv6 and DHCP on the virtual network libvirt uses for the VMs; e.g., - -```xml - - - - - -``` - -After booting the VMs with this enabled, they should be able to ping each other -over their acquired IPv6 addresses, but connecting to each other over the -`bootstrap6` interface that sled-agent creates will fail. - -2. Explicitly add routes in the Linux host for the `bootstrap6` addresses, -specifying the virtual interface libvirt created that is used by the VMs. - -``` -bash% sudo ip -6 route add fdb0:5254:13:7331::1/64 dev virbr1 -bash% sudo ip -6 route add fdb0:5254:f0:acfd::1/64 dev virbr1 -``` - -3. Once the sled-agents advance sufficiently to set up `sled6` interfaces, -routes need to be added for them both in the Linux host and in the Helios VMs. 
-Assuming two sleds with these interfaces: - -``` -# VM 1 -vioif0/sled6 static ok fd00:1122:3344:1::1/64 -# VM 2 -vioif0/sled6 static ok fd00:1122:3344:2::1/64 -``` - -The Linux host needs to be told to route that subnet to the appropriate virtual -interface: - -``` -bash% ip -6 route add fd00:1122:3344::1/48 dev virbr1 -``` - -and each Helios VM needs to be told to route that subnet to the host gateway: - -``` -vm% pfexec route add -inet6 fd00:1122:3344::/48 $IPV6_HOST_GATEWAY_ADDR -``` diff --git a/dev-tools/thing-flinger/src/bin/deployment-example.toml b/dev-tools/thing-flinger/src/bin/deployment-example.toml deleted file mode 100644 index 6d85de2ba6..0000000000 --- a/dev-tools/thing-flinger/src/bin/deployment-example.toml +++ /dev/null @@ -1,36 +0,0 @@ -# This manifest describes the servers that omicron will be installed to, along -# with any ancillary information specific to a given server. -# -# It is ingested by the `thing-flinger` tool. - -# This must be an absolute path. It refers to the path to Omicron on the -# machine where thing-flinger is being executed. -omicron_path = "/local/path/to/omicron" - -[builder] -# `server` must refer to one of the `servers` in the servers table -server = "foo" -# This must be an absolute path. It refers to the path to Omicron on the -# builder server. -omicron_path = "/remote/path/to/omicron" - -[deployment] -# which server is responsible for running the rack setup service; must -# refer to one of the `servers` in the servers table -rss_server = "foo" -# Location where files to install will be placed before running -# `omicron-package install` -# -# This must be an absolute path -# We specifically allow for $HOME in validating the absolute path -staging_dir = "$HOME/omicron_staging" -# which servers to deploy -servers = ["foo", "bar"] - -[servers.foo] -username = "me" -addr = "foo" - -[servers.bar] -username = "me" -addr = "bar" diff --git a/dev-tools/thing-flinger/src/bin/thing-flinger.rs b/dev-tools/thing-flinger/src/bin/thing-flinger.rs deleted file mode 100644 index 43b137790d..0000000000 --- a/dev-tools/thing-flinger/src/bin/thing-flinger.rs +++ /dev/null @@ -1,968 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Utility for deploying Omicron to remote machines - -use omicron_package::{parse, BuildCommand, DeployCommand}; - -use camino::{Utf8Path, Utf8PathBuf}; -use std::collections::{BTreeMap, BTreeSet}; -use std::process::Command; - -use anyhow::{Context, Result}; -use clap::{Parser, Subcommand}; -use crossbeam::thread::{self, ScopedJoinHandle}; -use serde_derive::Deserialize; -use thiserror::Error; - -// A server on which omicron source should be compiled into packages. -#[derive(Deserialize, Debug)] -struct Builder { - server: String, - omicron_path: Utf8PathBuf, -} - -// A server on which an omicron package is deployed. 
-#[derive(Deserialize, Debug, Eq, PartialEq)] -struct Server { - username: String, - addr: String, -} - -#[derive(Deserialize, Debug)] -struct Deployment { - rss_server: String, - staging_dir: Utf8PathBuf, - servers: BTreeSet, -} - -#[derive(Debug, Deserialize)] -struct Config { - omicron_path: Utf8PathBuf, - builder: Builder, - servers: BTreeMap, - deployment: Deployment, - - #[serde(default)] - rss_config_path: Option, - - #[serde(default)] - debug: bool, -} - -impl Config { - fn release_arg(&self) -> &str { - if self.debug { - "" - } else { - "--release" - } - } - - fn deployment_servers(&self) -> impl Iterator { - self.servers.iter().filter_map(|(name, s)| { - if self.deployment.servers.contains(name) { - Some(s) - } else { - None - } - }) - } -} - -fn parse_into_set(src: &str) -> Result, &'static str> { - Ok(src.split_whitespace().map(|s| s.to_owned()).collect()) -} - -#[derive(Debug, Subcommand)] -enum SubCommand { - /// Run the given command on the given servers, or all servers if none are - /// specified. - /// - /// Be careful! - Exec { - /// The command to run - #[clap(short, long, action)] - cmd: String, - - /// The servers to run the command on - #[clap(short, long, value_parser = parse_into_set)] - servers: Option>, - }, - - /// Install necessary prerequisites on the "builder" server and all "deploy" - /// servers. - InstallPrereqs, - - /// Sync our local source to the build host - Sync, - - /// Runs a command on the "builder" server. - #[clap(name = "build", subcommand)] - Builder(BuildCommand), - - /// Runs a command on all the "deploy" servers. - #[clap(subcommand)] - Deploy(DeployCommand), - - /// Create an overlay directory tree for each deployment server - /// - /// Each directory tree contains unique files for the given server that will - /// be populated in the svc/pkg dir. - /// - /// This is a separate subcommand so that we can reconstruct overlays - /// without rebuilding or repackaging. - Overlay, -} - -#[derive(Debug, Parser)] -#[clap( - name = "thing-flinger", - about = "A tool for synchronizing packages and configs between machines" -)] -struct Args { - /// The path to the deployment manifest TOML file - #[clap( - short, - long, - help = "Path to deployment manifest toml file", - action - )] - config: Utf8PathBuf, - - #[clap( - short, - long, - help = "The name of the build target to use for this command" - )] - target: String, - - /// The output directory, where artifacts should be built and staged - #[clap(long = "artifacts", default_value = "out/")] - artifact_dir: Utf8PathBuf, - - #[clap(subcommand)] - subcommand: SubCommand, -} - -/// Errors which can be returned when executing subcommands -#[derive(Error, Debug)] -enum FlingError { - #[error("Servers not listed in configuration: {0:?}")] - InvalidServers(Vec), - - /// Failed to rsync omicron to build host - #[error("Failed to sync {src} with {dst}")] - FailedSync { src: String, dst: String }, - - /// The given path must be absolute - #[error("Path for {field} must be absolute")] - NotAbsolutePath { field: &'static str }, -} - -// How should `ssh_exec` be run? 
-enum SshStrategy { - // Forward agent and source .profile - Forward, - - // Don't forward agent, but source .profile - NoForward, - - // Don't forward agent and don't source .profile - NoForwardNoProfile, -} - -impl SshStrategy { - fn forward_agent(&self) -> bool { - match self { - SshStrategy::Forward => true, - _ => false, - } - } - - fn source_profile(&self) -> bool { - match self { - SshStrategy::Forward | &SshStrategy::NoForward => true, - _ => false, - } - } -} - -// TODO: run in parallel when that option is given -fn do_exec( - config: &Config, - cmd: String, - servers: Option>, -) -> Result<()> { - if let Some(ref servers) = servers { - validate_servers(servers, &config.servers)?; - - for name in servers { - let server = &config.servers[name]; - ssh_exec(&server, &cmd, SshStrategy::NoForward)?; - } - } else { - for (_, server) in config.servers.iter() { - ssh_exec(&server, &cmd, SshStrategy::NoForward)?; - } - } - Ok(()) -} - -// start an `rsync` command with args common to all our uses -fn rsync_common() -> Command { - let mut cmd = Command::new("rsync"); - cmd.arg("-az") - .arg("-e") - .arg("ssh -o StrictHostKeyChecking=no") - .arg("--delete") - .arg("--progress") - .arg("--out-format") - .arg("File changed: %o %t %f"); - cmd -} - -fn do_sync(config: &Config) -> Result<()> { - let builder = - config.servers.get(&config.builder.server).ok_or_else(|| { - FlingError::InvalidServers(vec![config.builder.server.clone()]) - })?; - - // For rsync to copy from the source appropriately we must guarantee a - // trailing slash. - let src = format!( - "{}/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? - ); - let dst = format!( - "{}@{}:{}", - builder.username, builder.addr, config.builder.omicron_path - ); - - println!("Synchronizing source files to: {}", dst); - let mut cmd = rsync_common(); - - // exclude build and development environment artifacts - cmd.arg("--exclude") - .arg("target/") - .arg("--exclude") - .arg("*.vdev") - .arg("--exclude") - .arg("*.swp") - .arg("--exclude") - .arg(".git/") - .arg("--exclude") - .arg("out/"); - - // exclude `config-rss.toml`, which needs to be sent to only one target - // system. we handle this in `do_overlay` below. 
- cmd.arg("--exclude").arg("**/config-rss.toml"); - - // finish with src/dst - cmd.arg(&src).arg(&dst); - let status = - cmd.status().context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err(FlingError::FailedSync { src, dst }.into()); - } - - Ok(()) -} - -fn copy_to_deployment_staging_dir( - config: &Config, - src: String, - description: &str, -) -> Result<()> { - let partial_cmd = || { - let mut cmd = rsync_common(); - cmd.arg("--relative"); - cmd.arg(&src); - cmd - }; - - // A function for each deployment server to run in parallel - let fns = config.deployment_servers().map(|server| { - || { - let dst = format!( - "{}@{}:{}", - server.username, server.addr, config.deployment.staging_dir - ); - let mut cmd = partial_cmd(); - cmd.arg(&dst); - let status = cmd - .status() - .context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err( - FlingError::FailedSync { src: src.clone(), dst }.into() - ); - } - Ok(()) - } - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel(description, named_fns); - - Ok(()) -} - -fn rsync_config_needed_for_tools(config: &Config) -> Result<()> { - let src = format!( - // the `./` here is load-bearing; it interacts with `--relative` to tell - // rsync to create `smf/sled-agent` but none of its parents - "{}/./smf/sled-agent/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? - ); - - copy_to_deployment_staging_dir(config, src, "Copy smf/sled-agent dir") -} - -fn rsync_tools_dir_to_deployment_servers(config: &Config) -> Result<()> { - // we need to rsync `./tools/*` to each of the deployment targets (the - // "builder" already has it via `do_sync()`), and then run `pfexec - // tools/install_prerequisites.sh` on each system. - let src = format!( - // the `./` here is load-bearing; it interacts with `--relative` to tell - // rsync to create `tools` but none of its parents - "{}/./tools/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? 
- ); - copy_to_deployment_staging_dir(config, src, "Copy tools dir") -} - -fn do_install_prereqs(config: &Config) -> Result<()> { - rsync_config_needed_for_tools(config)?; - rsync_tools_dir_to_deployment_servers(config)?; - install_rustup_on_deployment_servers(config); - create_virtual_hardware_on_deployment_servers(config); - create_external_tls_cert_on_builder(config)?; - - // Create a set of servers to install prereqs to - let builder = &config.servers[&config.builder.server]; - let build_server = (builder, &config.builder.omicron_path); - let all_servers = std::iter::once(build_server).chain( - config.deployment_servers().filter_map(|server| { - // Don't duplicate the builder - if server.addr != builder.addr { - Some((server, &config.deployment.staging_dir)) - } else { - None - } - }), - ); - - let server_names = std::iter::once(&config.builder.server).chain( - config - .deployment - .servers - .iter() - .filter(|s| **s != config.builder.server), - ); - - // Install functions to run in parallel on each server - let fns = all_servers.map(|(server, root_path)| { - || { - // -y: assume yes instead of prompting - // -p: skip check that deps end up in $PATH - let (script, script_type) = if *server == *builder { - ("install_builder_prerequisites.sh -y -p", "builder") - } else { - ("install_runner_prerequisites.sh -y", "runner") - }; - - let cmd = format!( - "cd {} && mkdir -p out && pfexec ./tools/{}", - root_path.clone(), - script - ); - println!( - "Install {} prerequisites on {}", - script_type, server.addr - ); - ssh_exec(server, &cmd, SshStrategy::NoForward) - } - }); - - let named_fns = server_names.zip(fns); - run_in_parallel("Install prerequisites", named_fns); - - Ok(()) -} - -fn create_external_tls_cert_on_builder(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - let cmd = format!( - "cd {} && ./tools/create_self_signed_cert.sh", - config.builder.omicron_path, - ); - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn create_virtual_hardware_on_deployment_servers(config: &Config) { - let cmd = format!( - "cd {} && pfexec ./tools/create_virtual_hardware.sh", - config.deployment.staging_dir - ); - let fns = config.deployment_servers().map(|server| { - || { - println!("Create virtual hardware on {}", server.addr); - ssh_exec(server, &cmd, SshStrategy::NoForward) - } - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel("Create virtual hardware", named_fns); -} - -fn install_rustup_on_deployment_servers(config: &Config) { - let cmd = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y"; - let fns = config.deployment_servers().map(|server| { - || ssh_exec(server, cmd, SshStrategy::NoForwardNoProfile) - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel("Install rustup", named_fns); -} - -// Build omicron-package and omicron-deploy on the builder -// -// We need to build omicron-deploy for overlay file generation -fn do_build_minimal(config: &Config) -> Result<()> { - let server = &config.servers[&config.builder.server]; - let cmd = format!( - "cd {} && cargo build {} -p {} -p {}", - config.builder.omicron_path, - config.release_arg(), - "omicron-package", - "omicron-deploy" - ); - ssh_exec(&server, &cmd, SshStrategy::NoForward) -} - -fn do_package(config: &Config, artifact_dir: Utf8PathBuf) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - - // We use a bash login shell to get a proper environment, so we have a path to - // 
postgres, and $DEP_PQ_LIBDIRS is filled in. This is required for building - // nexus. - // - // See https://github.com/oxidecomputer/omicron/blob/8757ec542ea4ffbadd6f26094ed4ba357715d70d/rpaths/src/lib.rs - let cmd = format!( - "bash -lc \ - 'cd {} && \ - cargo run {} --bin omicron-package -- package --out {}'", - config.builder.omicron_path, - config.release_arg(), - artifact_dir, - ); - - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn do_dot(_config: &Config) -> Result<()> { - anyhow::bail!("\"dot\" command is not supported for thing-flinger"); -} - -fn do_check(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - - let cmd = format!( - "bash -lc \ - 'cd {} && \ - cargo run {} --bin omicron-package -- check'", - config.builder.omicron_path, - config.release_arg(), - ); - - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn do_uninstall(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - for server in config.deployment_servers() { - copy_omicron_package_binary_to_staging(config, builder, server)?; - - // Run `omicron-package uninstall` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package uninstall", - config.deployment.staging_dir, - ); - println!("$ {}", cmd); - ssh_exec(&server, &cmd, SshStrategy::Forward)?; - } - Ok(()) -} - -fn do_clean( - config: &Config, - artifact_dir: Utf8PathBuf, - install_dir: Utf8PathBuf, -) -> Result<()> { - let mut deployment_src = Utf8PathBuf::from(&config.deployment.staging_dir); - deployment_src.push(&artifact_dir); - let builder = &config.servers[&config.builder.server]; - for server in config.deployment_servers() { - copy_omicron_package_binary_to_staging(config, builder, server)?; - - // Run `omicron-package uninstall` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package clean --in {} --out {}", - config.deployment.staging_dir, deployment_src, install_dir, - ); - println!("$ {}", cmd); - ssh_exec(&server, &cmd, SshStrategy::Forward)?; - } - Ok(()) -} - -fn run_in_parallel<'a, F>(op: &str, cmds: impl Iterator) -where - F: FnOnce() -> Result<()> + Send, -{ - thread::scope(|s| { - let named_handles: Vec<(_, ScopedJoinHandle<'_, Result<()>>)> = cmds - .map(|(server_name, f)| (server_name, s.spawn(|_| f()))) - .collect(); - - // Join all the handles and print the install status - for (server_name, handle) in named_handles { - match handle.join() { - Ok(Ok(())) => { - println!("{} completed for server: {}", op, server_name) - } - Ok(Err(e)) => { - println!( - "{} failed for server: {} with error: {}", - op, server_name, e - ) - } - Err(_) => { - println!( - "{} failed for server: {}. 
Thread panicked.", - op, server_name - ) - } - } - } - }) - .unwrap(); -} - -fn do_install( - config: &Config, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, -) { - let builder = &config.servers[&config.builder.server]; - let mut pkg_dir = Utf8PathBuf::from(&config.builder.omicron_path); - pkg_dir.push(artifact_dir); - - let fns = config.deployment.servers.iter().map(|server_name| { - (server_name, || { - single_server_install( - config, - &artifact_dir, - &install_dir, - pkg_dir.as_str(), - builder, - server_name, - ) - }) - }); - - run_in_parallel("Install", fns); -} - -fn do_overlay(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - let mut root_path = Utf8PathBuf::from(&config.builder.omicron_path); - // TODO: This needs to match the artifact_dir in `package` - root_path.push("out/overlay"); - - // Build a list of directories for each server to be deployed and tag which - // one is the server to run RSS; e.g., for servers ["foo", "bar", "baz"] - // with root_path "/my/path", we produce - // [ - // "/my/path/foo/sled-agent/pkg", - // "/my/path/bar/sled-agent/pkg", - // "/my/path/baz/sled-agent/pkg", - // ] - // As we're doing so, record which directory is the one for the server that - // will run RSS. - let mut rss_server_dir = None; - - for server_name in &config.deployment.servers { - let mut dir = root_path.clone(); - dir.push(server_name); - dir.push("sled-agent/pkg"); - if *server_name == config.deployment.rss_server { - rss_server_dir = Some(dir.clone()); - break; - } - } - - // we know exactly one of the servers matches `rss_server` from our config - // validation, so we can unwrap here - let rss_server_dir = rss_server_dir.unwrap(); - - overlay_rss_config(builder, config, &rss_server_dir)?; - - Ok(()) -} - -fn overlay_rss_config( - builder: &Server, - config: &Config, - rss_server_dir: &Utf8Path, -) -> Result<()> { - // Sync `config-rss.toml` to the directory for the RSS server on the - // builder. 
- let src = if let Some(src) = &config.rss_config_path { - src.clone() - } else { - config.omicron_path.join("smf/sled-agent/non-gimlet/config-rss.toml") - }; - let dst = format!( - "{}@{}:{}/config-rss.toml", - builder.username, builder.addr, rss_server_dir - ); - - let mut cmd = rsync_common(); - cmd.arg(&src).arg(&dst); - - let status = - cmd.status().context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err(FlingError::FailedSync { src: src.to_string(), dst }.into()); - } - - Ok(()) -} - -fn single_server_install( - config: &Config, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, - pkg_dir: &str, - builder: &Server, - server_name: &str, -) -> Result<()> { - let server = &config.servers[server_name]; - - println!( - "COPYING packages from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_package_artifacts_to_staging(config, pkg_dir, builder, server)?; - - println!( - "COPYING deploy tool from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_omicron_package_binary_to_staging(config, builder, server)?; - - println!( - "COPYING manifest from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_package_manifest_to_staging(config, builder, server)?; - - println!("UNPACKING packages on deploy server ({})", server_name); - run_omicron_package_unpack_from_staging( - config, - server, - &artifact_dir, - &install_dir, - )?; - - println!( - "COPYING overlay files from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_overlay_files_to_staging( - config, - pkg_dir, - builder, - server, - server_name, - )?; - - println!("INSTALLING overlay files into the install directory of the deploy server ({})", server_name); - install_overlay_files_from_staging(config, server, &install_dir)?; - - println!("STARTING services on the deploy server ({})", server_name); - run_omicron_package_activate_from_staging(config, server, &install_dir) -} - -// Copy package artifacts as a result of `omicron-package package` from the -// builder to the deployment server staging directory. -// -// This staging directory acts as an intermediate location where -// packages may reside prior to being installed. 
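// [Editor's note: a self-contained sketch of the rsync filter invocation that
// copy_package_artifacts_to_staging (the function that follows) builds as a
// remote command string, shown here as a local std::process::Command for
// clarity. The package directory, user, host, and staging path are
// hypothetical, and rsync must be installed for this to run.]
use std::process::Command;

fn sketch_copy_artifacts() -> std::io::Result<()> {
    // Include only the tarballs under out/; the trailing `--exclude '*'`
    // drops everything that did not match an include rule.
    let status = Command::new("rsync")
        .args(["-avz", "-e", "ssh -o StrictHostKeyChecking=no"])
        .args(["--include", "out/"])
        .args(["--include", "out/*.tar"])
        .args(["--include", "out/*.tar.gz"])
        .args(["--exclude", "*"])
        .arg("/work/omicron/out") // hypothetical pkg_dir
        .arg("deploy@server1:/staging") // hypothetical staging_dir
        .status()?;
    if !status.success() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "rsync failed",
        ));
    }
    Ok(())
}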
-fn copy_package_artifacts_to_staging( - config: &Config, - pkg_dir: &str, - builder: &Server, - destination: &Server, -) -> Result<()> { - let cmd = format!( - "rsync -avz -e 'ssh -o StrictHostKeyChecking=no' \ - --include 'out/' \ - --include 'out/*.tar' \ - --include 'out/*.tar.gz' \ - --exclude '*' \ - {} {}@{}:{}", - pkg_dir, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn copy_omicron_package_binary_to_staging( - config: &Config, - builder: &Server, - destination: &Server, -) -> Result<()> { - let mut bin_path = Utf8PathBuf::from(&config.builder.omicron_path); - bin_path.push(format!( - "target/{}/omicron-package", - if config.debug { "debug" } else { "release" } - )); - let cmd = format!( - "rsync -avz {} {}@{}:{}", - bin_path, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn copy_package_manifest_to_staging( - config: &Config, - builder: &Server, - destination: &Server, -) -> Result<()> { - let mut path = Utf8PathBuf::from(&config.builder.omicron_path); - path.push("package-manifest.toml"); - let cmd = format!( - "rsync {} {}@{}:{}", - path, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn run_omicron_package_activate_from_staging( - config: &Config, - destination: &Server, - install_dir: &Utf8Path, -) -> Result<()> { - // Run `omicron-package activate` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package activate --out {}", - config.deployment.staging_dir, install_dir, - ); - - println!("$ {}", cmd); - ssh_exec(destination, &cmd, SshStrategy::Forward) -} - -fn run_omicron_package_unpack_from_staging( - config: &Config, - destination: &Server, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, -) -> Result<()> { - let mut deployment_src = Utf8PathBuf::from(&config.deployment.staging_dir); - deployment_src.push(&artifact_dir); - - // Run `omicron-package unpack` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package unpack --in {} --out {}", - config.deployment.staging_dir, deployment_src, install_dir, - ); - - println!("$ {}", cmd); - ssh_exec(destination, &cmd, SshStrategy::Forward) -} - -fn copy_overlay_files_to_staging( - config: &Config, - pkg_dir: &str, - builder: &Server, - destination: &Server, - destination_name: &str, -) -> Result<()> { - let cmd = format!( - "rsync -avz {}/overlay/{}/ {}@{}:{}/overlay/", - pkg_dir, - destination_name, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn install_overlay_files_from_staging( - config: &Config, - destination: &Server, - install_dir: &Utf8Path, -) -> Result<()> { - let cmd = format!( - "pfexec cp -r {}/overlay/* {}", - config.deployment.staging_dir, install_dir - ); - println!("$ {}", cmd); - ssh_exec(&destination, &cmd, SshStrategy::NoForward) -} - -fn ssh_exec( - server: &Server, - remote_cmd: &str, - strategy: SshStrategy, -) -> Result<()> { - let remote_cmd = if strategy.source_profile() { - // Source .profile, so we have access to cargo. Rustup installs knowledge - // about the cargo path here. - String::from(". 
$HOME/.profile && ") + remote_cmd - } else { - remote_cmd.into() - }; - - let mut cmd = Command::new("ssh"); - if strategy.forward_agent() { - cmd.arg("-A"); - } - cmd.arg("-o") - .arg("StrictHostKeyChecking=no") - .arg("-l") - .arg(&server.username) - .arg(&server.addr) - .arg(&remote_cmd); - - // If the builder is the same as the client, this will likely not be set, - // as the keys will reside on the builder. - if let Some(auth_sock) = std::env::var_os("SSH_AUTH_SOCK") { - cmd.env("SSH_AUTH_SOCK", auth_sock); - } - let exit_status = cmd - .status() - .context(format!("Failed to run {} on {}", remote_cmd, server.addr))?; - if !exit_status.success() { - anyhow::bail!("Command failed: {}", exit_status); - } - - Ok(()) -} - -fn validate_servers( - chosen: &BTreeSet<String>, - all: &BTreeMap<String, Server>, -) -> Result<(), FlingError> { - let all = all.keys().cloned().collect(); - let diff: Vec<String> = chosen.difference(&all).cloned().collect(); - if !diff.is_empty() { - Err(FlingError::InvalidServers(diff)) - } else { - Ok(()) - } -} - -fn validate_absolute_path( - path: &Utf8Path, - field: &'static str, -) -> Result<(), FlingError> { - if path.is_absolute() || path.starts_with("$HOME") { - Ok(()) - } else { - Err(FlingError::NotAbsolutePath { field }) - } -} - -fn validate(config: &Config) -> Result<(), FlingError> { - validate_absolute_path(&config.omicron_path, "omicron_path")?; - validate_absolute_path( - &config.builder.omicron_path, - "builder.omicron_path", - )?; - validate_absolute_path( - &config.deployment.staging_dir, - "deployment.staging_dir", - )?; - - validate_servers( - &BTreeSet::from([ - config.builder.server.clone(), - config.deployment.rss_server.clone(), - ]), - &config.servers, - ) -} - -fn main() -> Result<()> { - let args = Args::try_parse()?; - let config = parse::<_, Config>(args.config)?; - - validate(&config)?; - - match args.subcommand { - SubCommand::Exec { cmd, servers } => { - do_exec(&config, cmd, servers)?; - } - SubCommand::Sync => do_sync(&config)?, - SubCommand::InstallPrereqs => do_install_prereqs(&config)?, - SubCommand::Builder(BuildCommand::Target { .. }) => { - todo!("Setting target not supported through thing-flinger") - } - SubCommand::Builder(BuildCommand::Package { .. }) => { - do_package(&config, args.artifact_dir)?; - } - SubCommand::Builder(BuildCommand::Stamp { .. }) => { - anyhow::bail!("Distributed package stamping not supported") - } - SubCommand::Builder(BuildCommand::Check) => do_check(&config)?, - SubCommand::Builder(BuildCommand::Dot) => { - do_dot(&config)?; - } - SubCommand::Deploy(DeployCommand::Install { install_dir }) => { - do_build_minimal(&config)?; - do_install(&config, &args.artifact_dir, &install_dir); - } - SubCommand::Deploy(DeployCommand::Uninstall) => { - do_build_minimal(&config)?; - do_uninstall(&config)?; - } - SubCommand::Deploy(DeployCommand::Clean { install_dir }) => { - do_build_minimal(&config)?; - do_clean(&config, args.artifact_dir, install_dir)?; - } - // TODO: It doesn't really make sense to allow the user direct access - // to these low level operations in thing-flinger. Should we not use - // the DeployCommand from omicron-package directly? - SubCommand::Deploy(_) => anyhow::bail!("Unsupported action"), - SubCommand::Overlay => do_overlay(&config)?, - } - Ok(()) -} From 6c9c9b73d82c91389f0c8ccb46e33aaaf0afdf03 Mon Sep 17 00:00:00 2001 From: "Andrew J. 
Stone" Date: Tue, 13 Feb 2024 14:58:05 -0500 Subject: [PATCH 4/5] Add `omdb sled-agent bootstore status` (#5041) Fixes #4295 and #3722 --- bootstore/src/schemes/v0/peer.rs | 8 ++ dev-tools/omdb/src/bin/omdb/sled_agent.rs | 56 ++++++++++++++ dev-tools/omdb/tests/usage_errors.out | 7 +- openapi/sled-agent.json | 93 +++++++++++++++++++++++ sled-agent/src/http_entrypoints.rs | 23 +++++- sled-agent/src/params.rs | 43 +++++++++++ 6 files changed, 226 insertions(+), 4 deletions(-) diff --git a/bootstore/src/schemes/v0/peer.rs b/bootstore/src/schemes/v0/peer.rs index 5290e64672..1175e64143 100644 --- a/bootstore/src/schemes/v0/peer.rs +++ b/bootstore/src/schemes/v0/peer.rs @@ -62,6 +62,14 @@ pub enum NodeRequestError { }, } +impl From for omicron_common::api::external::Error { + fn from(error: NodeRequestError) -> Self { + omicron_common::api::external::Error::internal_error(&format!( + "{error}" + )) + } +} + /// A request sent to the `Node` task from the `NodeHandle` pub enum NodeApiRequest { /// Initialize a rack at the behest of RSS running on the same scrimlet as diff --git a/dev-tools/omdb/src/bin/omdb/sled_agent.rs b/dev-tools/omdb/src/bin/omdb/sled_agent.rs index c413a2ba43..2d9e19d253 100644 --- a/dev-tools/omdb/src/bin/omdb/sled_agent.rs +++ b/dev-tools/omdb/src/bin/omdb/sled_agent.rs @@ -31,6 +31,10 @@ enum SledAgentCommands { /// print information about zpools #[clap(subcommand)] Zpools(ZpoolCommands), + + /// print information about the local bootstore node + #[clap(subcommand)] + Bootstore(BootstoreCommands), } #[derive(Debug, Subcommand)] @@ -45,6 +49,12 @@ enum ZpoolCommands { List, } +#[derive(Debug, Subcommand)] +enum BootstoreCommands { + /// Show the internal state of the local bootstore node + Status, +} + impl SledAgentArgs { /// Run a `omdb sled-agent` subcommand. 
pub(crate) async fn run_cmd( @@ -70,6 +80,9 @@ impl SledAgentArgs { SledAgentCommands::Zpools(ZpoolCommands::List) => { cmd_zpools_list(&client).await } + SledAgentCommands::Bootstore(BootstoreCommands::Status) => { + cmd_bootstore_status(&client).await + } } } } @@ -110,3 +123,46 @@ async fn cmd_zpools_list( Ok(()) } + +/// Runs `omdb sled-agent bootstore status` +async fn cmd_bootstore_status( + client: &sled_agent_client::Client, +) -> Result<(), anyhow::Error> { + let status = client.bootstore_status().await.context("bootstore status")?; + println!("fsm ledger generation: {}", status.fsm_ledger_generation); + println!( + "network config ledger generation: {:?}", + status.network_config_ledger_generation + ); + println!("fsm state: {}", status.fsm_state); + println!("peers (found by ddmd):"); + if status.peers.is_empty() { + println!(" <none>"); + } + for peer in status.peers.iter() { + println!(" {peer}"); + } + println!("established connections:"); + if status.established_connections.is_empty() { + println!(" <none>"); + } + for c in status.established_connections.iter() { + println!(" {:?} : {}", c.baseboard, c.addr); + } + println!("accepted connections:"); + if status.accepted_connections.is_empty() { + println!(" <none>"); + } + for addr in status.accepted_connections.iter() { + println!(" {addr}"); + } + println!("negotiating connections:"); + if status.negotiating_connections.is_empty() { + println!(" <none>"); + } + for addr in status.negotiating_connections.iter() { + println!(" {addr}"); + } + + Ok(()) +} diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 6f9b539371..bb7da1be57 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -331,9 +331,10 @@ Debug a specific Sled Usage: omdb sled-agent [OPTIONS] <COMMAND> Commands: - zones print information about zones - zpools print information about zpools - help Print this message or the help of the given subcommand(s) + zones print information about zones + zpools print information about zpools + bootstore print information about the local bootstore node + help Print this message or the help of the given subcommand(s) Options: --sled-agent-url <SLED_AGENT_URL> URL of the Sled internal API [env: OMDB_SLED_AGENT_URL=] diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 395394defb..99156fffd4 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -136,6 +136,30 @@ } } }, + "/bootstore/status": { + "get": { + "summary": "Get the internal state of the local bootstore node", + "operationId": "bootstore_status", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BootstoreStatus" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/cockroachdb": { "post": { "summary": "Initializes a CockroachDB cluster", @@ -2531,6 +2555,60 @@ } ] }, + "BootstoreStatus": { + "type": "object", + "properties": { + "accepted_connections": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "established_connections": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EstablishedConnection" + } + }, + "fsm_ledger_generation": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "fsm_state": { + "type": "string" + }, + "negotiating_connections": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, 
"network_config_ledger_generation": { + "nullable": true, + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "peers": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + } + }, + "required": [ + "accepted_connections", + "established_connections", + "fsm_ledger_generation", + "fsm_state", + "negotiating_connections", + "peers" + ] + }, "BundleUtilization": { "description": "The portion of a debug dataset used for zone bundles.", "type": "object", @@ -3847,6 +3925,21 @@ "request_id" ] }, + "EstablishedConnection": { + "type": "object", + "properties": { + "addr": { + "type": "string" + }, + "baseboard": { + "$ref": "#/components/schemas/Baseboard" + } + }, + "required": [ + "addr", + "baseboard" + ] + }, "Field": { "description": "A `Field` is a named aspect of a target or metric.", "type": "object", diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 0798aed664..5f888504db 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -8,7 +8,7 @@ use super::sled_agent::SledAgent; use crate::bootstrap::early_networking::EarlyNetworkConfig; use crate::bootstrap::params::AddSledRequest; use crate::params::{ - CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, + BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, Inventory, OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRulesEnsureBody, @@ -85,6 +85,7 @@ pub fn api() -> SledApiDescription { api.register(host_os_write_status_get)?; api.register(host_os_write_status_delete)?; api.register(inventory)?; + api.register(bootstore_status)?; Ok(()) } @@ -972,3 +973,23 @@ async fn inventory( let sa = request_context.context(); Ok(HttpResponseOk(sa.inventory()?)) } + +/// Get the internal state of the local bootstore node +#[endpoint { + method = GET, + path = "/bootstore/status", +}] +async fn bootstore_status( + request_context: RequestContext, +) -> Result, HttpError> { + let sa = request_context.context(); + let bootstore = sa.bootstore(); + let status = bootstore + .get_status() + .await + .map_err(|e| { + HttpError::from(omicron_common::api::external::Error::from(e)) + })? 
+ .into(); + Ok(HttpResponseOk(status)) +} diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 52bfb20e5d..7ed1264d9c 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -25,6 +25,7 @@ use sled_hardware::Baseboard; pub use sled_hardware::DendriteAsic; use sled_storage::dataset::DatasetKind; use sled_storage::dataset::DatasetName; +use std::collections::BTreeSet; use std::fmt::{Debug, Display, Formatter, Result as FormatResult}; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::str::FromStr; @@ -865,3 +866,45 @@ pub struct Inventory { pub usable_physical_ram: ByteCount, pub reservoir_size: ByteCount, } + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct EstablishedConnection { + baseboard: Baseboard, + addr: SocketAddrV6, +} + +impl From<(Baseboard, SocketAddrV6)> for EstablishedConnection { + fn from(value: (Baseboard, SocketAddrV6)) -> Self { + EstablishedConnection { baseboard: value.0, addr: value.1 } + } +} + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct BootstoreStatus { + pub fsm_ledger_generation: u64, + pub network_config_ledger_generation: Option<u64>, + pub fsm_state: String, + pub peers: BTreeSet<SocketAddrV6>, + pub established_connections: Vec<EstablishedConnection>, + pub accepted_connections: BTreeSet<SocketAddrV6>, + pub negotiating_connections: BTreeSet<SocketAddrV6>, +} + +impl From<bootstore::schemes::v0::Status> for BootstoreStatus { + fn from(value: bootstore::schemes::v0::Status) -> Self { + BootstoreStatus { + fsm_ledger_generation: value.fsm_ledger_generation, + network_config_ledger_generation: value + .network_config_ledger_generation, + fsm_state: value.fsm_state.to_string(), + peers: value.peers, + established_connections: value + .connections + .into_iter() + .map(EstablishedConnection::from) + .collect(), + accepted_connections: value.accepted_connections, + negotiating_connections: value.negotiating_connections, + } + } +} From 8b4d2e9338a8c98290ca2845cf23f82ceee3f6eb Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 13 Feb 2024 14:47:49 -0800 Subject: [PATCH 5/5] [nexus][sled-agent] Try more to load firewall rules during service bringup (#5054) Fixes https://github.com/oxidecomputer/omicron/issues/5053, with this solution implemented: > Nexus Internal Endpoint to Ask for FW Rules, Triggered After Individual Zones Launch. Rather than waiting for "all zones to launch", the Sled Agent could just re-apply firewall rules after any zones that need FW rules come up. I used a bit of a brute-force solution here -- "just do it each time we try to launch a set of zones". This could certainly be more fine-grained, but it gets the job done. --- nexus/src/app/sled.rs | 14 ++++-- nexus/src/internal_api/http_entrypoints.rs | 26 +++++++++++ openapi/nexus-internal.json | 29 ++++++++++++ sled-agent/src/sled_agent.rs | 51 +++++++++++++++++----- 4 files changed, 106 insertions(+), 14 deletions(-) diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 738dae1d6d..ec3f11dc6f 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -41,7 +41,7 @@ impl super::Nexus { // unless the DNS lookups at sled-agent are only for rack-local nexuses. 
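// [Editor's note: the commit message above describes a new Nexus internal
// endpoint that sleds hit to have their firewall rules re-plumbed. A minimal
// sketch of invoking it by hand with reqwest; the Nexus internal address,
// port, and sled id are hypothetical. See the dropshot endpoint added in
// nexus/src/internal_api/http_entrypoints.rs below.]
async fn sketch_request_firewall_rules() -> Result<(), reqwest::Error> {
    let sled_id = "55b562f3-1b61-4614-9a77-e2cfeb9ddc88"; // hypothetical
    let url = format!(
        "http://[fd00:1122:3344:101::3]:12221\
         /sled-agents/{sled_id}/firewall-rules-update"
    );
    let resp = reqwest::Client::new().post(&url).send().await?;
    // The endpoint returns 204 No Content on success.
    assert_eq!(resp.status(), reqwest::StatusCode::NO_CONTENT);
    Ok(())
}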
pub(crate) async fn upsert_sled( &self, - opctx: &OpContext, + _opctx: &OpContext, id: Uuid, info: SledAgentStartupInfo, ) -> Result<(), Error> { @@ -70,10 +70,16 @@ impl super::Nexus { ); self.db_datastore.sled_upsert(sled).await?; - // Make sure any firewall rules for serices that may - // be running on this sled get plumbed - self.plumb_service_firewall_rules(opctx, &[id]).await?; + Ok(()) + } + pub(crate) async fn sled_request_firewall_rules( + &self, + opctx: &OpContext, + id: Uuid, + ) -> Result<(), Error> { + info!(self.log, "requesting firewall rules"; "sled_uuid" => id.to_string()); + self.plumb_service_firewall_rules(opctx, &[id]).await?; Ok(()) } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 0122d9b439..eddc834a2a 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -57,6 +57,7 @@ type NexusApiDescription = ApiDescription<Arc<ServerContext>>; pub(crate) fn internal_api() -> NexusApiDescription { fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> { api.register(sled_agent_put)?; + api.register(sled_firewall_rules_request)?; api.register(switch_put)?; api.register(rack_initialization_complete)?; api.register(physical_disk_put)?; @@ -126,6 +127,31 @@ async fn sled_agent_put( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Request a new set of firewall rules for a sled. +/// +/// This causes Nexus to read the latest set of rules for the sled, +/// and call a Sled endpoint which applies the rules to all OPTE ports +/// that happen to exist. +#[endpoint { + method = POST, + path = "/sled-agents/{sled_id}/firewall-rules-update", + }] +async fn sled_firewall_rules_request( + rqctx: RequestContext<Arc<ServerContext>>, + path_params: Path<SledAgentPathParam>, +) -> Result<HttpResponseUpdatedNoContent, HttpError> { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let path = path_params.into_inner(); + let sled_id = &path.sled_id; + let handler = async { + nexus.sled_request_firewall_rules(&opctx, *sled_id).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + /// Path parameters for Rack requests. 
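// [Editor's note: the same dropshot wiring as sled_firewall_rules_request
// above, reduced to a self-contained sketch with a unit server context and a
// string path parameter (the real endpoint uses Arc<ServerContext> and a
// uuid-formatted parameter). Registration with an ApiDescription is omitted.]
use dropshot::{
    endpoint, HttpError, HttpResponseUpdatedNoContent, Path, RequestContext,
};
use schemars::JsonSchema;
use serde::Deserialize;

#[derive(Deserialize, JsonSchema)]
struct SledPathSketch {
    sled_id: String,
}

#[endpoint {
    method = POST,
    path = "/sled-agents/{sled_id}/firewall-rules-update",
}]
async fn firewall_rules_update_sketch(
    _rqctx: RequestContext<()>,
    path_params: Path<SledPathSketch>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
    let _sled_id = path_params.into_inner().sled_id;
    // A real handler would look up the latest rules for this sled and push
    // them out, as Nexus does via plumb_service_firewall_rules.
    Ok(HttpResponseUpdatedNoContent())
}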
#[derive(Deserialize, JsonSchema)] struct RackPathParam { diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 4714b64c52..a1a78deb84 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -835,6 +835,35 @@ } } }, + "/sled-agents/{sled_id}/firewall-rules-update": { + "post": { + "summary": "Request a new set of firewall rules for a sled.", + "description": "This causes Nexus to read the latest set of rules for the sled, and call a Sled endpoint which applies the rules to all OPTE ports that happen to exist.", + "operationId": "sled_firewall_rules_request", + "parameters": [ + { + "in": "path", + "name": "sled_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/sled-agents/{sled_id}/zpools/{zpool_id}": { "put": { "summary": "Report that a pool for a specified sled has come online.", diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index bcc354232e..1a634a6346 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -105,6 +105,9 @@ pub enum Error { #[error("Failed to operate on underlay device: {0}")] Underlay(#[from] underlay::Error), + #[error("Failed to request firewall rules")] + FirewallRequest(#[source] nexus_client::Error), + #[error(transparent)] Services(#[from] crate::services::Error), @@ -602,11 +605,24 @@ impl SledAgent { retry_notify( retry_policy_internal_service_aggressive(), || async { - self.inner - .services - .load_services() + // Load as many services as we can, and don't exit immediately + // upon failure... + let load_services_result = + self.inner.services.load_services().await.map_err(|err| { + BackoffError::transient(Error::from(err)) + }); + + // ... and request firewall rule updates for as many services as + // we can. Note that we still make this request even if we only + // partially load some services. + let firewall_result = self + .request_firewall_update() .await - .map_err(|err| BackoffError::transient(err)) + .map_err(|err| BackoffError::transient(err)); + + // Only complete if we have loaded all services and firewall + // rules successfully. + load_services_result.and(firewall_result) }, |err, delay| { warn!( @@ -618,10 +634,6 @@ impl SledAgent { ) .await .unwrap(); // we retry forever, so this can't fail - - // Now that we've initialized the sled services, notify nexus again - // at which point it'll plumb any necessary firewall rules back to us. - self.notify_nexus_about_self(&self.log); } pub(crate) fn switch_zone_underlay_info( @@ -642,7 +654,26 @@ impl SledAgent { &self.inner.start_request } - // Sends a request to Nexus informing it that the current sled exists. + /// Requests firewall rules from Nexus. + /// + /// Does not retry upon failure. + async fn request_firewall_update(&self) -> Result<(), Error> { + let sled_id = self.inner.id; + + self.inner + .nexus_client + .client() + .sled_firewall_rules_request(&sled_id) + .await + .map_err(|err| Error::FirewallRequest(err))?; + Ok(()) + } + + /// Sends a request to Nexus informing it that the current sled exists, + /// with information about the existing set of hardware. + /// + /// Does not block until Nexus is available -- the future created by this + /// function is retried in a queue that is polled in the background. 
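// [Editor's note: a self-contained sketch of the retry shape used in the
// load_services change above: both steps run on every attempt, and the
// attempt only completes when Result::and sees both succeed. The backoff is
// reduced to a bare counter here; the real code uses a randomized, capped
// exponential policy via retry_notify.]
fn sketch_retry_both() {
    fn load_services(attempt: u32) -> Result<(), String> {
        if attempt < 2 {
            Err("zones still starting".to_string())
        } else {
            Ok(())
        }
    }
    fn request_firewall_update(_attempt: u32) -> Result<(), String> {
        Ok(())
    }

    let mut attempt = 0;
    loop {
        let load = load_services(attempt);
        // Still ask for firewall rules even if loading only partially worked.
        let firewall = request_firewall_update(attempt);
        if load.and(firewall).is_ok() {
            break;
        }
        attempt += 1;
    }
}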
pub(crate) fn notify_nexus_about_self(&self, log: &Logger) { let sled_id = self.inner.id; let nexus_client = self.inner.nexus_client.clone(); @@ -658,7 +689,7 @@ impl SledAgent { let log = log.clone(); let fut = async move { // Notify the control plane that we're up, and continue trying this - // until it succeeds. We retry with an randomized, capped + // until it succeeds. We retry with a randomized, capped // exponential backoff. // // TODO-robustness if this returns a 400 error, we probably want to