diff --git a/Cargo.lock b/Cargo.lock index dcdc4b9832..caddb6d8ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1235,20 +1235,6 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7059fff8937831a9ae6f0fe4d658ffabf58f2ca96aa9dec1c889f936f705f216" -[[package]] -name = "crossbeam" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" -dependencies = [ - "cfg-if", - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1283,16 +1269,6 @@ dependencies = [ "scopeguard", ] -[[package]] -name = "crossbeam-queue" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.16" @@ -4915,21 +4891,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "omicron-deploy" -version = "0.1.0" -dependencies = [ - "anyhow", - "camino", - "clap 4.4.3", - "crossbeam", - "omicron-package", - "omicron-workspace-hack", - "serde", - "serde_derive", - "thiserror", -] - [[package]] name = "omicron-dev" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 16f064d64d..d33758fc06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,6 @@ members = [ "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/oxlog", - "dev-tools/thing-flinger", "dev-tools/xtask", "dns-server", "end-to-end-tests", @@ -96,7 +95,6 @@ default-members = [ "dev-tools/omdb", "dev-tools/omicron-dev", "dev-tools/oxlog", - "dev-tools/thing-flinger", # Do not include xtask in the list of default members, because this causes # hakari to not work as well and build times to be longer. # See omicron#4392. 
diff --git a/bootstore/src/schemes/v0/peer.rs b/bootstore/src/schemes/v0/peer.rs index 5290e64672..1175e64143 100644 --- a/bootstore/src/schemes/v0/peer.rs +++ b/bootstore/src/schemes/v0/peer.rs @@ -62,6 +62,14 @@ pub enum NodeRequestError { }, } +impl From for omicron_common::api::external::Error { + fn from(error: NodeRequestError) -> Self { + omicron_common::api::external::Error::internal_error(&format!( + "{error}" + )) + } +} + /// A request sent to the `Node` task from the `NodeHandle` pub enum NodeApiRequest { /// Initialize a rack at the behest of RSS running on the same scrimlet as diff --git a/dev-tools/omdb/src/bin/omdb/sled_agent.rs b/dev-tools/omdb/src/bin/omdb/sled_agent.rs index c413a2ba43..2d9e19d253 100644 --- a/dev-tools/omdb/src/bin/omdb/sled_agent.rs +++ b/dev-tools/omdb/src/bin/omdb/sled_agent.rs @@ -31,6 +31,10 @@ enum SledAgentCommands { /// print information about zpools #[clap(subcommand)] Zpools(ZpoolCommands), + + /// print information about the local bootstore node + #[clap(subcommand)] + Bootstore(BootstoreCommands), } #[derive(Debug, Subcommand)] @@ -45,6 +49,12 @@ enum ZpoolCommands { List, } +#[derive(Debug, Subcommand)] +enum BootstoreCommands { + /// Show the internal state of the local bootstore node + Status, +} + impl SledAgentArgs { /// Run a `omdb sled-agent` subcommand. 
pub(crate) async fn run_cmd( @@ -70,6 +80,9 @@ impl SledAgentArgs { SledAgentCommands::Zpools(ZpoolCommands::List) => { cmd_zpools_list(&client).await } + SledAgentCommands::Bootstore(BootstoreCommands::Status) => { + cmd_bootstore_status(&client).await + } } } } @@ -110,3 +123,46 @@ async fn cmd_zpools_list( Ok(()) } + +/// Runs `omdb sled-agent bootstore status` +async fn cmd_bootstore_status( + client: &sled_agent_client::Client, +) -> Result<(), anyhow::Error> { + let status = client.bootstore_status().await.context("bootstore status")?; + println!("fsm ledger generation: {}", status.fsm_ledger_generation); + println!( + "network config ledger generation: {:?}", + status.network_config_ledger_generation + ); + println!("fsm state: {}", status.fsm_state); + println!("peers (found by ddmd):"); + if status.peers.is_empty() { + println!(" "); + } + for peer in status.peers.iter() { + println!(" {peer}"); + } + println!("established connections:"); + if status.established_connections.is_empty() { + println!(" "); + } + for c in status.established_connections.iter() { + println!(" {:?} : {}", c.baseboard, c.addr); + } + println!("accepted connections:"); + if status.accepted_connections.is_empty() { + println!(" "); + } + for addr in status.accepted_connections.iter() { + println!(" {addr}"); + } + println!("negotiating connections:"); + if status.negotiating_connections.is_empty() { + println!(" "); + } + for addr in status.negotiating_connections.iter() { + println!(" {addr}"); + } + + Ok(()) +} diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 0600945194..3e6e89d508 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -57,11 +57,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + 
watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -147,11 +147,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -224,11 +224,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 1cd85262f6..3086c98f32 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -251,11 +251,11 @@ task: "dns_propagation_internal" task: "dns_servers_external" - watches list of external DNS servers stored in CockroachDB + watches list of external DNS servers stored in internal DNS task: "dns_servers_internal" - watches list of internal DNS servers stored in CockroachDB + watches list of internal DNS servers stored in internal DNS task: "external_endpoints" @@ -313,7 +313,7 @@ task: "dns_servers_internal" task: "dns_propagation_internal" configured period: every 1m currently executing: no - last completed activation: iter 5, triggered by a dependent task completing + last completed activation: iter 4, triggered by a dependent task completing started at (s ago) and ran for ms attempt to propagate generation: 1 @@ -341,7 +341,7 @@ task: "dns_servers_external" task: "dns_propagation_external" configured period: every 1m currently executing: no - last completed activation: iter 5, triggered by a 
dependent task completing + last completed activation: iter 4, triggered by a dependent task completing started at (s ago) and ran for ms attempt to propagate generation: 2 diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 6f9b539371..bb7da1be57 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -331,9 +331,10 @@ Debug a specific Sled Usage: omdb sled-agent [OPTIONS] Commands: - zones print information about zones - zpools print information about zpools - help Print this message or the help of the given subcommand(s) + zones print information about zones + zpools print information about zpools + bootstore print information about the local bootstore node + help Print this message or the help of the given subcommand(s) Options: --sled-agent-url URL of the Sled internal API [env: OMDB_SLED_AGENT_URL=] diff --git a/dev-tools/thing-flinger/.gitignore b/dev-tools/thing-flinger/.gitignore deleted file mode 100644 index ea8c4bf7f3..0000000000 --- a/dev-tools/thing-flinger/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/target diff --git a/dev-tools/thing-flinger/Cargo.toml b/dev-tools/thing-flinger/Cargo.toml deleted file mode 100644 index a427685871..0000000000 --- a/dev-tools/thing-flinger/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "omicron-deploy" -description = "Tools for deploying Omicron software to target machines" -version = "0.1.0" -edition = "2021" -license = "MPL-2.0" - -[dependencies] -anyhow.workspace = true -camino.workspace = true -clap.workspace = true -crossbeam.workspace = true -omicron-package.workspace = true -serde.workspace = true -serde_derive.workspace = true -thiserror.workspace = true -omicron-workspace-hack.workspace = true - -[[bin]] -name = "thing-flinger" -doc = false diff --git a/dev-tools/thing-flinger/README.adoc b/dev-tools/thing-flinger/README.adoc deleted file mode 100644 index 9966a7b747..0000000000 --- 
a/dev-tools/thing-flinger/README.adoc +++ /dev/null @@ -1,222 +0,0 @@ -Omicron is a complex piece of software consisting of many build and install-time dependencies. It's -intended to run primarily on illumos based systems, and as such is built to use runtime facilities -of illumos, such as https://illumos.org/man/5/smf[SMF]. Furthermore, Omicron is fundamentally a -distributed system, with its components intended to run on multiple servers communicating over the -network. In order to secure the system, certain cryptographic primitives, such as asymmetric key -pairs and shared secrets are required. Due to the nature of these cryptographic primitives, there is -a requirement for the distribution or creation of files unique to a specific server, such that no -other server has access to those files. Examples of this are private keys, and threshold key -shares, although other non-cryptographic unique files may also become necessary over time. - -In order to satisfy the above requirements of building and deploying a complex distributed system -consisting of unique, private files, two CLI tools have been created: - - . link:src/bin/omicron-package.rs[omicron-package] - build, package, install on local machine - . link:src/bin/thing-flinger.rs[thing-flinger] - build, package, deploy to remote machines - - -If a user is working on their local illumos based machine, and only wants to run -omicron in single node mode, they should follow the install instruction in -the link:../README.adoc[Omicron README] and use `omicron-package`. If the user -wishes for a more complete workflow, where they can code on their local laptop, -use a remote build machine, and install to multiple machines for a more realistic -deployment, they should use `thing-flinger`. - -The remainder of this document will describe a typical workflow for using -thing-flinger, pointing out room for improvement. 
- -== Environment and Configuration - - - +------------------+ +------------------+ - | | | | - | | | | - | Client |----------------> Builder | - | | | | - | | | | - +------------------+ +------------------+ - | - | - | - | - +---------------------------+--------------------------+ - | | | - | | | - | | | - +--------v---------+ +---------v--------+ +---------v--------+ - | | | | | | - | | | | | | - | Deployed Server | | Deployed Server | | Deployed Server | - | | | | | | - | | | | | | - +------------------+ +------------------+ +------------------+ - - -`thing-flinger` defines three types of nodes: - - * Client - Where a user typically edits their code and runs thing-flinger. This can run any OS. - * Builder - A Helios box where Omicron is built and packaged - * Deployed Server - Helios machines where Omicron will be installed and run - -It's not at all necessary for these to be separate nodes. For example, a client and builder can be -the same machine, as long as it's a Helios box. Same goes for Builder and a deployment server. The -benefit of this separation though, is that it allows editing on something like a laptop, without -having to worry about setting up a development environment on an illumos based host. - -Machine topology is configured in a `TOML` file that is passed on the command line. All illumos -machines are listed under `servers`, and just the names are used for configuring a builder and -deployment servers. An link:src/bin/deployment-example.toml[example] is provided. - -Thing flinger works over SSH, and so the user must have the public key of their client configured -for their account on all servers. SSH agent forwarding is used to prevent the need for the keys of -the builder to also be on the other servers, thus minimizing needed server configuration. - -== Typical Workflow - -=== Prerequisites - -Ensure you have an account on all illumos boxes, with the client public key in -`~/.ssh/authorized_keys`. 
- -.The build machine must have Rust and cargo installed, as well as -all the dependencies for Omicron installed. Following the *prerequisites* in the -https://github.com/oxidecomputer/omicron/#build-and-run[Build and run] section of the main Omicron -README is probably a good idea. - -==== Update `config-rss.toml` - -Currently rack setup is driven by a configuration file that lives at -`smf/sled-agent/non-gimlet/config-rss.toml` in the root of this repository. The committed -configuration of that file contains a single `requests` entry (with many -services inside it), which means it will start services on only one sled. To -start services (e.g., nexus) on multiple sleds, add additional entries to that -configuration file before proceeding. - -=== Command Based Workflow - -==== sync -Copy your source code to the builder. - -`+cargo run --bin thing-flinger -- -c sync+` - -==== Install Prerequisites -Install necessary build and runtime dependencies (including downloading prebuilt -binaries like Clickhouse and CockroachDB) on the builder and all deployment -targets. This step only needs to be performed once, absent any changes to the -dependencies, but is idempotent so may be run multiple times. - -`+cargo run --bin thing-flinger -- -c install-prereqs+` - -==== check (optional) -Run `cargo check` on the builder against the copy of `omicron` that was sync'd -to it in the previous step. - -`+cargo run --bin thing-flinger -- -c build check+` - -==== package -Build and package omicron using `omicron-package` on the builder. - -`+cargo run --bin thing-flinger -- -c build package+` - -==== overlay -Create files that are unique to each deployment server. - -`+cargo run --bin thing-flinger -- -c overlay+` - -==== install -Install omicron to all machines, in parallel. 
This consists of copying the packaged omicron tarballs -along with overlay files, and omicron-package and its manifest to a `staging` directory on each -deployment server, and then running omicron-package, installing overlay files, and restarting -services. - -`+cargo run --bin thing-flinger -- -c deploy install+` - -==== uninstall -Uninstall omicron from all machines. - -`+cargo run --bin thing-flinger -- -c deploy uninstall+` - -=== Current Limitations - -`thing-flinger` is an early prototype. It has served so far to demonstrate that unique files, -specifically secret shares, can be created and distributed over ssh, and that omicron can be -installed remotely using `omicron-package`. It is not currently complete enough to fully test a -distributed omicron setup, as the underlying dependencies are not configured yet. Specifically, -`CockroachDB` and perhaps `Clickhouse`, need to be configured to run in multiple server mode. It's -anticipated that the `overlay` feature of `thing-flinger` can be used to generate and distribute -configs for this. - -=== Design rationale - -`thing-flinger` is a command line program written in rust. It was written this way to build upon -`omicron-package`, which is also in rust, as that is our default language of choice at Oxide. -`thing-flinger` is based around SSH, as that is the minimal viable requirement for a test tool such -as this. Additionally, it provides for the most straightforward implementation, and takes the least -effort to use securely. This particular implementation wraps the openssh ssh client via -`std::process::Command`, rather than using the `ssh2` crate, because ssh2, as a wrapper around -`libssh`, does not support agent-forwarding. - -== Notes on Using VMs as Deployed Servers on a Linux Host - -TODO: This section should be fleshed out more and potentially lifted to its own -document; for now this is a collection of rough notes. 
- ---- - -It's possible to use a Linux libvirt host running multiple helios VMs as the -builder/deployment server targets, but it requires some additional setup beyond -`https://github.com/oxidecomputer/helios-engvm[helios-engvm]`. - -`thing-flinger` does not have any support for running the -`tools/create_virtual_hardware.sh` script; this will need to be done by hand on -each VM. - ---- - -To enable communication between the VMs over their IPv6 bootstrap networks: - -1. Enable IPv6 and DHCP on the virtual network libvirt uses for the VMs; e.g., - -```xml - - - - - -``` - -After booting the VMs with this enabled, they should be able to ping each other -over their acquired IPv6 addresses, but connecting to each other over the -`bootstrap6` interface that sled-agent creates will fail. - -2. Explicitly add routes in the Linux host for the `bootstrap6` addresses, -specifying the virtual interface libvirt created that is used by the VMs. - -``` -bash% sudo ip -6 route add fdb0:5254:13:7331::1/64 dev virbr1 -bash% sudo ip -6 route add fdb0:5254:f0:acfd::1/64 dev virbr1 -``` - -3. Once the sled-agents advance sufficiently to set up `sled6` interfaces, -routes need to be added for them both in the Linux host and in the Helios VMs. 
-Assuming two sleds with these interfaces: - -``` -# VM 1 -vioif0/sled6 static ok fd00:1122:3344:1::1/64 -# VM 2 -vioif0/sled6 static ok fd00:1122:3344:2::1/64 -``` - -The Linux host needs to be told to route that subnet to the appropriate virtual -interface: - -``` -bash% ip -6 route add fd00:1122:3344::1/48 dev virbr1 -``` - -and each Helios VM needs to be told to route that subnet to the host gateway: - -``` -vm% pfexec route add -inet6 fd00:1122:3344::/48 $IPV6_HOST_GATEWAY_ADDR -``` diff --git a/dev-tools/thing-flinger/src/bin/deployment-example.toml b/dev-tools/thing-flinger/src/bin/deployment-example.toml deleted file mode 100644 index 6d85de2ba6..0000000000 --- a/dev-tools/thing-flinger/src/bin/deployment-example.toml +++ /dev/null @@ -1,36 +0,0 @@ -# This manifest describes the servers that omicron will be installed to, along -# with any ancillary information specific to a given server. -# -# It is ingested by the `thing-flinger` tool. - -# This must be an absolute path. It refers to the path to Omicron on the -# machine where thing-flinger is being executed. -omicron_path = "/local/path/to/omicron" - -[builder] -# `server` must refer to one of the `servers` in the servers table -server = "foo" -# This must be an absolute path. It refers to the path to Omicron on the -# builder server. 
-omicron_path = "/remote/path/to/omicron" - -[deployment] -# which server is responsible for running the rack setup service; must -# refer to one of the `servers` in the servers table -rss_server = "foo" -# Location where files to install will be placed before running -# `omicron-package install` -# -# This must be an absolute path -# We specifically allow for $HOME in validating the absolute path -staging_dir = "$HOME/omicron_staging" -# which servers to deploy -servers = ["foo", "bar"] - -[servers.foo] -username = "me" -addr = "foo" - -[servers.bar] -username = "me" -addr = "bar" diff --git a/dev-tools/thing-flinger/src/bin/thing-flinger.rs b/dev-tools/thing-flinger/src/bin/thing-flinger.rs deleted file mode 100644 index 43b137790d..0000000000 --- a/dev-tools/thing-flinger/src/bin/thing-flinger.rs +++ /dev/null @@ -1,968 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Utility for deploying Omicron to remote machines - -use omicron_package::{parse, BuildCommand, DeployCommand}; - -use camino::{Utf8Path, Utf8PathBuf}; -use std::collections::{BTreeMap, BTreeSet}; -use std::process::Command; - -use anyhow::{Context, Result}; -use clap::{Parser, Subcommand}; -use crossbeam::thread::{self, ScopedJoinHandle}; -use serde_derive::Deserialize; -use thiserror::Error; - -// A server on which omicron source should be compiled into packages. -#[derive(Deserialize, Debug)] -struct Builder { - server: String, - omicron_path: Utf8PathBuf, -} - -// A server on which an omicron package is deployed. 
-#[derive(Deserialize, Debug, Eq, PartialEq)] -struct Server { - username: String, - addr: String, -} - -#[derive(Deserialize, Debug)] -struct Deployment { - rss_server: String, - staging_dir: Utf8PathBuf, - servers: BTreeSet, -} - -#[derive(Debug, Deserialize)] -struct Config { - omicron_path: Utf8PathBuf, - builder: Builder, - servers: BTreeMap, - deployment: Deployment, - - #[serde(default)] - rss_config_path: Option, - - #[serde(default)] - debug: bool, -} - -impl Config { - fn release_arg(&self) -> &str { - if self.debug { - "" - } else { - "--release" - } - } - - fn deployment_servers(&self) -> impl Iterator { - self.servers.iter().filter_map(|(name, s)| { - if self.deployment.servers.contains(name) { - Some(s) - } else { - None - } - }) - } -} - -fn parse_into_set(src: &str) -> Result, &'static str> { - Ok(src.split_whitespace().map(|s| s.to_owned()).collect()) -} - -#[derive(Debug, Subcommand)] -enum SubCommand { - /// Run the given command on the given servers, or all servers if none are - /// specified. - /// - /// Be careful! - Exec { - /// The command to run - #[clap(short, long, action)] - cmd: String, - - /// The servers to run the command on - #[clap(short, long, value_parser = parse_into_set)] - servers: Option>, - }, - - /// Install necessary prerequisites on the "builder" server and all "deploy" - /// servers. - InstallPrereqs, - - /// Sync our local source to the build host - Sync, - - /// Runs a command on the "builder" server. - #[clap(name = "build", subcommand)] - Builder(BuildCommand), - - /// Runs a command on all the "deploy" servers. - #[clap(subcommand)] - Deploy(DeployCommand), - - /// Create an overlay directory tree for each deployment server - /// - /// Each directory tree contains unique files for the given server that will - /// be populated in the svc/pkg dir. - /// - /// This is a separate subcommand so that we can reconstruct overlays - /// without rebuilding or repackaging. 
- Overlay, -} - -#[derive(Debug, Parser)] -#[clap( - name = "thing-flinger", - about = "A tool for synchronizing packages and configs between machines" -)] -struct Args { - /// The path to the deployment manifest TOML file - #[clap( - short, - long, - help = "Path to deployment manifest toml file", - action - )] - config: Utf8PathBuf, - - #[clap( - short, - long, - help = "The name of the build target to use for this command" - )] - target: String, - - /// The output directory, where artifacts should be built and staged - #[clap(long = "artifacts", default_value = "out/")] - artifact_dir: Utf8PathBuf, - - #[clap(subcommand)] - subcommand: SubCommand, -} - -/// Errors which can be returned when executing subcommands -#[derive(Error, Debug)] -enum FlingError { - #[error("Servers not listed in configuration: {0:?}")] - InvalidServers(Vec), - - /// Failed to rsync omicron to build host - #[error("Failed to sync {src} with {dst}")] - FailedSync { src: String, dst: String }, - - /// The given path must be absolute - #[error("Path for {field} must be absolute")] - NotAbsolutePath { field: &'static str }, -} - -// How should `ssh_exec` be run? 
-enum SshStrategy { - // Forward agent and source .profile - Forward, - - // Don't forward agent, but source .profile - NoForward, - - // Don't forward agent and don't source .profile - NoForwardNoProfile, -} - -impl SshStrategy { - fn forward_agent(&self) -> bool { - match self { - SshStrategy::Forward => true, - _ => false, - } - } - - fn source_profile(&self) -> bool { - match self { - SshStrategy::Forward | &SshStrategy::NoForward => true, - _ => false, - } - } -} - -// TODO: run in parallel when that option is given -fn do_exec( - config: &Config, - cmd: String, - servers: Option>, -) -> Result<()> { - if let Some(ref servers) = servers { - validate_servers(servers, &config.servers)?; - - for name in servers { - let server = &config.servers[name]; - ssh_exec(&server, &cmd, SshStrategy::NoForward)?; - } - } else { - for (_, server) in config.servers.iter() { - ssh_exec(&server, &cmd, SshStrategy::NoForward)?; - } - } - Ok(()) -} - -// start an `rsync` command with args common to all our uses -fn rsync_common() -> Command { - let mut cmd = Command::new("rsync"); - cmd.arg("-az") - .arg("-e") - .arg("ssh -o StrictHostKeyChecking=no") - .arg("--delete") - .arg("--progress") - .arg("--out-format") - .arg("File changed: %o %t %f"); - cmd -} - -fn do_sync(config: &Config) -> Result<()> { - let builder = - config.servers.get(&config.builder.server).ok_or_else(|| { - FlingError::InvalidServers(vec![config.builder.server.clone()]) - })?; - - // For rsync to copy from the source appropriately we must guarantee a - // trailing slash. - let src = format!( - "{}/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? 
- ); - let dst = format!( - "{}@{}:{}", - builder.username, builder.addr, config.builder.omicron_path - ); - - println!("Synchronizing source files to: {}", dst); - let mut cmd = rsync_common(); - - // exclude build and development environment artifacts - cmd.arg("--exclude") - .arg("target/") - .arg("--exclude") - .arg("*.vdev") - .arg("--exclude") - .arg("*.swp") - .arg("--exclude") - .arg(".git/") - .arg("--exclude") - .arg("out/"); - - // exclude `config-rss.toml`, which needs to be sent to only one target - // system. we handle this in `do_overlay` below. - cmd.arg("--exclude").arg("**/config-rss.toml"); - - // finish with src/dst - cmd.arg(&src).arg(&dst); - let status = - cmd.status().context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err(FlingError::FailedSync { src, dst }.into()); - } - - Ok(()) -} - -fn copy_to_deployment_staging_dir( - config: &Config, - src: String, - description: &str, -) -> Result<()> { - let partial_cmd = || { - let mut cmd = rsync_common(); - cmd.arg("--relative"); - cmd.arg(&src); - cmd - }; - - // A function for each deployment server to run in parallel - let fns = config.deployment_servers().map(|server| { - || { - let dst = format!( - "{}@{}:{}", - server.username, server.addr, config.deployment.staging_dir - ); - let mut cmd = partial_cmd(); - cmd.arg(&dst); - let status = cmd - .status() - .context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err( - FlingError::FailedSync { src: src.clone(), dst }.into() - ); - } - Ok(()) - } - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel(description, named_fns); - - Ok(()) -} - -fn rsync_config_needed_for_tools(config: &Config) -> Result<()> { - let src = format!( - // the `./` here is load-bearing; it interacts with `--relative` to tell - // rsync to create `smf/sled-agent` but none of its parents - "{}/./smf/sled-agent/", - 
config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? - ); - - copy_to_deployment_staging_dir(config, src, "Copy smf/sled-agent dir") -} - -fn rsync_tools_dir_to_deployment_servers(config: &Config) -> Result<()> { - // we need to rsync `./tools/*` to each of the deployment targets (the - // "builder" already has it via `do_sync()`), and then run `pfexec - // tools/install_prerequisites.sh` on each system. - let src = format!( - // the `./` here is load-bearing; it interacts with `--relative` to tell - // rsync to create `tools` but none of its parents - "{}/./tools/", - config.omicron_path.canonicalize_utf8().with_context(|| format!( - "could not canonicalize {}", - config.omicron_path - ))? - ); - copy_to_deployment_staging_dir(config, src, "Copy tools dir") -} - -fn do_install_prereqs(config: &Config) -> Result<()> { - rsync_config_needed_for_tools(config)?; - rsync_tools_dir_to_deployment_servers(config)?; - install_rustup_on_deployment_servers(config); - create_virtual_hardware_on_deployment_servers(config); - create_external_tls_cert_on_builder(config)?; - - // Create a set of servers to install prereqs to - let builder = &config.servers[&config.builder.server]; - let build_server = (builder, &config.builder.omicron_path); - let all_servers = std::iter::once(build_server).chain( - config.deployment_servers().filter_map(|server| { - // Don't duplicate the builder - if server.addr != builder.addr { - Some((server, &config.deployment.staging_dir)) - } else { - None - } - }), - ); - - let server_names = std::iter::once(&config.builder.server).chain( - config - .deployment - .servers - .iter() - .filter(|s| **s != config.builder.server), - ); - - // Install functions to run in parallel on each server - let fns = all_servers.map(|(server, root_path)| { - || { - // -y: assume yes instead of prompting - // -p: skip check that deps end up in $PATH - let (script, script_type) = if *server == *builder 
{ - ("install_builder_prerequisites.sh -y -p", "builder") - } else { - ("install_runner_prerequisites.sh -y", "runner") - }; - - let cmd = format!( - "cd {} && mkdir -p out && pfexec ./tools/{}", - root_path.clone(), - script - ); - println!( - "Install {} prerequisites on {}", - script_type, server.addr - ); - ssh_exec(server, &cmd, SshStrategy::NoForward) - } - }); - - let named_fns = server_names.zip(fns); - run_in_parallel("Install prerequisites", named_fns); - - Ok(()) -} - -fn create_external_tls_cert_on_builder(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - let cmd = format!( - "cd {} && ./tools/create_self_signed_cert.sh", - config.builder.omicron_path, - ); - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn create_virtual_hardware_on_deployment_servers(config: &Config) { - let cmd = format!( - "cd {} && pfexec ./tools/create_virtual_hardware.sh", - config.deployment.staging_dir - ); - let fns = config.deployment_servers().map(|server| { - || { - println!("Create virtual hardware on {}", server.addr); - ssh_exec(server, &cmd, SshStrategy::NoForward) - } - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel("Create virtual hardware", named_fns); -} - -fn install_rustup_on_deployment_servers(config: &Config) { - let cmd = "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y"; - let fns = config.deployment_servers().map(|server| { - || ssh_exec(server, cmd, SshStrategy::NoForwardNoProfile) - }); - - let named_fns = config.deployment.servers.iter().zip(fns); - run_in_parallel("Install rustup", named_fns); -} - -// Build omicron-package and omicron-deploy on the builder -// -// We need to build omicron-deploy for overlay file generation -fn do_build_minimal(config: &Config) -> Result<()> { - let server = &config.servers[&config.builder.server]; - let cmd = format!( - "cd {} && cargo build {} -p {} -p {}", - config.builder.omicron_path, - 
config.release_arg(), - "omicron-package", - "omicron-deploy" - ); - ssh_exec(&server, &cmd, SshStrategy::NoForward) -} - -fn do_package(config: &Config, artifact_dir: Utf8PathBuf) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - - // We use a bash login shell to get a proper environment, so we have a path to - // postgres, and $DEP_PQ_LIBDIRS is filled in. This is required for building - // nexus. - // - // See https://github.com/oxidecomputer/omicron/blob/8757ec542ea4ffbadd6f26094ed4ba357715d70d/rpaths/src/lib.rs - let cmd = format!( - "bash -lc \ - 'cd {} && \ - cargo run {} --bin omicron-package -- package --out {}'", - config.builder.omicron_path, - config.release_arg(), - artifact_dir, - ); - - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn do_dot(_config: &Config) -> Result<()> { - anyhow::bail!("\"dot\" command is not supported for thing-flinger"); -} - -fn do_check(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - - let cmd = format!( - "bash -lc \ - 'cd {} && \ - cargo run {} --bin omicron-package -- check'", - config.builder.omicron_path, - config.release_arg(), - ); - - ssh_exec(&builder, &cmd, SshStrategy::NoForward) -} - -fn do_uninstall(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - for server in config.deployment_servers() { - copy_omicron_package_binary_to_staging(config, builder, server)?; - - // Run `omicron-package uninstall` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package uninstall", - config.deployment.staging_dir, - ); - println!("$ {}", cmd); - ssh_exec(&server, &cmd, SshStrategy::Forward)?; - } - Ok(()) -} - -fn do_clean( - config: &Config, - artifact_dir: Utf8PathBuf, - install_dir: Utf8PathBuf, -) -> Result<()> { - let mut deployment_src = Utf8PathBuf::from(&config.deployment.staging_dir); - deployment_src.push(&artifact_dir); - let builder = &config.servers[&config.builder.server]; 
- for server in config.deployment_servers() { - copy_omicron_package_binary_to_staging(config, builder, server)?; - - // Run `omicron-package uninstall` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package clean --in {} --out {}", - config.deployment.staging_dir, deployment_src, install_dir, - ); - println!("$ {}", cmd); - ssh_exec(&server, &cmd, SshStrategy::Forward)?; - } - Ok(()) -} - -fn run_in_parallel<'a, F>(op: &str, cmds: impl Iterator) -where - F: FnOnce() -> Result<()> + Send, -{ - thread::scope(|s| { - let named_handles: Vec<(_, ScopedJoinHandle<'_, Result<()>>)> = cmds - .map(|(server_name, f)| (server_name, s.spawn(|_| f()))) - .collect(); - - // Join all the handles and print the install status - for (server_name, handle) in named_handles { - match handle.join() { - Ok(Ok(())) => { - println!("{} completed for server: {}", op, server_name) - } - Ok(Err(e)) => { - println!( - "{} failed for server: {} with error: {}", - op, server_name, e - ) - } - Err(_) => { - println!( - "{} failed for server: {}. 
Thread panicked.", - op, server_name - ) - } - } - } - }) - .unwrap(); -} - -fn do_install( - config: &Config, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, -) { - let builder = &config.servers[&config.builder.server]; - let mut pkg_dir = Utf8PathBuf::from(&config.builder.omicron_path); - pkg_dir.push(artifact_dir); - - let fns = config.deployment.servers.iter().map(|server_name| { - (server_name, || { - single_server_install( - config, - &artifact_dir, - &install_dir, - pkg_dir.as_str(), - builder, - server_name, - ) - }) - }); - - run_in_parallel("Install", fns); -} - -fn do_overlay(config: &Config) -> Result<()> { - let builder = &config.servers[&config.builder.server]; - let mut root_path = Utf8PathBuf::from(&config.builder.omicron_path); - // TODO: This needs to match the artifact_dir in `package` - root_path.push("out/overlay"); - - // Build a list of directories for each server to be deployed and tag which - // one is the server to run RSS; e.g., for servers ["foo", "bar", "baz"] - // with root_path "/my/path", we produce - // [ - // "/my/path/foo/sled-agent/pkg", - // "/my/path/bar/sled-agent/pkg", - // "/my/path/baz/sled-agent/pkg", - // ] - // As we're doing so, record which directory is the one for the server that - // will run RSS. - let mut rss_server_dir = None; - - for server_name in &config.deployment.servers { - let mut dir = root_path.clone(); - dir.push(server_name); - dir.push("sled-agent/pkg"); - if *server_name == config.deployment.rss_server { - rss_server_dir = Some(dir.clone()); - break; - } - } - - // we know exactly one of the servers matches `rss_server` from our config - // validation, so we can unwrap here - let rss_server_dir = rss_server_dir.unwrap(); - - overlay_rss_config(builder, config, &rss_server_dir)?; - - Ok(()) -} - -fn overlay_rss_config( - builder: &Server, - config: &Config, - rss_server_dir: &Utf8Path, -) -> Result<()> { - // Sync `config-rss.toml` to the directory for the RSS server on the - // builder. 
- let src = if let Some(src) = &config.rss_config_path { - src.clone() - } else { - config.omicron_path.join("smf/sled-agent/non-gimlet/config-rss.toml") - }; - let dst = format!( - "{}@{}:{}/config-rss.toml", - builder.username, builder.addr, rss_server_dir - ); - - let mut cmd = rsync_common(); - cmd.arg(&src).arg(&dst); - - let status = - cmd.status().context(format!("Failed to run command: ({:?})", cmd))?; - if !status.success() { - return Err(FlingError::FailedSync { src: src.to_string(), dst }.into()); - } - - Ok(()) -} - -fn single_server_install( - config: &Config, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, - pkg_dir: &str, - builder: &Server, - server_name: &str, -) -> Result<()> { - let server = &config.servers[server_name]; - - println!( - "COPYING packages from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_package_artifacts_to_staging(config, pkg_dir, builder, server)?; - - println!( - "COPYING deploy tool from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_omicron_package_binary_to_staging(config, builder, server)?; - - println!( - "COPYING manifest from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_package_manifest_to_staging(config, builder, server)?; - - println!("UNPACKING packages on deploy server ({})", server_name); - run_omicron_package_unpack_from_staging( - config, - server, - &artifact_dir, - &install_dir, - )?; - - println!( - "COPYING overlay files from builder ({}) -> deploy server ({})", - builder.addr, server_name - ); - copy_overlay_files_to_staging( - config, - pkg_dir, - builder, - server, - server_name, - )?; - - println!("INSTALLING overlay files into the install directory of the deploy server ({})", server_name); - install_overlay_files_from_staging(config, server, &install_dir)?; - - println!("STARTING services on the deploy server ({})", server_name); - run_omicron_package_activate_from_staging(config, server, &install_dir) -} - 
-// Copy package artifacts as a result of `omicron-package package` from the -// builder to the deployment server staging directory. -// -// This staging directory acts as an intermediate location where -// packages may reside prior to being installed. -fn copy_package_artifacts_to_staging( - config: &Config, - pkg_dir: &str, - builder: &Server, - destination: &Server, -) -> Result<()> { - let cmd = format!( - "rsync -avz -e 'ssh -o StrictHostKeyChecking=no' \ - --include 'out/' \ - --include 'out/*.tar' \ - --include 'out/*.tar.gz' \ - --exclude '*' \ - {} {}@{}:{}", - pkg_dir, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn copy_omicron_package_binary_to_staging( - config: &Config, - builder: &Server, - destination: &Server, -) -> Result<()> { - let mut bin_path = Utf8PathBuf::from(&config.builder.omicron_path); - bin_path.push(format!( - "target/{}/omicron-package", - if config.debug { "debug" } else { "release" } - )); - let cmd = format!( - "rsync -avz {} {}@{}:{}", - bin_path, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn copy_package_manifest_to_staging( - config: &Config, - builder: &Server, - destination: &Server, -) -> Result<()> { - let mut path = Utf8PathBuf::from(&config.builder.omicron_path); - path.push("package-manifest.toml"); - let cmd = format!( - "rsync {} {}@{}:{}", - path, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn run_omicron_package_activate_from_staging( - config: &Config, - destination: &Server, - install_dir: &Utf8Path, -) -> Result<()> { - // Run `omicron-package activate` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package activate --out {}", - 
config.deployment.staging_dir, install_dir, - ); - - println!("$ {}", cmd); - ssh_exec(destination, &cmd, SshStrategy::Forward) -} - -fn run_omicron_package_unpack_from_staging( - config: &Config, - destination: &Server, - artifact_dir: &Utf8Path, - install_dir: &Utf8Path, -) -> Result<()> { - let mut deployment_src = Utf8PathBuf::from(&config.deployment.staging_dir); - deployment_src.push(&artifact_dir); - - // Run `omicron-package unpack` on the deployment server - let cmd = format!( - "cd {} && pfexec ./omicron-package unpack --in {} --out {}", - config.deployment.staging_dir, deployment_src, install_dir, - ); - - println!("$ {}", cmd); - ssh_exec(destination, &cmd, SshStrategy::Forward) -} - -fn copy_overlay_files_to_staging( - config: &Config, - pkg_dir: &str, - builder: &Server, - destination: &Server, - destination_name: &str, -) -> Result<()> { - let cmd = format!( - "rsync -avz {}/overlay/{}/ {}@{}:{}/overlay/", - pkg_dir, - destination_name, - destination.username, - destination.addr, - config.deployment.staging_dir - ); - println!("$ {}", cmd); - ssh_exec(builder, &cmd, SshStrategy::Forward) -} - -fn install_overlay_files_from_staging( - config: &Config, - destination: &Server, - install_dir: &Utf8Path, -) -> Result<()> { - let cmd = format!( - "pfexec cp -r {}/overlay/* {}", - config.deployment.staging_dir, install_dir - ); - println!("$ {}", cmd); - ssh_exec(&destination, &cmd, SshStrategy::NoForward) -} - -fn ssh_exec( - server: &Server, - remote_cmd: &str, - strategy: SshStrategy, -) -> Result<()> { - let remote_cmd = if strategy.source_profile() { - // Source .profile, so we have access to cargo. Rustup installs knowledge - // about the cargo path here. - String::from(". 
$HOME/.profile && ") + remote_cmd - } else { - remote_cmd.into() - }; - - let mut cmd = Command::new("ssh"); - if strategy.forward_agent() { - cmd.arg("-A"); - } - cmd.arg("-o") - .arg("StrictHostKeyChecking=no") - .arg("-l") - .arg(&server.username) - .arg(&server.addr) - .arg(&remote_cmd); - - // If the builder is the same as the client, this will likely not be set, - // as the keys will reside on the builder. - if let Some(auth_sock) = std::env::var_os("SSH_AUTH_SOCK") { - cmd.env("SSH_AUTH_SOCK", auth_sock); - } - let exit_status = cmd - .status() - .context(format!("Failed to run {} on {}", remote_cmd, server.addr))?; - if !exit_status.success() { - anyhow::bail!("Command failed: {}", exit_status); - } - - Ok(()) -} - -fn validate_servers( - chosen: &BTreeSet, - all: &BTreeMap, -) -> Result<(), FlingError> { - let all = all.keys().cloned().collect(); - let diff: Vec = chosen.difference(&all).cloned().collect(); - if !diff.is_empty() { - Err(FlingError::InvalidServers(diff)) - } else { - Ok(()) - } -} - -fn validate_absolute_path( - path: &Utf8Path, - field: &'static str, -) -> Result<(), FlingError> { - if path.is_absolute() || path.starts_with("$HOME") { - Ok(()) - } else { - Err(FlingError::NotAbsolutePath { field }) - } -} - -fn validate(config: &Config) -> Result<(), FlingError> { - validate_absolute_path(&config.omicron_path, "omicron_path")?; - validate_absolute_path( - &config.builder.omicron_path, - "builder.omicron_path", - )?; - validate_absolute_path( - &config.deployment.staging_dir, - "deployment.staging_dir", - )?; - - validate_servers( - &BTreeSet::from([ - config.builder.server.clone(), - config.deployment.rss_server.clone(), - ]), - &config.servers, - ) -} - -fn main() -> Result<()> { - let args = Args::try_parse()?; - let config = parse::<_, Config>(args.config)?; - - validate(&config)?; - - match args.subcommand { - SubCommand::Exec { cmd, servers } => { - do_exec(&config, cmd, servers)?; - } - SubCommand::Sync => do_sync(&config)?, - 
SubCommand::InstallPrereqs => do_install_prereqs(&config)?, - SubCommand::Builder(BuildCommand::Target { .. }) => { - todo!("Setting target not supported through thing-flinger") - } - SubCommand::Builder(BuildCommand::Package { .. }) => { - do_package(&config, args.artifact_dir)?; - } - SubCommand::Builder(BuildCommand::Stamp { .. }) => { - anyhow::bail!("Distributed package stamping not supported") - } - SubCommand::Builder(BuildCommand::Check) => do_check(&config)?, - SubCommand::Builder(BuildCommand::Dot) => { - do_dot(&config)?; - } - SubCommand::Deploy(DeployCommand::Install { install_dir }) => { - do_build_minimal(&config)?; - do_install(&config, &args.artifact_dir, &install_dir); - } - SubCommand::Deploy(DeployCommand::Uninstall) => { - do_build_minimal(&config)?; - do_uninstall(&config)?; - } - SubCommand::Deploy(DeployCommand::Clean { install_dir }) => { - do_build_minimal(&config)?; - do_clean(&config, args.artifact_dir, install_dir)?; - } - // TODO: It doesn't really make sense to allow the user direct access - // to these low level operations in thing-flinger. Should we not use - // the DeployCommand from omicron-package directly? 
- SubCommand::Deploy(_) => anyhow::bail!("Unsupported action"), - SubCommand::Overlay => do_overlay(&config)?, - } - Ok(()) -} diff --git a/internal-dns/src/names.rs b/internal-dns/src/names.rs index e0c9b79555..bffe6e829a 100644 --- a/internal-dns/src/names.rs +++ b/internal-dns/src/names.rs @@ -65,7 +65,7 @@ impl ServiceName { /// Returns the DNS name for this service, ignoring the zone part of the DNS /// name - pub(crate) fn dns_name(&self) -> String { + pub fn dns_name(&self) -> String { match self { ServiceName::Clickhouse | ServiceName::ClickhouseKeeper diff --git a/nexus/db-queries/src/db/on_conflict_ext.rs b/nexus/db-queries/src/db/on_conflict_ext.rs index 4f82c89725..5420e3e7f9 100644 --- a/nexus/db-queries/src/db/on_conflict_ext.rs +++ b/nexus/db-queries/src/db/on_conflict_ext.rs @@ -22,7 +22,7 @@ where Target: DecoratableTarget>, { type AsPartialIndexOutput = - IncompleteOnConflict>; + IncompleteOnConflict>; fn as_partial_index(self) -> Self::AsPartialIndexOutput { #[allow(clippy::disallowed_methods)] @@ -32,5 +32,5 @@ where } } -type DecoratedBool = +type DecoratedAsPartialIndex = >>::FilterOutput; diff --git a/nexus/src/app/background/dns_config.rs b/nexus/src/app/background/dns_config.rs index 959cf1843e..be18ac3612 100644 --- a/nexus/src/app/background/dns_config.rs +++ b/nexus/src/app/background/dns_config.rs @@ -159,7 +159,6 @@ impl BackgroundTask for DnsConfigWatcher { mod test { use crate::app::background::common::BackgroundTask; use crate::app::background::dns_config::DnsConfigWatcher; - use crate::app::background::init::test::read_internal_dns_zone_id; use crate::app::background::init::test::write_test_dns_generation; use assert_matches::assert_matches; use async_bb8_diesel::AsyncRunQueryDsl; @@ -197,9 +196,7 @@ mod test { // Now write generation 2, activate again, and verify that the update // was sent to the watcher. 
- let internal_dns_zone_id = - read_internal_dns_zone_id(&opctx, &datastore).await; - write_test_dns_generation(&datastore, internal_dns_zone_id).await; + write_test_dns_generation(&opctx, &datastore).await; assert_eq!(watcher.borrow().as_ref().unwrap().generation, 1); let value = task.activate(&opctx).await; assert_eq!(watcher.borrow().as_ref().unwrap().generation, 2); diff --git a/nexus/src/app/background/dns_servers.rs b/nexus/src/app/background/dns_servers.rs index 97fb3510b7..8f4cce4ee0 100644 --- a/nexus/src/app/background/dns_servers.rs +++ b/nexus/src/app/background/dns_servers.rs @@ -7,22 +7,15 @@ use super::common::BackgroundTask; use futures::future::BoxFuture; use futures::FutureExt; +use internal_dns::names::ServiceName; +use internal_dns::resolver::Resolver; use nexus_db_model::DnsGroup; -use nexus_db_model::ServiceKind; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::DataStore; -use omicron_common::api::external::DataPageParams; use serde::Serialize; use serde_json::json; -use std::net::{SocketAddr, SocketAddrV6}; -use std::num::NonZeroU32; -use std::sync::Arc; +use std::net::SocketAddr; use tokio::sync::watch; -// This constraint could be relaxed by paginating through the list of servers, -// but we don't expect to have this many servers any time soon. 
-const MAX_DNS_SERVERS: usize = 10; - #[derive(Debug, Clone, Eq, PartialEq, Serialize)] pub struct DnsServersList { pub addresses: Vec, @@ -31,20 +24,17 @@ pub struct DnsServersList { /// Background task that keeps track of the latest list of DNS servers for a DNS /// group pub struct DnsServersWatcher { - datastore: Arc, dns_group: DnsGroup, + resolver: Resolver, last: Option, tx: watch::Sender>, rx: watch::Receiver>, } impl DnsServersWatcher { - pub fn new( - datastore: Arc, - dns_group: DnsGroup, - ) -> DnsServersWatcher { + pub fn new(dns_group: DnsGroup, resolver: Resolver) -> DnsServersWatcher { let (tx, rx) = watch::channel(None); - DnsServersWatcher { datastore, dns_group, last: None, tx, rx } + DnsServersWatcher { dns_group, last: None, tx, rx, resolver } } /// Exposes the latest list of DNS servers for this DNS group @@ -75,58 +65,38 @@ impl BackgroundTask for DnsServersWatcher { }; // Read the latest service configuration for this DNS group. - let service_kind = match self.dns_group { - DnsGroup::Internal => ServiceKind::InternalDns, - DnsGroup::External => ServiceKind::ExternalDns, + let service_name = match self.dns_group { + DnsGroup::Internal => ServiceName::InternalDns, + DnsGroup::External => ServiceName::ExternalDns, }; - let pagparams = DataPageParams { - marker: None, - limit: NonZeroU32::try_from( - u32::try_from(MAX_DNS_SERVERS).unwrap(), - ) - .unwrap(), - direction: dropshot::PaginationOrder::Ascending, + let result = self.resolver.lookup_all_socket_v6(service_name).await; + let addresses = match result { + Err(error) => { + warn!( + &log, + "failed to lookup DNS servers"; + "error" => format!("{:#}", error) + ); + return json!({ + "error": + format!( + "failed to read list of DNS servers: {:#}", + error + ) + }); + } + Ok(addresses) => { + // TODO(eliza): it would be nicer if `Resolver` had a method + // returning an iterator instead of a `Vec`, so we didn't + // have to drain the Vec and then collect it into a new + // one... 
+ addresses.into_iter().map(SocketAddr::V6).collect() + } }; - let result = self - .datastore - .services_list_kind(opctx, service_kind, &pagparams) - .await; - - if let Err(error) = result { - warn!( - &log, - "failed to read list of DNS servers"; - "error" => format!("{:#}", error) - ); - return json!({ - "error": - format!( - "failed to read list of DNS servers: {:#}", - error - ) - }); - } - - let services = result.unwrap(); - if services.len() >= MAX_DNS_SERVERS { - warn!( - &log, - "found {} servers, which is more than MAX_DNS_SERVERS \ - ({}). There may be more that will not be used.", - services.len(), - MAX_DNS_SERVERS - ); - } - - let new_config = DnsServersList { - addresses: services - .into_iter() - .map(|s| SocketAddrV6::new(*s.ip, *s.port, 0, 0).into()) - .collect(), - }; - let new_addrs_dbg = format!("{:?}", new_config); + let new_config = DnsServersList { addresses }; + let new_addrs_dbg = format!("{new_config:?}"); let rv = serde_json::to_value(&new_config).unwrap_or_else(|error| { json!({ @@ -177,119 +147,3 @@ impl BackgroundTask for DnsServersWatcher { .boxed() } } - -#[cfg(test)] -mod test { - use crate::app::background::common::BackgroundTask; - use crate::app::background::dns_servers::DnsServersList; - use crate::app::background::dns_servers::DnsServersWatcher; - use crate::app::background::dns_servers::MAX_DNS_SERVERS; - use assert_matches::assert_matches; - use async_bb8_diesel::AsyncRunQueryDsl; - use diesel::ExpressionMethods; - use diesel::QueryDsl; - use nexus_db_model::DnsGroup; - use nexus_db_queries::context::OpContext; - use nexus_db_queries::db::model::Service; - use nexus_db_queries::db::model::ServiceKind; - use nexus_test_utils_macros::nexus_test; - use std::net::Ipv6Addr; - use std::net::SocketAddrV6; - use uuid::Uuid; - - type ControlPlaneTestContext = - nexus_test_utils::ControlPlaneTestContext; - - #[nexus_test(server = crate::Server)] - async fn test_basic(cptestctx: &ControlPlaneTestContext) { - let nexus = 
&cptestctx.server.apictx().nexus; - let datastore = nexus.datastore(); - let opctx = OpContext::for_tests( - cptestctx.logctx.log.clone(), - datastore.clone(), - ); - - // Verify the initial state. - let mut task = - DnsServersWatcher::new(datastore.clone(), DnsGroup::Internal); - let watcher = task.watcher(); - assert_matches!(*watcher.borrow(), None); - - // The datastore from the ControlPlaneTestContext is initialized with - // one DNS server. - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == 1); - - // If we add another server, we should see it. - { - use nexus_db_queries::db::schema::service::dsl; - diesel::insert_into(dsl::service) - .values(Service::new( - Uuid::new_v4(), - Uuid::new_v4(), - Some(Uuid::new_v4()), - SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0), - ServiceKind::InternalDns, - )) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == 2); - - // If we add MAX_DNS_SERVERS more servers, we should see - // MAX_DNS_SERVERS. - { - use nexus_db_queries::db::schema::service::dsl; - let new_services = (0..u16::try_from(MAX_DNS_SERVERS).unwrap()) - .map(|i| { - Service::new( - Uuid::new_v4(), - Uuid::new_v4(), - Some(Uuid::new_v4()), - SocketAddrV6::new(Ipv6Addr::LOCALHOST, i + 2, 0, 0), - ServiceKind::InternalDns, - ) - }) - .collect::>(); - - diesel::insert_into(dsl::service) - .values(new_services) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.len() == MAX_DNS_SERVERS); - - // Now delete all the servers and try again. 
- { - use nexus_db_queries::db::schema::service::dsl; - diesel::delete( - dsl::service.filter(dsl::kind.eq(ServiceKind::InternalDns)), - ) - .execute_async( - &*datastore.pool_connection_for_tests().await.unwrap(), - ) - .await - .unwrap(); - } - let _ = task.activate(&opctx).await; - assert_matches!(*watcher.borrow(), Some(DnsServersList { - ref addresses - }) if addresses.is_empty()); - } -} diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 9d078f10d0..27e58a298c 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -101,6 +101,7 @@ impl BackgroundTasks { opctx, datastore.clone(), DnsGroup::Internal, + resolver.clone(), &config.dns_internal, ); let (task_external_dns_config, task_external_dns_servers) = init_dns( @@ -108,6 +109,7 @@ impl BackgroundTasks { opctx, datastore.clone(), DnsGroup::External, + resolver.clone(), &config.dns_external, ); @@ -301,6 +303,7 @@ fn init_dns( opctx: &OpContext, datastore: Arc, dns_group: DnsGroup, + resolver: internal_dns::resolver::Resolver, config: &DnsTasksConfig, ) -> (common::TaskHandle, common::TaskHandle) { let dns_group_name = dns_group.to_string(); @@ -321,13 +324,13 @@ fn init_dns( ); // Background task: DNS server list watcher - let dns_servers = dns_servers::DnsServersWatcher::new(datastore, dns_group); + let dns_servers = dns_servers::DnsServersWatcher::new(dns_group, resolver); let dns_servers_watcher = dns_servers.watcher(); let task_name_servers = format!("dns_servers_{}", dns_group); let task_servers = driver.register( task_name_servers.clone(), format!( - "watches list of {} DNS servers stored in CockroachDB", + "watches list of {} DNS servers stored in internal DNS", dns_group, ), config.period_secs_servers, @@ -361,22 +364,17 @@ fn init_dns( #[cfg(test)] pub mod test { - use async_bb8_diesel::AsyncRunQueryDsl; use dropshot::HandlerTaskMode; use nexus_db_model::DnsGroup; - use nexus_db_model::Generation; use 
nexus_db_queries::context::OpContext; + use nexus_db_queries::db::datastore::DnsVersionUpdateBuilder; use nexus_db_queries::db::DataStore; use nexus_test_utils_macros::nexus_test; use nexus_types::internal_api::params as nexus_params; - use nexus_types::internal_api::params::ServiceKind; - use omicron_common::api::external::DataPageParams; use omicron_test_utils::dev::poll; use std::net::SocketAddr; - use std::num::NonZeroU32; use std::time::Duration; use tempfile::TempDir; - use uuid::Uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -423,12 +421,23 @@ pub mod test { .expect("failed to get initial DNS server config"); assert_eq!(config.generation, 1); - // We'll need the id of the internal DNS zone. - let internal_dns_zone_id = - read_internal_dns_zone_id(&opctx, datastore).await; + let internal_dns_srv_name = + internal_dns::ServiceName::InternalDns.dns_name(); + + let initial_srv_record = { + let zone = + config.zones.get(0).expect("DNS config must have a zone"); + let Some(record) = zone.records.get(&internal_dns_srv_name) else { + panic!("zone must have a record for {internal_dns_srv_name}") + }; + match record.get(0) { + Some(dns_service_client::types::DnsRecord::Srv(srv)) => srv, + record => panic!("expected a SRV record for {internal_dns_srv_name}, found {record:?}"), + } + }; // Now spin up another DNS server, add it to the list of servers, and - // make sure that DNS gets propagated to it. Note that we shouldn't + // make sure that DNS gets propagated to it. Note that we shouldn't // have to explicitly activate the background task because inserting a // new service ought to do that for us. let log = &cptestctx.logctx.log; @@ -467,29 +476,76 @@ pub mod test { SocketAddr::V4(_) => panic!("expected v6 address"), SocketAddr::V6(a) => a, }; + + // In order to test that DNS gets propagated to a newly-added server, we + // first need to update the source of truth about DNS (the database). 
+ // Then we need to wait for that to get propagated (by this same + // mechanism) to the existing DNS servers. Only then would we expect + // the mechanism to see the new DNS server and then propagate + // configuration to it. + let update = { + use nexus_params::{DnsRecord, Srv}; + + let target = "my-great-dns-server.host"; + + let mut update = test_dns_update_builder(); + update.remove_name(internal_dns_srv_name.clone()).unwrap(); + update + .add_name( + internal_dns_srv_name, + vec![ + DnsRecord::Srv(Srv { + prio: 0, + weight: 0, + port: new_dns_addr.port(), + target: format!( + "{target}.control-plane.oxide.internal" + ), + }), + DnsRecord::Srv(initial_srv_record.clone()), + ], + ) + .unwrap(); + update + .add_name( + target.to_string(), + vec![DnsRecord::Aaaa(*new_dns_addr.ip())], + ) + .unwrap(); + update + }; + write_dns_update(&opctx, datastore, update).await; + info!(&cptestctx.logctx.log, "updated new dns records"); + + // Activate the internal DNS propagation pipeline. nexus - .upsert_service( - &opctx, - Uuid::new_v4(), - cptestctx.sled_agent.sled_agent.id, - Some(Uuid::new_v4()), - new_dns_addr, - ServiceKind::InternalDns.into(), - ) - .await - .unwrap(); + .background_tasks + .activate(&nexus.background_tasks.task_internal_dns_config); + + wait_propagate_dns( + &cptestctx.logctx.log, + "initial", + initial_dns_dropshot_server.local_addr(), + 2, + ) + .await; + + // Discover the new internal DNS server from internal DNS. + nexus + .background_tasks + .activate(&nexus.background_tasks.task_internal_dns_servers); wait_propagate_dns( &cptestctx.logctx.log, "new", new_dns_dropshot_server.local_addr(), - 1, + 2, ) .await; - // Now, write version 2 of the internal DNS configuration with one + // Now, write version 3 of the internal DNS configuration with one // additional record. - write_test_dns_generation(datastore, internal_dns_zone_id).await; + write_test_dns_generation(&opctx, datastore).await; // Activate the internal DNS propagation pipeline. 
nexus @@ -501,7 +557,7 @@ pub mod test { &cptestctx.logctx.log, "initial", initial_dns_dropshot_server.local_addr(), - 2, + 3, ) .await; @@ -509,7 +565,7 @@ pub mod test { &cptestctx.logctx.log, "new", new_dns_dropshot_server.local_addr(), - 2, + 3, ) .await; } @@ -522,15 +578,16 @@ pub mod test { generation: u64, ) { println!( - "waiting for propagation of generation {} to {} DNS server ({})", - generation, label, addr + "waiting for propagation of generation {generation} to {label} \ + DNS server ({addr})", ); let client = dns_service_client::Client::new( &format!("http://{}", addr), log.clone(), ); - poll::wait_for_condition( + let poll_max = Duration::from_secs(30); + let result = poll::wait_for_condition( || async { match client.dns_config_get().await { Err(error) => { @@ -548,87 +605,51 @@ pub mod test { } }, &Duration::from_millis(50), - &Duration::from_secs(30), + &poll_max, ) - .await - .expect("DNS config not propagated in expected time"); + .await; + if let Err(err) = result { + panic!( + "DNS generation {generation} not propagated to \ + {label} DNS server ({addr}) within {poll_max:?}: {err}" + ); + } else { + println!( + "DNS generation {generation} propagated to {label} \ + DNS server ({addr}) successfully." 
+ ); + } } - pub(crate) async fn write_test_dns_generation( + pub(crate) async fn write_dns_update( + opctx: &OpContext, datastore: &DataStore, - internal_dns_zone_id: Uuid, + update: DnsVersionUpdateBuilder, ) { - { - let conn = datastore.pool_connection_for_tests().await.unwrap(); - let _: Result<(), _> = datastore - .transaction_retry_wrapper("write_test_dns_generation") - .transaction(&conn, |conn| async move { - { - use nexus_db_queries::db::model::DnsVersion; - use nexus_db_queries::db::schema::dns_version::dsl; - - diesel::insert_into(dsl::dns_version) - .values(DnsVersion { - dns_group: DnsGroup::Internal, - version: Generation(2u32.try_into().unwrap()), - time_created: chrono::Utc::now(), - creator: String::from("test suite"), - comment: String::from("test suite"), - }) - .execute_async(&conn) - .await - .unwrap(); - } - - { - use nexus_db_queries::db::model::DnsName; - use nexus_db_queries::db::schema::dns_name::dsl; - - diesel::insert_into(dsl::dns_name) - .values( - DnsName::new( - internal_dns_zone_id, - String::from("we-got-beets"), - Generation(2u32.try_into().unwrap()), - None, - vec![nexus_params::DnsRecord::Aaaa( - "fe80::3".parse().unwrap(), - )], - ) - .unwrap(), - ) - .execute_async(&conn) - .await - .unwrap(); - } - - Ok(()) - }) - .await; - } + let conn = datastore.pool_connection_for_tests().await.unwrap(); + info!(opctx.log, "writing DNS update..."); + datastore.dns_update(opctx, &conn, update).await.unwrap(); } - pub(crate) async fn read_internal_dns_zone_id( + pub(crate) async fn write_test_dns_generation( opctx: &OpContext, datastore: &DataStore, - ) -> Uuid { - let dns_zones = datastore - .dns_zones_list( - &opctx, - DnsGroup::Internal, - &DataPageParams { - marker: None, - direction: dropshot::PaginationOrder::Ascending, - limit: NonZeroU32::new(2).unwrap(), - }, + ) { + let mut update = test_dns_update_builder(); + update + .add_name( + "we-got-beets".to_string(), + vec![nexus_params::DnsRecord::Aaaa("fe80::3".parse().unwrap())], ) - 
.await .unwrap(); - assert_eq!( - dns_zones.len(), - 1, - "expected exactly one internal DNS zone" - ); - dns_zones[0].id + write_dns_update(opctx, datastore, update).await + } + + fn test_dns_update_builder() -> DnsVersionUpdateBuilder { + DnsVersionUpdateBuilder::new( + DnsGroup::Internal, + "test suite DNS update".to_string(), + "test suite".to_string(), + ) } } diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 943490ac04..ec3f11dc6f 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -23,9 +23,6 @@ use std::net::SocketAddrV6; use std::sync::Arc; use uuid::Uuid; -#[cfg(test)] -use nexus_db_queries::db::model::ServiceKind; - impl super::Nexus { // Sleds pub fn sled_lookup<'a>( @@ -44,7 +41,7 @@ impl super::Nexus { // unless the DNS lookups at sled-agent are only for rack-local nexuses. pub(crate) async fn upsert_sled( &self, - opctx: &OpContext, + _opctx: &OpContext, id: Uuid, info: SledAgentStartupInfo, ) -> Result<(), Error> { @@ -73,10 +70,16 @@ impl super::Nexus { ); self.db_datastore.sled_upsert(sled).await?; - // Make sure any firewall rules for serices that may - // be running on this sled get plumbed - self.plumb_service_firewall_rules(opctx, &[id]).await?; + Ok(()) + } + pub(crate) async fn sled_request_firewall_rules( + &self, + opctx: &OpContext, + id: Uuid, + ) -> Result<(), Error> { + info!(self.log, "requesting firewall rules"; "sled_uuid" => id.to_string()); + self.plumb_service_firewall_rules(opctx, &[id]).await?; Ok(()) } @@ -276,41 +279,6 @@ impl super::Nexus { Ok(()) } - // Services - - /// Upserts a Service into the database, updating it if it already exists. 
- #[cfg(test)] - pub(crate) async fn upsert_service( - &self, - opctx: &OpContext, - id: Uuid, - sled_id: Uuid, - zone_id: Option, - address: SocketAddrV6, - kind: ServiceKind, - ) -> Result<(), Error> { - info!( - self.log, - "upserting service"; - "sled_id" => sled_id.to_string(), - "service_id" => id.to_string(), - "address" => address.to_string(), - ); - let service = - db::model::Service::new(id, sled_id, zone_id, address, kind); - self.db_datastore.service_upsert(opctx, service).await?; - - if kind == ServiceKind::ExternalDns { - self.background_tasks - .activate(&self.background_tasks.task_external_dns_servers); - } else if kind == ServiceKind::InternalDns { - self.background_tasks - .activate(&self.background_tasks.task_internal_dns_servers); - } - - Ok(()) - } - /// Ensure firewall rules for internal services get reflected on all the relevant sleds. pub(crate) async fn plumb_service_firewall_rules( &self, diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 0122d9b439..eddc834a2a 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -57,6 +57,7 @@ type NexusApiDescription = ApiDescription>; pub(crate) fn internal_api() -> NexusApiDescription { fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> { api.register(sled_agent_put)?; + api.register(sled_firewall_rules_request)?; api.register(switch_put)?; api.register(rack_initialization_complete)?; api.register(physical_disk_put)?; @@ -126,6 +127,31 @@ async fn sled_agent_put( apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await } +/// Request a new set of firewall rules for a sled. +/// +/// This causes Nexus to read the latest set of rules for the sled, +/// and call a Sled endpoint which applies the rules to all OPTE ports +/// that happen to exist. 
+#[endpoint { + method = POST, + path = "/sled-agents/{sled_id}/firewall-rules-update", + }] +async fn sled_firewall_rules_request( + rqctx: RequestContext>, + path_params: Path, +) -> Result { + let apictx = rqctx.context(); + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let path = path_params.into_inner(); + let sled_id = &path.sled_id; + let handler = async { + nexus.sled_request_firewall_rules(&opctx, *sled_id).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await +} + /// Path parameters for Rack requests. #[derive(Deserialize, JsonSchema)] struct RackPathParam { diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 4714b64c52..a1a78deb84 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -835,6 +835,35 @@ } } }, + "/sled-agents/{sled_id}/firewall-rules-update": { + "post": { + "summary": "Request a new set of firewall rules for a sled.", + "description": "This causes Nexus to read the latest set of rules for the sled, and call a Sled endpoint which applies the rules to all OPTE ports that happen to exist.", + "operationId": "sled_firewall_rules_request", + "parameters": [ + { + "in": "path", + "name": "sled_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/sled-agents/{sled_id}/zpools/{zpool_id}": { "put": { "summary": "Report that a pool for a specified sled has come online.", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 395394defb..99156fffd4 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -136,6 +136,30 @@ } } }, + "/bootstore/status": { + "get": { + "summary": "Get the internal state of the local 
bootstore node", + "operationId": "bootstore_status", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BootstoreStatus" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/cockroachdb": { "post": { "summary": "Initializes a CockroachDB cluster", @@ -2531,6 +2555,60 @@ } ] }, + "BootstoreStatus": { + "type": "object", + "properties": { + "accepted_connections": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "established_connections": { + "type": "array", + "items": { + "$ref": "#/components/schemas/EstablishedConnection" + } + }, + "fsm_ledger_generation": { + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "fsm_state": { + "type": "string" + }, + "negotiating_connections": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "network_config_ledger_generation": { + "nullable": true, + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "peers": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + } + }, + "required": [ + "accepted_connections", + "established_connections", + "fsm_ledger_generation", + "fsm_state", + "negotiating_connections", + "peers" + ] + }, "BundleUtilization": { "description": "The portion of a debug dataset used for zone bundles.", "type": "object", @@ -3847,6 +3925,21 @@ "request_id" ] }, + "EstablishedConnection": { + "type": "object", + "properties": { + "addr": { + "type": "string" + }, + "baseboard": { + "$ref": "#/components/schemas/Baseboard" + } + }, + "required": [ + "addr", + "baseboard" + ] + }, "Field": { "description": "A `Field` is a named aspect of a target or metric.", "type": "object", diff --git a/package-manifest.toml b/package-manifest.toml index 9858fd980e..9b72dd7d18 100644 --- a/package-manifest.toml 
+++ b/package-manifest.toml @@ -497,8 +497,8 @@ only_for_targets.image = "standard" # 2. Copy dendrite.tar.gz from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "d58b59b2682183855cdf137ab86a298ecdaaf697" -source.sha256 = "d48408ac1c12332c0bf95834a9372f20d9ff53efa1d79513ea868f616a91cb97" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "eb98985871f321411f7875ef7751dba85ae0dd3034877b63ccb78cedcb96e6e7" output.type = "zone" output.intermediate_only = true @@ -522,8 +522,8 @@ only_for_targets.image = "standard" # 2. Copy the output zone image from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "d58b59b2682183855cdf137ab86a298ecdaaf697" -source.sha256 = "fb1b276c48d320a129376f8d574f61474734de20ed3e6f90408c56a3d205c597" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "cc0429f0d9ce6df94e834cea89cabbdf4d1fbfe623369dd3eb84c5b2677414be" output.type = "zone" output.intermediate_only = true @@ -540,8 +540,8 @@ only_for_targets.image = "standard" # 2. 
Copy dendrite.tar.gz from dendrite/out to omicron/out/dendrite-softnpu.tar.gz source.type = "prebuilt" source.repo = "dendrite" -source.commit = "d58b59b2682183855cdf137ab86a298ecdaaf697" -source.sha256 = "197ed4e1fde14062bf6ae037895c72571b028078ff5691591aec3ad063cad3ea" +source.commit = "3618dd6017b363c5d34399273453cf50b9c9a43e" +source.sha256 = "fa25585fb3aa1a888b76133af3060b859cbea8e53287bb1cc64e70889db37679" output.type = "zone" output.intermediate_only = true diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 0798aed664..5f888504db 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -8,7 +8,7 @@ use super::sled_agent::SledAgent; use crate::bootstrap::early_networking::EarlyNetworkConfig; use crate::bootstrap::params::AddSledRequest; use crate::params::{ - CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, + BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, Inventory, OmicronZonesConfig, SledRole, TimeSync, VpcFirewallRulesEnsureBody, @@ -85,6 +85,7 @@ pub fn api() -> SledApiDescription { api.register(host_os_write_status_get)?; api.register(host_os_write_status_delete)?; api.register(inventory)?; + api.register(bootstore_status)?; Ok(()) } @@ -972,3 +973,23 @@ async fn inventory( let sa = request_context.context(); Ok(HttpResponseOk(sa.inventory()?)) } + +/// Get the internal state of the local bootstore node +#[endpoint { + method = GET, + path = "/bootstore/status", +}] +async fn bootstore_status( + request_context: RequestContext, +) -> Result, HttpError> { + let sa = request_context.context(); + let bootstore = sa.bootstore(); + let status = bootstore + .get_status() + .await + .map_err(|e| { + HttpError::from(omicron_common::api::external::Error::from(e)) + })? 
+ .into(); + Ok(HttpResponseOk(status)) +} diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 52bfb20e5d..7ed1264d9c 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -25,6 +25,7 @@ use sled_hardware::Baseboard; pub use sled_hardware::DendriteAsic; use sled_storage::dataset::DatasetKind; use sled_storage::dataset::DatasetName; +use std::collections::BTreeSet; use std::fmt::{Debug, Display, Formatter, Result as FormatResult}; use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::str::FromStr; @@ -865,3 +866,45 @@ pub struct Inventory { pub usable_physical_ram: ByteCount, pub reservoir_size: ByteCount, } + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct EstablishedConnection { + baseboard: Baseboard, + addr: SocketAddrV6, +} + +impl From<(Baseboard, SocketAddrV6)> for EstablishedConnection { + fn from(value: (Baseboard, SocketAddrV6)) -> Self { + EstablishedConnection { baseboard: value.0, addr: value.1 } + } +} + +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct BootstoreStatus { + pub fsm_ledger_generation: u64, + pub network_config_ledger_generation: Option, + pub fsm_state: String, + pub peers: BTreeSet, + pub established_connections: Vec, + pub accepted_connections: BTreeSet, + pub negotiating_connections: BTreeSet, +} + +impl From for BootstoreStatus { + fn from(value: bootstore::schemes::v0::Status) -> Self { + BootstoreStatus { + fsm_ledger_generation: value.fsm_ledger_generation, + network_config_ledger_generation: value + .network_config_ledger_generation, + fsm_state: value.fsm_state.to_string(), + peers: value.peers, + established_connections: value + .connections + .into_iter() + .map(EstablishedConnection::from) + .collect(), + accepted_connections: value.accepted_connections, + negotiating_connections: value.negotiating_connections, + } + } +} diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index bcc354232e..1a634a6346 
100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -105,6 +105,9 @@ pub enum Error { #[error("Failed to operate on underlay device: {0}")] Underlay(#[from] underlay::Error), + #[error("Failed to request firewall rules")] + FirewallRequest(#[source] nexus_client::Error), + #[error(transparent)] Services(#[from] crate::services::Error), @@ -602,11 +605,24 @@ impl SledAgent { retry_notify( retry_policy_internal_service_aggressive(), || async { - self.inner - .services - .load_services() + // Load as many services as we can, and don't exit immediately + // upon failure... + let load_services_result = + self.inner.services.load_services().await.map_err(|err| { + BackoffError::transient(Error::from(err)) + }); + + // ... and request firewall rule updates for as many services as + // we can. Note that we still make this request even if we only + // partially load some services. + let firewall_result = self + .request_firewall_update() .await - .map_err(|err| BackoffError::transient(err)) + .map_err(|err| BackoffError::transient(err)); + + // Only complete if we have loaded all services and firewall + // rules successfully. + load_services_result.and(firewall_result) }, |err, delay| { warn!( @@ -618,10 +634,6 @@ impl SledAgent { ) .await .unwrap(); // we retry forever, so this can't fail - - // Now that we've initialized the sled services, notify nexus again - // at which point it'll plumb any necessary firewall rules back to us. - self.notify_nexus_about_self(&self.log); } pub(crate) fn switch_zone_underlay_info( @@ -642,7 +654,26 @@ impl SledAgent { &self.inner.start_request } - // Sends a request to Nexus informing it that the current sled exists. + /// Requests firewall rules from Nexus. + /// + /// Does not retry upon failure. 
+ async fn request_firewall_update(&self) -> Result<(), Error> { + let sled_id = self.inner.id; + + self.inner + .nexus_client + .client() + .sled_firewall_rules_request(&sled_id) + .await + .map_err(|err| Error::FirewallRequest(err))?; + Ok(()) + } + + /// Sends a request to Nexus informing it that the current sled exists, + /// with information about the existing set of hardware. + /// + /// Does not block until Nexus is available -- the future created by this + /// function is retried in a queue that is polled in the background. pub(crate) fn notify_nexus_about_self(&self, log: &Logger) { let sled_id = self.inner.id; let nexus_client = self.inner.nexus_client.clone(); @@ -658,7 +689,7 @@ impl SledAgent { let log = log.clone(); let fut = async move { // Notify the control plane that we're up, and continue trying this - // until it succeeds. We retry with an randomized, capped + // until it succeeds. We retry with a randomized, capped // exponential backoff. // // TODO-robustness if this returns a 400 error, we probably want to diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index 69283e142d..6895170e02 100644 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="d58b59b2682183855cdf137ab86a298ecdaaf697" +COMMIT="3618dd6017b363c5d34399273453cf50b9c9a43e" SHA2="aa670165e5b459fab4caba36ae4d382a09264ff5cf6a2dac0dae0a0db39a378e" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index 18e525c0d3..74b379f359 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="d48408ac1c12332c0bf95834a9372f20d9ff53efa1d79513ea868f616a91cb97" -CIDL_SHA256_LINUX_DPD="b6d45d713a879f1d03422eb0bbfa33767da776b05b0d5fbb855b602974ceba01" +CIDL_SHA256_ILLUMOS="eb98985871f321411f7875ef7751dba85ae0dd3034877b63ccb78cedcb96e6e7" +CIDL_SHA256_LINUX_DPD="cb9a1978d1fe3a3f2391757f80436d8cc87c0041161652ad2234e7cf83e9ae36"
CIDL_SHA256_LINUX_SWADM="b7e737be56a8a815a95624f0b5c42ce1e339b07feeae7b3d7b9b4bc17c204245"