From 9f4ba0688873b6adf05e4c7359bf76ad91676be4 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 27 Aug 2024 18:06:57 -0700 Subject: [PATCH] add "live tests" (#6427) --- .config/nextest.toml | 14 +- .github/buildomat/build-and-test.sh | 6 + Cargo.lock | 43 ++++ Cargo.toml | 6 + README.adoc | 2 + dev-tools/xtask/Cargo.toml | 2 + dev-tools/xtask/src/check_workspace_deps.rs | 1 + dev-tools/xtask/src/clippy.rs | 25 +- dev-tools/xtask/src/common.rs | 34 +++ dev-tools/xtask/src/live_tests.rs | 156 ++++++++++++ dev-tools/xtask/src/main.rs | 6 + end-to-end-tests/README.adoc | 2 + live-tests/Cargo.toml | 41 ++++ live-tests/README.adoc | 78 ++++++ live-tests/build.rs | 10 + live-tests/macros/Cargo.toml | 16 ++ live-tests/macros/src/lib.rs | 86 +++++++ live-tests/tests/common/mod.rs | 249 ++++++++++++++++++++ live-tests/tests/common/reconfigurator.rs | 103 ++++++++ live-tests/tests/test_nexus_add_remove.rs | 229 ++++++++++++++++++ nexus/db-queries/src/db/datastore/mod.rs | 35 ++- nexus/reconfigurator/preparation/src/lib.rs | 130 +++++----- nexus/src/app/deployment.rs | 63 +---- 23 files changed, 1192 insertions(+), 145 deletions(-) create mode 100644 dev-tools/xtask/src/common.rs create mode 100644 dev-tools/xtask/src/live_tests.rs create mode 100644 live-tests/Cargo.toml create mode 100644 live-tests/README.adoc create mode 100644 live-tests/build.rs create mode 100644 live-tests/macros/Cargo.toml create mode 100644 live-tests/macros/src/lib.rs create mode 100644 live-tests/tests/common/mod.rs create mode 100644 live-tests/tests/common/reconfigurator.rs create mode 100644 live-tests/tests/test_nexus_add_remove.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index 95d4c20102..67cabe9f0e 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -10,7 +10,9 @@ experimental = ["setup-scripts"] [[profile.default.scripts]] # Exclude omicron-dev tests from crdb-seed as we explicitly want to simulate an # environment where the seed file doesn't exist. 
-filter = 'rdeps(nexus-test-utils) - package(omicron-dev)' +# Exclude omicron-live-tests because those don't need this and also don't have +# it available in the environment in which they run. +filter = 'rdeps(nexus-test-utils) - package(omicron-dev) - package(omicron-live-tests)' setup = 'crdb-seed' [profile.ci] @@ -21,18 +23,26 @@ fail-fast = false # invocations of nextest happen. command = 'cargo run -p crdb-seed --profile test' +[test-groups] # The ClickHouse cluster tests currently rely on a hard-coded set of ports for # the nodes in the cluster. We would like to relax this in the future, at which # point this test-group configuration can be removed or at least loosened to # support testing in parallel. For now, enforce strict serialization for all # tests with `replicated` in the name. -[test-groups] clickhouse-cluster = { max-threads = 1 } +# While most Omicron tests operate with their own simulated control plane, the +# live-tests operate on a more realistic, shared control plane and test +# behaviors that conflict with each other. They need to be run serially. +live-tests = { max-threads = 1 } [[profile.default.overrides]] filter = 'package(oximeter-db) and test(replicated)' test-group = 'clickhouse-cluster' +[[profile.default.overrides]] +filter = 'package(omicron-live-tests)' +test-group = 'live-tests' + [[profile.default.overrides]] # These tests can time out under heavy contention. filter = 'binary_id(omicron-nexus::test_all) and test(::schema::)' diff --git a/.github/buildomat/build-and-test.sh b/.github/buildomat/build-and-test.sh index 1e4b655cb9..50bbd8194d 100755 --- a/.github/buildomat/build-and-test.sh +++ b/.github/buildomat/build-and-test.sh @@ -89,6 +89,12 @@ ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose banner doctest ptime -m timeout 1h cargo test --doc --locked --verbose --no-fail-fast +# Build the live-tests. This is only supported on illumos. +# We also can't actually run them here. 
See the README for more details. +if [[ $target_os == "illumos" ]]; then + ptime -m cargo xtask live-tests +fi + # We expect the seed CRDB to be placed here, so we explicitly remove it so the # rmdir check below doesn't get triggered. Nextest doesn't have support for # teardown scripts so this is the best we've got. diff --git a/Cargo.lock b/Cargo.lock index b9ca47dc1e..eaabc31493 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4596,6 +4596,15 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +[[package]] +name = "live-tests-macros" +version = "0.1.0" +dependencies = [ + "omicron-workspace-hack", + "quote", + "syn 2.0.74", +] + [[package]] name = "lock_api" version = "0.4.12" @@ -6066,6 +6075,38 @@ dependencies = [ "uuid", ] +[[package]] +name = "omicron-live-tests" +version = "0.1.0" +dependencies = [ + "anyhow", + "assert_matches", + "dropshot 0.10.2-dev", + "futures", + "internal-dns", + "live-tests-macros", + "nexus-client", + "nexus-config", + "nexus-db-model", + "nexus-db-queries", + "nexus-reconfigurator-planning", + "nexus-reconfigurator-preparation", + "nexus-sled-agent-shared", + "nexus-types", + "omicron-common", + "omicron-rpaths", + "omicron-test-utils", + "omicron-workspace-hack", + "pq-sys", + "reqwest", + "serde", + "slog", + "slog-error-chain", + "textwrap", + "tokio", + "uuid", +] + [[package]] name = "omicron-nexus" version = "0.1.0" @@ -12305,6 +12346,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", + "camino-tempfile", "cargo_metadata", "cargo_toml", "clap", @@ -12313,6 +12355,7 @@ dependencies = [ "serde", "swrite", "tabled", + "textwrap", "toml 0.8.19", "usdt", ] diff --git a/Cargo.toml b/Cargo.toml index 5f74b259b4..c8df0dc17b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,8 @@ members = [ "internal-dns", "ipcc", "key-manager", + "live-tests", + "live-tests/macros", "nexus", "nexus-config", 
"nexus-sled-agent-shared", @@ -170,6 +172,9 @@ default-members = [ "internal-dns", "ipcc", "key-manager", + # Do not include live-tests in the list of default members because its tests + # only work in a deployed system. The macros can be here, though. + "live-tests/macros", "nexus", "nexus-config", "nexus-sled-agent-shared", @@ -403,6 +408,7 @@ libc = "0.2.158" libfalcon = { git = "https://github.com/oxidecomputer/falcon", rev = "e69694a1f7cc9fe31fab27f321017280531fb5f7" } libnvme = { git = "https://github.com/oxidecomputer/libnvme", rev = "dd5bb221d327a1bc9287961718c3c10d6bd37da0" } linear-map = "1.2.0" +live-tests-macros = { path = "live-tests/macros" } macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" mockall = "0.13" diff --git a/README.adoc b/README.adoc index d48a5c9736..c2473a7a07 100644 --- a/README.adoc +++ b/README.adoc @@ -62,6 +62,8 @@ Nextest https://github.com/nextest-rs/nextest/issues/16[does not support doctest Similarly, you can run tests inside a https://github.com/oxidecomputer/falcon[Falcon] based VM. This is described in the `test-utils` https://github.com/oxidecomputer/omicron/tree/main/test-utils[README]. +There's also a xref:./live-tests/README.adoc[`live-tests`] test suite that can be run by hand in a _deployed_ Omicron system. + === rustfmt and clippy You can **format the code** using `cargo fmt`. Make sure to run this before pushing changes. The CI checks that the code is correctly formatted. diff --git a/dev-tools/xtask/Cargo.toml b/dev-tools/xtask/Cargo.toml index ec1b7825c6..508d0c73ee 100644 --- a/dev-tools/xtask/Cargo.toml +++ b/dev-tools/xtask/Cargo.toml @@ -24,6 +24,7 @@ workspace = true # downstream binaries do depend on it.) 
anyhow.workspace = true camino.workspace = true +camino-tempfile.workspace = true cargo_toml = "0.20" cargo_metadata.workspace = true clap.workspace = true @@ -32,5 +33,6 @@ macaddr.workspace = true serde.workspace = true swrite.workspace = true tabled.workspace = true +textwrap.workspace = true toml.workspace = true usdt.workspace = true diff --git a/dev-tools/xtask/src/check_workspace_deps.rs b/dev-tools/xtask/src/check_workspace_deps.rs index 73d5643ffb..92e83d8724 100644 --- a/dev-tools/xtask/src/check_workspace_deps.rs +++ b/dev-tools/xtask/src/check_workspace_deps.rs @@ -128,6 +128,7 @@ pub fn run_cmd() -> Result<()> { // The tests here should not be run by default, as they require // a running control plane. "end-to-end-tests", + "omicron-live-tests", ] .contains(&package.name.as_str()) .then_some(&package.id) diff --git a/dev-tools/xtask/src/clippy.rs b/dev-tools/xtask/src/clippy.rs index 7924a05574..229d0e126e 100644 --- a/dev-tools/xtask/src/clippy.rs +++ b/dev-tools/xtask/src/clippy.rs @@ -4,7 +4,8 @@ //! Subcommand: cargo xtask clippy -use anyhow::{bail, Context, Result}; +use crate::common::run_subcmd; +use anyhow::Result; use clap::Parser; use std::process::Command; @@ -51,25 +52,5 @@ pub fn run_cmd(args: ClippyArgs) -> Result<()> { .arg("--deny") .arg("warnings"); - eprintln!( - "running: {:?} {}", - &cargo, - command - .get_args() - .map(|arg| format!("{:?}", arg.to_str().unwrap())) - .collect::>() - .join(" ") - ); - - let exit_status = command - .spawn() - .context("failed to spawn child process")? - .wait() - .context("failed to wait for child process")?; - - if !exit_status.success() { - bail!("clippy failed: {}", exit_status); - } - - Ok(()) + run_subcmd(command) } diff --git a/dev-tools/xtask/src/common.rs b/dev-tools/xtask/src/common.rs new file mode 100644 index 0000000000..03b17a560f --- /dev/null +++ b/dev-tools/xtask/src/common.rs @@ -0,0 +1,34 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Common xtask command helpers + +use anyhow::{bail, Context, Result}; +use std::process::Command; + +/// Runs the given command, printing some basic debug information around it, and +/// failing with an error message if the command does not exit successfully +pub fn run_subcmd(mut command: Command) -> Result<()> { + eprintln!( + "running: {} {}", + command.get_program().to_str().unwrap(), + command + .get_args() + .map(|arg| format!("{:?}", arg.to_str().unwrap())) + .collect::>() + .join(" ") + ); + + let exit_status = command + .spawn() + .context("failed to spawn child process")? + .wait() + .context("failed to wait for child process")?; + + if !exit_status.success() { + bail!("failed: {}", exit_status); + } + + Ok(()) +} diff --git a/dev-tools/xtask/src/live_tests.rs b/dev-tools/xtask/src/live_tests.rs new file mode 100644 index 0000000000..35d40343ee --- /dev/null +++ b/dev-tools/xtask/src/live_tests.rs @@ -0,0 +1,156 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Subcommand: cargo xtask live-tests + +use crate::common::run_subcmd; +use anyhow::{bail, Context, Result}; +use clap::Parser; +use std::process::Command; + +#[derive(Parser)] +pub struct Args {} + +pub fn run_cmd(_args: Args) -> Result<()> { + const NAME: &str = "live-tests-archive"; + + // The live tests operate in deployed environments, which always run + // illumos. Bail out quickly if someone tries to run this on a system whose + // binaries won't be usable. (We could compile this subcommand out + // altogether on non-illumos systems, but it seems more confusing to be + // silently missing something you might expect to be there. Plus, you can + // still check and even build *this* code on non-illumos systems.) 
+ if cfg!(not(target_os = "illumos")) { + bail!("live-tests archive can only be built on illumos systems"); + } + + let tmpdir_root = + camino_tempfile::tempdir().context("creating temporary directory")?; + let final_tarball = camino::Utf8PathBuf::try_from( + std::env::current_dir() + .map(|d| d.join("target")) + .context("getting current directory")?, + ) + .context("non-UTF-8 current directory")? + .join(format!("{}.tgz", NAME)); + let proto_root = tmpdir_root.path().join(NAME); + let nextest_archive_file = proto_root.join("omicron-live-tests.tar.zst"); + + eprintln!("using temporary directory: {}", tmpdir_root.path()); + eprintln!("will create archive file: {}", nextest_archive_file); + eprintln!("output tarball: {}", final_tarball); + eprintln!(); + + std::fs::create_dir(&proto_root) + .with_context(|| format!("mkdir {:?}", &proto_root))?; + + let cargo = + std::env::var("CARGO").unwrap_or_else(|_| String::from("cargo")); + let mut command = Command::new(&cargo); + + command.arg("nextest"); + command.arg("archive"); + command.arg("--package"); + command.arg("omicron-live-tests"); + command.arg("--archive-file"); + command.arg(&nextest_archive_file); + run_subcmd(command)?; + + // Using nextest archives requires that the source be separately transmitted + // to the system where the tests will be run. We're trying to automate + // that. So let's bundle up the source and the nextest archive into one big + // tarball. But which source files do we bundle? We need: + // + // - Cargo.toml (nextest expects to find this) + // - .config/nextest.toml (nextest's configuration, which is used while + // running the tests) + // - live-tests (this is where the tests live, and they might expect stuff + // that exists in here like expectorate files) + // + // plus the nextest archive file. + // + // To avoid creating a tarbomb, we want all the files prefixed with + // "live-tests-archive/".
There's no great way to do this with the illumos + // tar(1) except to create a temporary directory called "live-tests-archive" + // that contains the files and then tar'ing up that. + // + // Ironically, an easy way to construct that directory is with tar(1). + let mut command = Command::new("bash"); + command.arg("-c"); + command.arg(format!( + "tar cf - Cargo.toml .config/nextest.toml live-tests | \ + tar xf - -C {:?}", + &proto_root + )); + run_subcmd(command)?; + + let mut command = Command::new("tar"); + command.arg("cf"); + command.arg(&final_tarball); + command.arg("-C"); + command.arg(tmpdir_root.path()); + command.arg(NAME); + run_subcmd(command)?; + + drop(tmpdir_root); + + eprint!("created: "); + println!("{}", &final_tarball); + eprintln!("\nTo use this:\n"); + eprintln!( + "1. Copy the tarball to the switch zone in a deployed Omicron system.\n" + ); + let raw = &[ + "scp \\", + &format!("{} \\", &final_tarball), + "root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root", + ] + .join("\n"); + let text = textwrap::wrap( + &raw, + textwrap::Options::new(160) + .initial_indent(" e.g., ") + .subsequent_indent(" "), + ); + eprintln!("{}\n", text.join("\n")); + eprintln!("2. Copy the `cargo-nextest` binary to the same place.\n"); + let raw = &[ + "scp \\", + "$(which cargo-nextest) \\", + "root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root", + ] + .join("\n"); + let text = textwrap::wrap( + &raw, + textwrap::Options::new(160) + .initial_indent(" e.g., ") + .subsequent_indent(" "), + ); + eprintln!("{}\n", text.join("\n")); + eprintln!("3. On that system, unpack the tarball with:\n"); + eprintln!(" tar xzf {}\n", final_tarball.file_name().unwrap()); + eprintln!("4. On that system, run tests with:\n"); + // TMPDIR=/var/tmp puts stuff on disk, cached as needed, rather than the + // default /tmp which requires that stuff be in-memory. That can lead to + // great sadness if the tests wind up writing a lot of data. 
+ let raw = &[ + "TMPDIR=/var/tmp ./cargo-nextest nextest run \\", + &format!( + "--archive-file {}/{} \\", + NAME, + nextest_archive_file.file_name().unwrap() + ), + &format!("--workspace-remap {}", NAME), + ] + .join("\n"); + let text = textwrap::wrap( + &raw, + textwrap::Options::new(160) + .initial_indent(" ") + .subsequent_indent(" "), + ); + eprintln!("{}\n", text.join("\n")); + + Ok(()) +} diff --git a/dev-tools/xtask/src/main.rs b/dev-tools/xtask/src/main.rs index 02fd05a198..9880adeb67 100644 --- a/dev-tools/xtask/src/main.rs +++ b/dev-tools/xtask/src/main.rs @@ -16,8 +16,10 @@ use std::process::Command; mod check_features; mod check_workspace_deps; mod clippy; +mod common; #[cfg_attr(not(target_os = "illumos"), allow(dead_code))] mod external; +mod live_tests; mod usdt; #[cfg(target_os = "illumos")] @@ -59,6 +61,9 @@ enum Cmds { /// Download binaries, OpenAPI specs, and other out-of-repo utilities. Download(external::External), + /// Create a bundle of live tests + LiveTests(live_tests::Args), + /// Utilities for working with MGS. MgsDev(external::External), /// Utilities for working with Omicron. @@ -127,6 +132,7 @@ fn main() -> Result<()> { external.exec_bin("xtask-downloader") } } + Cmds::LiveTests(args) => live_tests::run_cmd(args), Cmds::MgsDev(external) => external.exec_bin("mgs-dev"), Cmds::OmicronDev(external) => external.exec_bin("omicron-dev"), Cmds::Openapi(external) => external.exec_bin("openapi-manager"), diff --git a/end-to-end-tests/README.adoc b/end-to-end-tests/README.adoc index b9766db809..3e31f2b382 100644 --- a/end-to-end-tests/README.adoc +++ b/end-to-end-tests/README.adoc @@ -4,6 +4,8 @@ These tests run in Buildomat. They are built by the xref:../.github/buildomat/jo This package is not built or run by default (it is excluded from `default-members` in xref:../Cargo.toml[]). +See also: xref:../live-tests/README.adoc[omicron-live-tests]. + == Running these tests on your machine 1. xref:../docs/how-to-run.adoc[Make yourself a Gimlet]. 
diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml new file mode 100644 index 0000000000..e0eaf2c338 --- /dev/null +++ b/live-tests/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "omicron-live-tests" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[build-dependencies] +omicron-rpaths.workspace = true + +[dependencies] +# See omicron-rpaths for more about the "pq-sys" dependency. +pq-sys = "*" +omicron-workspace-hack.workspace = true + +[dev-dependencies] +anyhow.workspace = true +assert_matches.workspace = true +dropshot.workspace = true +futures.workspace = true +internal-dns.workspace = true +live-tests-macros.workspace = true +nexus-client.workspace = true +nexus-config.workspace = true +nexus-db-model.workspace = true +nexus-db-queries.workspace = true +nexus-reconfigurator-planning.workspace = true +nexus-reconfigurator-preparation.workspace = true +nexus-sled-agent-shared.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +omicron-test-utils.workspace = true +reqwest.workspace = true +serde.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +textwrap.workspace = true +tokio.workspace = true +uuid.workspace = true + +[lints] +workspace = true diff --git a/live-tests/README.adoc b/live-tests/README.adoc new file mode 100644 index 0000000000..9ea3a93e48 --- /dev/null +++ b/live-tests/README.adoc @@ -0,0 +1,78 @@ += Omicron live tests + +The `omicron-live-tests` package contains automated tests that operate in the context of an already-deployed "real" Oxide system (e.g., `a4x2` or our `london` or `madrid` test environments). This is a home for automated tests for all kinds of Reconfigurator behavior (e.g., add/expunge of all zones, add/expunge sled, upgrades, etc.). It can probably be used for non-Reconfigurator behavior, too. 
+ +This package is not built or tested by default because the tests generally can't work in a dev environment and there's no way to have `cargo` build and check them but not run the tests by default. + +== Why a separate test suite? + +What makes these tests different from the rest of the test suite is that they require connectivity to the underlay network of the deployed system and they make API calls to various components in that system and they assume that this will behave like a real production system. By contrast, the normal tests instead _set up_ a bunch of components using simulated sled agents and localhost networking, which is great for starting from a predictable state and running tests in parallel, but the simulated sled agents and networking make it impossible to exercise quite a lot of Reconfigurator's functionality. + +There are also the `end-to-end-tests`. That environment is more realistic than the main test suite, but not faithful enough for many Reconfigurator tests. + +== Production systems + +There are some safeguards so that these tests won't run on production systems: they refuse to run if they find any Oxide-hardware sleds in the system whose serial numbers don't correspond to known test environments. + +== Usage + +These tests are not currently run automatically (though they are _built_ in CI). + +You can run them yourself. First, deploy Omicron using `a4x2` or one of the hardware test rigs. 
In your Omicron workspace, run `cargo xtask live-tests` to build an archive and then follow the instructions: + +``` +$ cargo xtask live-tests + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.96s + Running `target/debug/xtask live-tests` +using temporary directory: /dangerzone/omicron_tmp/.tmp0ItZUD +will create archive file: /dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst +output tarball: /home/dap/omicron-work/target/live-tests-archive.tgz + +running: /home/dap/.rustup/toolchains/1.80.1-x86_64-unknown-illumos/bin/cargo "nextest" "archive" "--package" "omicron-live-tests" "--archive-file" "/dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst" + Finished `test` profile [unoptimized + debuginfo] target(s) in 0.89s +info: experimental features enabled: setup-scripts + Archiving 1 binary, 1 build script output directory, and 1 linked path to /dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst + Archived 35 files to /dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive/omicron-live-tests.tar.zst in 0.31s +running: bash "-c" "tar cf - Cargo.toml .config/nextest.toml live-tests | tar xf - -C \"/dangerzone/omicron_tmp/.tmp0ItZUD/live-tests-archive\"" +running: tar "cf" "/home/dap/omicron-work/target/live-tests-archive.tgz" "-C" "/dangerzone/omicron_tmp/.tmp0ItZUD" "live-tests-archive" +created: /home/dap/omicron-work/target/live-tests-archive.tgz + +To use this: + +1. Copy the tarball to the switch zone in a deployed Omicron system. + + e.g., scp \ + /home/dap/omicron-work/target/live-tests-archive.tgz \ + root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root + +2. Copy the `cargo-nextest` binary to the same place. + + e.g., scp \ + $(which cargo-nextest) \ + root@YOUR_SCRIMLET_GZ_IP:/zone/oxz_switch/root/root + +3. On that system, unpack the tarball with: + + tar xzf live-tests-archive.tgz + +4. 
On that system, run tests with: + + TMPDIR=/var/tmp ./cargo-nextest nextest run \ + --archive-file live-tests-archive/omicron-live-tests.tar.zst \ + --workspace-remap live-tests-archive +``` + +Follow the instructions, run the tests, and you'll see the usual `nextest`-style output: + +``` +root@oxz_switch:~# TMPDIR=/var/tmp ./cargo-nextest nextest run --archive-file live-tests-archive/omicron-live-tests.tar.zst --workspace-remap live-tests-archive + Extracting 1 binary, 1 build script output directory, and 1 linked path to /var/tmp/nextest-archive-Lqx9VZ + Extracted 35 files to /var/tmp/nextest-archive-Lqx9VZ in 1.01s +info: experimental features enabled: setup-scripts + Starting 1 test across 1 binary (run ID: a5fc9163-9dd5-4b23-b89f-55f8f39ebbbc, nextest profile: default) + SLOW [> 60.000s] omicron-live-tests::test_nexus_add_remove test_nexus_add_remove + PASS [ 61.975s] omicron-live-tests::test_nexus_add_remove test_nexus_add_remove +------------ + Summary [ 61.983s] 1 test run: 1 passed (1 slow), 0 skipped +root@oxz_switch:~# +``` diff --git a/live-tests/build.rs b/live-tests/build.rs new file mode 100644 index 0000000000..1ba9acd41c --- /dev/null +++ b/live-tests/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. 
+fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/live-tests/macros/Cargo.toml b/live-tests/macros/Cargo.toml new file mode 100644 index 0000000000..81d094d926 --- /dev/null +++ b/live-tests/macros/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "live-tests-macros" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lib] +proc-macro = true + +[lints] +workspace = true + +[dependencies] +quote.workspace = true +syn = { workspace = true, features = [ "fold", "parsing" ] } +omicron-workspace-hack.workspace = true diff --git a/live-tests/macros/src/lib.rs b/live-tests/macros/src/lib.rs new file mode 100644 index 0000000000..4fdd4029b5 --- /dev/null +++ b/live-tests/macros/src/lib.rs @@ -0,0 +1,86 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Macro to wrap a live test function that automatically creates and cleans up +//! the `LiveTestContext` + +use proc_macro::TokenStream; +use quote::quote; +use syn::{parse_macro_input, ItemFn}; + +/// Define a test function that uses `LiveTestContext` +/// +/// This is usable only within the `omicron-live-tests` crate. +/// +/// Similar to `nexus_test`, this macro lets you define a test function that +/// behaves like `tokio::test` except that it accepts an argument of type +/// `&LiveTestContext`. The `LiveTestContext` is cleaned up on _successful_ +/// return of the test function. On failure, debugging information is +/// deliberately left around. +/// +/// Example usage: +/// +/// ```ignore +/// #[live_test] +/// async fn test_my_test_case(lc: &LiveTestContext) { +/// assert!(true); +/// } +/// ``` +/// +/// We use this instead of implementing Drop on LiveTestContext because we want +/// the teardown to only happen when the test doesn't fail (which causes a panic +/// and unwind). 
+#[proc_macro_attribute] +pub fn live_test(_attrs: TokenStream, input: TokenStream) -> TokenStream { + let input_func = parse_macro_input!(input as ItemFn); + + let mut correct_signature = true; + if input_func.sig.variadic.is_some() + || input_func.sig.inputs.len() != 1 + || input_func.sig.asyncness.is_none() + { + correct_signature = false; + } + + // Verify we're returning an empty tuple + correct_signature &= match input_func.sig.output { + syn::ReturnType::Default => true, + syn::ReturnType::Type(_, ref t) => { + if let syn::Type::Tuple(syn::TypeTuple { elems, .. }) = &**t { + elems.is_empty() + } else { + false + } + } + }; + if !correct_signature { + panic!("func signature must be async fn(&LiveTestContext)"); + } + + let func_ident_string = input_func.sig.ident.to_string(); + let func_ident = input_func.sig.ident.clone(); + let new_block = quote! { + { + #input_func + + let ctx = crate::common::LiveTestContext::new( + #func_ident_string + ).await.expect("setting up LiveTestContext"); + #func_ident(&ctx).await; + ctx.cleanup_successful(); + } + }; + let mut sig = input_func.sig.clone(); + sig.inputs.clear(); + let func = ItemFn { + attrs: input_func.attrs, + vis: input_func.vis, + sig, + block: Box::new(syn::parse2(new_block).unwrap()), + }; + TokenStream::from(quote!( + #[::tokio::test] + #func + )) +} diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs new file mode 100644 index 0000000000..28f677f5ed --- /dev/null +++ b/live-tests/tests/common/mod.rs @@ -0,0 +1,249 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +pub mod reconfigurator; + +use anyhow::{anyhow, ensure, Context}; +use dropshot::test_util::LogContext; +use internal_dns::resolver::Resolver; +use internal_dns::ServiceName; +use nexus_config::PostgresConfigWithUrl; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::SledFilter; +use omicron_common::address::Ipv6Subnet; +use slog::info; +use slog::o; +use std::ffi::OsStr; +use std::net::SocketAddrV6; +use std::path::Component; +use std::sync::Arc; + +/// Contains data and interfaces useful for running tests against an existing +/// deployed control plane +pub struct LiveTestContext { + logctx: LogContext, + opctx: OpContext, + resolver: Resolver, + datastore: Arc, +} + +impl LiveTestContext { + /// Make a new `LiveTestContext` for a test called `test_name`. + pub async fn new( + test_name: &'static str, + ) -> Result { + let logctx = omicron_test_utils::dev::test_setup_log(test_name); + let log = &logctx.log; + let resolver = create_resolver(log)?; + check_execution_environment(&resolver).await?; + let datastore = create_datastore(&log, &resolver).await?; + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + check_hardware_environment(&opctx, &datastore).await?; + Ok(LiveTestContext { logctx, opctx, resolver, datastore }) + } + + /// Clean up this `LiveTestContext` + /// + /// This mainly removes log files created by the test. We do this in this + /// explicit cleanup function rather than on `Drop` because we want the log + /// files preserved on test failure. 
+ pub fn cleanup_successful(self) { + self.logctx.cleanup_successful(); + } + + /// Returns a logger suitable for use in the test + pub fn log(&self) -> &slog::Logger { + &self.logctx.log + } + + /// Returns an `OpContext` suitable for use in tests + pub fn opctx(&self) -> &OpContext { + &self.opctx + } + + /// Returns a `DataStore` pointing at this deployed system's database + pub fn datastore(&self) -> &DataStore { + &self.datastore + } + + /// Returns a client for a Nexus internal API at the given socket address + pub fn specific_internal_nexus_client( + &self, + sockaddr: SocketAddrV6, + ) -> nexus_client::Client { + let url = format!("http://{}", sockaddr); + let log = self.logctx.log.new(o!("nexus_internal_url" => url.clone())); + nexus_client::Client::new(&url, log) + } + + /// Returns a list of clients for the internal APIs for all Nexus instances + /// found in DNS + pub async fn all_internal_nexus_clients( + &self, + ) -> Result, anyhow::Error> { + Ok(self + .resolver + .lookup_all_socket_v6(ServiceName::Nexus) + .await + .context("looking up Nexus in internal DNS")? + .into_iter() + .map(|s| self.specific_internal_nexus_client(s)) + .collect()) + } +} + +fn create_resolver(log: &slog::Logger) -> Result { + // In principle, we should look at /etc/resolv.conf to find the DNS servers. + // In practice, this usually isn't populated today. See + // oxidecomputer/omicron#2122. + // + // However, the address selected below should work for most existing Omicron + // deployments today. That's because while the base subnet is in principle + // configurable in config-rss.toml, it's very uncommon to change it from the + // default value used here. 
+ let subnet = Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); + eprintln!("note: using DNS server for subnet {}", subnet.net()); + internal_dns::resolver::Resolver::new_from_subnet(log.clone(), subnet) + .with_context(|| { + format!("creating DNS resolver for subnet {}", subnet.net()) + }) +} + +/// Creates a DataStore pointing at the CockroachDB cluster that's in DNS +async fn create_datastore( + log: &slog::Logger, + resolver: &Resolver, +) -> Result, anyhow::Error> { + let sockaddrs = resolver + .lookup_all_socket_v6(ServiceName::Cockroach) + .await + .context("resolving CockroachDB")?; + + let url = format!( + "postgresql://root@{}/omicron?sslmode=disable", + sockaddrs + .into_iter() + .map(|a| a.to_string()) + .collect::>() + .join(",") + ) + .parse::() + .context("failed to parse constructed postgres URL")?; + + let db_config = nexus_db_queries::db::Config { url }; + let pool = + Arc::new(nexus_db_queries::db::Pool::new_single_host(log, &db_config)); + DataStore::new_failfast(log, pool) + .await + .context("creating DataStore") + .map(Arc::new) +} + +/// Performs quick checks to determine if the user is running these tests in the +/// wrong place and bails out if so +/// +/// This isn't perfect but seeks to fail fast in obviously bogus environments +/// that someone might accidentally try to run this in. +async fn check_execution_environment( + resolver: &Resolver, +) -> Result<(), anyhow::Error> { + ensure!( + cfg!(target_os = "illumos"), + "live tests can only be run on deployed systems, which run illumos" + ); + + // The only real requirement for these tests is that they're run from a + // place with connectivity to the underlay network of a deployed control + // plane. The easiest way to tell is to look up something in internal DNS. 
+    resolver.lookup_ip(ServiceName::InternalDns).await.map_err(|e| {
+        let text = format!(
+            "check_execution_environment(): failed to look up internal DNS \
+            in the internal DNS servers.\n\n \
+            Are you trying to run this in a development environment? \
+            This test can only be run on deployed systems and only from a \
+            context with connectivity to the underlay network.\n\n \
+            raw error: {}",
+            slog_error_chain::InlineErrorChain::new(&e)
+        );
+        anyhow!("{}", textwrap::wrap(&text, 80).join("\n"))
+    })?;
+
+    // Warn the user if the temporary directory is /tmp. This check is
+    // heuristic. There are other ways they may have specified a tmpfs
+    // temporary directory and we don't claim to catch all of them.
+    //
+    // We could also just go ahead and use /var/tmp, but it's not clear we can
+    // reliably do that at this point (if Rust or other components have cached
+    // TMPDIR) and it would be hard to override.
+    let tmpdir = std::env::temp_dir();
+    let mut tmpdir_components = tmpdir.components().take(2);
+    if let Some(first) = tmpdir_components.next() {
+        if let Some(next) = tmpdir_components.next() {
+            if first == Component::RootDir
+                && next == Component::Normal(OsStr::new("tmp"))
+            {
+                eprintln!(
+                    "WARNING: temporary directory appears to be under /tmp, \
+                    which is generally tmpfs. Consider setting \
+                    TMPDIR=/var/tmp to avoid runaway tests using too much \
+                    memory and swap."
+                );
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Performs additional checks to determine if we're running in an environment
+/// that we believe is safe to run tests
+///
+/// These tests may make arbitrary modifications to the system. We don't want
+/// to run this in dogfood or other pre-production or production environments.
+/// This function uses an allowlist of Oxide serials corresponding to test
+/// environments so that it never accidentally runs on a production system.
+///
+/// Non-Oxide hardware (e.g., PCs, a4x2, etc.) are always allowed.
+async fn check_hardware_environment(
+    opctx: &OpContext,
+    datastore: &DataStore,
+) -> Result<(), anyhow::Error> {
+    const ALLOWED_GIMLET_SERIALS: &[&str] = &[
+        // test rig: "madrid"
+        "BRM42220004",
+        "BRM42220081",
+        "BRM42220007",
+        "BRM42220046",
+        // test rig: "london"
+        "BRM42220036",
+        "BRM42220062",
+        "BRM42220030",
+        "BRM44220007",
+    ];
+
+    // Refuse to operate in an environment that might contain real Oxide
+    // hardware that's not known to be part of a test rig. This is deliberately
+    // conservative.
+    let scary_sleds = datastore
+        .sled_list_all_batched(opctx, SledFilter::Commissioned)
+        .await
+        .context("check_environment: listing commissioned sleds")?
+        .into_iter()
+        .filter_map(|s| {
+            (s.part_number() != "i86pc"
+                && !ALLOWED_GIMLET_SERIALS.contains(&s.serial_number()))
+            .then(|| s.serial_number().to_owned())
+        })
+        .collect::<Vec<_>>();
+    if scary_sleds.is_empty() {
+        info!(&opctx.log, "environment verified");
+        Ok(())
+    } else {
+        Err(anyhow!(
+            "refusing to operate in an environment with an unknown system: {}",
+            scary_sleds.join(", ")
+        ))
+    }
+}
diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs
new file mode 100644
index 0000000000..8f2560bb49
--- /dev/null
+++ b/live-tests/tests/common/reconfigurator.rs
@@ -0,0 +1,103 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//!
Helpers common to Reconfigurator tests + +use anyhow::{ensure, Context}; +use nexus_client::types::BlueprintTargetSet; +use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; +use nexus_types::deployment::{Blueprint, PlanningInput}; +use slog::{debug, info}; + +/// Modify the system by editing the current target blueprint +/// +/// More precisely, this function: +/// +/// - fetches the current target blueprint +/// - creates a new BlueprintBuilder based on it +/// - invokes the caller's `edit_fn`, which may modify the builder however it +/// likes +/// - generates a new blueprint (thus based on the current target) +/// - uploads the new blueprint +/// - sets the new blueprint as the current target +/// - enables the new blueprint +/// +/// ## Errors +/// +/// This function fails if the current target blueprint is not already enabled. +/// That's because a disabled target blueprint means somebody doesn't want +/// Reconfigurator running or doesn't want it using that blueprint. We don't +/// want the test to inadvertently override that behavior. In a typical use +/// case, a developer enables the initial target blueprint before running these +/// tests and then doesn't need to think about it again for the lifetime of +/// their test environment. +pub async fn blueprint_edit_current_target( + log: &slog::Logger, + planning_input: &PlanningInput, + nexus: &nexus_client::Client, + edit_fn: &dyn Fn(&mut BlueprintBuilder) -> Result<(), anyhow::Error>, +) -> Result<(Blueprint, Blueprint), anyhow::Error> { + // Fetch the current target configuration. + info!(log, "editing current target blueprint"); + let target_blueprint = nexus + .blueprint_target_view() + .await + .context("fetch current target config")? 
+ .into_inner(); + debug!(log, "found current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); + ensure!( + target_blueprint.enabled, + "refusing to modify a system with target blueprint disabled" + ); + + // Fetch the actual blueprint. + let blueprint1 = nexus + .blueprint_view(&target_blueprint.target_id) + .await + .context("fetch current target blueprint")? + .into_inner(); + debug!(log, "fetched current target blueprint"; + "blueprint_id" => %target_blueprint.target_id + ); + + // Make a new builder based on that blueprint and use `edit_fn` to edit it. + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint1, + &planning_input, + "test-suite", + ) + .context("creating BlueprintBuilder")?; + + edit_fn(&mut builder)?; + + // Assemble the new blueprint, import it, and make it the new target. + let blueprint2 = builder.build(); + info!(log, "assembled new blueprint based on target"; + "current_target_id" => %target_blueprint.target_id, + "new_blueprint_id" => %blueprint2.id, + ); + nexus + .blueprint_import(&blueprint2) + .await + .context("importing new blueprint")?; + debug!(log, "imported new blueprint"; + "blueprint_id" => %blueprint2.id, + ); + nexus + .blueprint_target_set(&BlueprintTargetSet { + enabled: true, + target_id: blueprint2.id, + }) + .await + .expect("setting new target"); + info!(log, "finished editing target blueprint"; + "old_target_id" => %blueprint1.id, + "new_target_id" => %blueprint2.id, + ); + + Ok((blueprint1, blueprint2)) +} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs new file mode 100644 index 0000000000..70e55b704a --- /dev/null +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -0,0 +1,229 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +mod common; + +use anyhow::Context; +use assert_matches::assert_matches; +use common::reconfigurator::blueprint_edit_current_target; +use common::LiveTestContext; +use futures::TryStreamExt; +use live_tests_macros::live_test; +use nexus_client::types::Saga; +use nexus_client::types::SagaState; +use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; +use nexus_reconfigurator_planning::blueprint_builder::EnsureMultiple; +use nexus_reconfigurator_preparation::PlanningInputFromDb; +use nexus_sled_agent_shared::inventory::ZoneKind; +use nexus_types::deployment::SledFilter; +use omicron_common::address::NEXUS_INTERNAL_PORT; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_test_utils::dev::poll::CondCheckError; +use slog::{debug, info}; +use std::net::SocketAddrV6; +use std::time::Duration; + +// TODO-coverage This test could check other stuff: +// +// - that after adding: +// - the new Nexus appears in external DNS +// - we can _use_ the new Nexus from the outside +// (e.g., using an `oxide_client` using a custom reqwest resolver that +// points only at that one IP so that we can make sure we're always getting +// that one) +// - that after expungement, it doesn't appear in external DNS any more +// +#[live_test] +async fn test_nexus_add_remove(lc: &LiveTestContext) { + // Test setup + let log = lc.log(); + let opctx = lc.opctx(); + let datastore = lc.datastore(); + let planning_input = PlanningInputFromDb::assemble(&opctx, &datastore) + .await + .expect("planning input"); + let initial_nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); + let nexus = initial_nexus_clients.first().expect("internal Nexus client"); + + // First, deploy a new Nexus zone to an arbitrary sled. 
+ let sled_id = planning_input + .all_sled_ids(SledFilter::Commissioned) + .next() + .expect("any sled id"); + let (blueprint1, blueprint2) = blueprint_edit_current_target( + log, + &planning_input, + &nexus, + &|builder: &mut BlueprintBuilder| { + let nnexus = builder + .sled_num_running_zones_of_kind(sled_id, ZoneKind::Nexus); + let count = builder + .sled_ensure_zone_multiple_nexus(sled_id, nnexus + 1) + .context("adding Nexus zone")?; + assert_matches!( + count, + EnsureMultiple::Changed { added: 1, removed: 0 } + ); + Ok(()) + }, + ) + .await + .expect("editing blueprint to add zone"); + + // Figure out which zone is new and make a new client for it. + let diff = blueprint2.diff_since_blueprint(&blueprint1); + let new_zone = diff + .zones + .added + .values() + .next() + .expect("at least one sled with added zones") + .zones + .first() + .expect("at least one added zone on that sled"); + assert_eq!(new_zone.kind(), ZoneKind::Nexus); + let new_zone_addr = new_zone.underlay_address(); + let new_zone_sockaddr = + SocketAddrV6::new(new_zone_addr, NEXUS_INTERNAL_PORT, 0, 0); + let new_zone_client = lc.specific_internal_nexus_client(new_zone_sockaddr); + + // Wait for the new Nexus zone to show up and be usable. + let initial_sagas_list = wait_for_condition( + || async { + list_sagas(&new_zone_client).await.map_err(|e| { + debug!(log, + "waiting for new Nexus to be available: listing sagas: {e:#}" + ); + CondCheckError::<()>::NotYet + }) + }, + &Duration::from_millis(50), + &Duration::from_secs(60), + ) + .await + .expect("new Nexus to be usable"); + assert!(initial_sagas_list.is_empty()); + info!(log, "new Nexus is online"); + + // Create a demo saga from the new Nexus zone. We'll use this to test that + // when the zone is expunged, its saga gets moved to a different Nexus. 
+ let demo_saga = new_zone_client + .saga_demo_create() + .await + .expect("new Nexus saga demo create"); + let saga_id = demo_saga.saga_id; + let sagas_list = + list_sagas(&new_zone_client).await.expect("new Nexus sagas_list"); + assert_eq!(sagas_list.len(), 1); + assert_eq!(sagas_list[0].id, saga_id); + info!(log, "created demo saga"; "demo_saga" => ?demo_saga); + + // Now expunge the zone we just created. + let _ = blueprint_edit_current_target( + log, + &planning_input, + &nexus, + &|builder: &mut BlueprintBuilder| { + builder + .sled_expunge_zone(sled_id, new_zone.id()) + .context("expunging zone") + }, + ) + .await + .expect("editing blueprint to expunge zone"); + + // At some point, we should be unable to reach this Nexus any more. + wait_for_condition( + || async { + match new_zone_client.saga_list(None, None, None).await { + Err(nexus_client::Error::CommunicationError(error)) => { + info!(log, "expunged Nexus no longer reachable"; + "error" => slog_error_chain::InlineErrorChain::new(&error), + ); + Ok(()) + } + Ok(_) => { + debug!(log, "expunged Nexus is still reachable"); + Err(CondCheckError::<()>::NotYet) + } + Err(error) => { + debug!(log, "expunged Nexus is still reachable"; + "error" => slog_error_chain::InlineErrorChain::new(&error), + ); + Err(CondCheckError::NotYet) + } + } + }, + &Duration::from_millis(50), + &Duration::from_secs(60), + ) + .await + .unwrap(); + + // Wait for some other Nexus instance to pick up the saga. 
+ let nexus_found = wait_for_condition( + || async { + for nexus_client in &initial_nexus_clients { + assert!(nexus_client.baseurl() != new_zone_client.baseurl()); + let Ok(sagas) = list_sagas(&nexus_client).await else { + continue; + }; + + debug!(log, "found sagas (last): {:?}", sagas); + if sagas.into_iter().any(|s| s.id == saga_id) { + return Ok(nexus_client); + } + } + + return Err(CondCheckError::<()>::NotYet); + }, + &Duration::from_millis(50), + &Duration::from_secs(60), + ) + .await + .unwrap(); + + info!(log, "found saga in a different Nexus instance"; + "saga_id" => %saga_id, + "found_nexus" => nexus_found.baseurl(), + ); + assert!(nexus_found.baseurl() != new_zone_client.baseurl()); + + // Now, complete the demo saga on whichever instance is running it now. + // `saga_demo_complete` is not synchronous. It just unblocks the saga. + // We'll need to poll a bit to wait for it to finish. + nexus_found + .saga_demo_complete(&demo_saga.demo_saga_id) + .await + .expect("complete demo saga"); + let found = wait_for_condition( + || async { + let sagas = list_sagas(&nexus_found).await.expect("listing sagas"); + debug!(log, "found sagas (last): {:?}", sagas); + let found = sagas.into_iter().find(|s| s.id == saga_id).unwrap(); + if matches!(found.state, SagaState::Succeeded) { + Ok(found) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), + ) + .await + .unwrap(); + + assert_eq!(found.id, saga_id); + assert!(matches!(found.state, SagaState::Succeeded)); +} + +async fn list_sagas( + client: &nexus_client::Client, +) -> Result, anyhow::Error> { + client + .saga_list_stream(None, None) + .try_collect::>() + .await + .context("listing sagas") +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index d424e08b61..9e69800fed 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -27,6 +27,7 @@ use crate::db::{ 
     error::{public_error_from_diesel, ErrorHandler},
 };
 use ::oximeter::types::ProducerRegistry;
+use anyhow::{anyhow, bail, Context};
 use async_bb8_diesel::AsyncRunQueryDsl;
 use diesel::pg::Pg;
 use diesel::prelude::*;
@@ -207,7 +208,7 @@ impl DataStore {
     /// Constructs a new Datastore object.
     ///
-    /// Only returns if the database schema is compatible with Nexus's known
+    /// Only returns when the database schema is compatible with Nexus's known
     /// schema version.
     pub async fn new(
         log: &Logger,
@@ -241,6 +242,38 @@ impl DataStore {
         Ok(datastore)
     }
 
+    /// Constructs a new Datastore, failing if the schema version does not match
+    /// this program's expected version
+    pub async fn new_failfast(
+        log: &Logger,
+        pool: Arc<Pool>,
+    ) -> Result<Self, anyhow::Error> {
+        let datastore =
+            Self::new_unchecked(log.new(o!("component" => "datastore")), pool)
+                .map_err(|e| anyhow!("{}", e))?;
+        const EXPECTED_VERSION: SemverVersion = nexus_db_model::SCHEMA_VERSION;
+        let (found_version, found_target) = datastore
+            .database_schema_version()
+            .await
+            .context("loading database schema version")?;
+
+        if let Some(found_target) = found_target {
+            bail!(
+                "database schema check failed: apparently mid-upgrade \
+                (found_target = {found_target})"
+            );
+        }
+
+        if found_version != EXPECTED_VERSION {
+            bail!(
+                "database schema check failed: \
+                expected {EXPECTED_VERSION}, found {found_version}",
+            );
+        }
+
+        Ok(datastore)
+    }
+
     pub fn register_producers(&self, registry: &ProducerRegistry) {
         registry
             .register_producer(
diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs
index fc0e4638f8..a637241aaa 100644
--- a/nexus/reconfigurator/preparation/src/lib.rs
+++ b/nexus/reconfigurator/preparation/src/lib.rs
@@ -35,6 +35,7 @@ use omicron_common::address::IpRange;
 use omicron_common::address::Ipv6Subnet;
 use omicron_common::address::SLED_PREFIX;
 use omicron_common::api::external::Error;
+use omicron_common::api::external::InternalContext;
 use
omicron_common::api::external::LookupType;
 use omicron_common::disk::DiskIdentity;
 use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY;
@@ -72,6 +73,74 @@ pub struct PlanningInputFromDb<'a> {
 }
 
 impl PlanningInputFromDb<'_> {
+    pub async fn assemble(
+        opctx: &OpContext,
+        datastore: &DataStore,
+    ) -> Result<PlanningInput, Error> {
+        opctx.check_complex_operations_allowed()?;
+        let sled_rows = datastore
+            .sled_list_all_batched(opctx, SledFilter::Commissioned)
+            .await
+            .internal_context("fetching all sleds")?;
+        let zpool_rows = datastore
+            .zpool_list_all_external_batched(opctx)
+            .await
+            .internal_context("fetching all external zpool rows")?;
+        let ip_pool_range_rows = {
+            let (authz_service_ip_pool, _) = datastore
+                .ip_pools_service_lookup(opctx)
+                .await
+                .internal_context("fetching IP services pool")?;
+            datastore
+                .ip_pool_list_ranges_batched(opctx, &authz_service_ip_pool)
+                .await
+                .internal_context("listing services IP pool ranges")?
+        };
+        let external_ip_rows = datastore
+            .external_ip_list_service_all_batched(opctx)
+            .await
+            .internal_context("fetching service external IPs")?;
+        let service_nic_rows = datastore
+            .service_network_interfaces_all_list_batched(opctx)
+            .await
+            .internal_context("fetching service NICs")?;
+        let internal_dns_version = datastore
+            .dns_group_latest_version(opctx, DnsGroup::Internal)
+            .await
+            .internal_context("fetching internal DNS version")?
+            .version;
+        let external_dns_version = datastore
+            .dns_group_latest_version(opctx, DnsGroup::External)
+            .await
+            .internal_context("fetching external DNS version")?
+            .version;
+        let cockroachdb_settings = datastore
+            .cockroachdb_settings(opctx)
+            .await
+            .internal_context("fetching cockroachdb settings")?;
+
+        let planning_input = PlanningInputFromDb {
+            sled_rows: &sled_rows,
+            zpool_rows: &zpool_rows,
+            ip_pool_range_rows: &ip_pool_range_rows,
+            target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY,
+            target_nexus_zone_count: NEXUS_REDUNDANCY,
+            target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY,
+            target_cockroachdb_cluster_version:
+                CockroachDbClusterVersion::POLICY,
+            external_ip_rows: &external_ip_rows,
+            service_nic_rows: &service_nic_rows,
+            log: &opctx.log,
+            internal_dns_version,
+            external_dns_version,
+            cockroachdb_settings: &cockroachdb_settings,
+        }
+        .build()
+        .internal_context("assembling planning_input")?;
+
+        Ok(planning_input)
+    }
+
     pub fn build(&self) -> Result<PlanningInput, Error> {
         let service_ip_pool_ranges =
             self.ip_pool_range_rows.iter().map(IpRange::from).collect();
@@ -195,65 +264,8 @@ pub async fn reconfigurator_state_load(
     datastore: &DataStore,
 ) -> Result<UnstableReconfiguratorState, anyhow::Error> {
     opctx.check_complex_operations_allowed()?;
-    let sled_rows = datastore
-        .sled_list_all_batched(opctx, SledFilter::Commissioned)
-        .await
-        .context("listing sleds")?;
-    let zpool_rows = datastore
-        .zpool_list_all_external_batched(opctx)
-        .await
-        .context("listing zpools")?;
-    let ip_pool_range_rows = {
-        let (authz_service_ip_pool, _) = datastore
-            .ip_pools_service_lookup(opctx)
-            .await
-            .context("fetching IP services pool")?;
-        datastore
-            .ip_pool_list_ranges_batched(opctx, &authz_service_ip_pool)
-            .await
-            .context("listing services IP pool ranges")?
- }; - let external_ip_rows = datastore - .external_ip_list_service_all_batched(opctx) - .await - .context("fetching service external IPs")?; - let service_nic_rows = datastore - .service_network_interfaces_all_list_batched(opctx) - .await - .context("fetching service NICs")?; - let internal_dns_version = datastore - .dns_group_latest_version(opctx, DnsGroup::Internal) - .await - .context("fetching internal DNS version")? - .version; - let external_dns_version = datastore - .dns_group_latest_version(opctx, DnsGroup::External) - .await - .context("fetching external DNS version")? - .version; - let cockroachdb_settings = datastore - .cockroachdb_settings(opctx) - .await - .context("fetching cockroachdb settings")?; - - let planning_input = PlanningInputFromDb { - sled_rows: &sled_rows, - zpool_rows: &zpool_rows, - ip_pool_range_rows: &ip_pool_range_rows, - target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, - target_nexus_zone_count: NEXUS_REDUNDANCY, - target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, - target_cockroachdb_cluster_version: CockroachDbClusterVersion::POLICY, - external_ip_rows: &external_ip_rows, - service_nic_rows: &service_nic_rows, - log: &opctx.log, - internal_dns_version, - external_dns_version, - cockroachdb_settings: &cockroachdb_settings, - } - .build() - .context("assembling planning_input")?; - + let planning_input = + PlanningInputFromDb::assemble(opctx, datastore).await?; let collection_ids = datastore .inventory_collections() .await diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 50ae332d3f..79e7a93e6d 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -4,7 +4,6 @@ //! 
Configuration of the deployment system
 
-use nexus_db_model::DnsGroup;
 use nexus_db_queries::authz;
 use nexus_db_queries::context::OpContext;
 use nexus_reconfigurator_planning::planner::Planner;
@@ -13,9 +12,7 @@ use nexus_types::deployment::Blueprint;
 use nexus_types::deployment::BlueprintMetadata;
 use nexus_types::deployment::BlueprintTarget;
 use nexus_types::deployment::BlueprintTargetSet;
-use nexus_types::deployment::CockroachDbClusterVersion;
 use nexus_types::deployment::PlanningInput;
-use nexus_types::deployment::SledFilter;
 use nexus_types::inventory::Collection;
 use omicron_common::api::external::CreateResult;
 use omicron_common::api::external::DataPageParams;
@@ -25,9 +22,6 @@ use omicron_common::api::external::InternalContext;
 use omicron_common::api::external::ListResultVec;
 use omicron_common::api::external::LookupResult;
 use omicron_common::api::external::LookupType;
-use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY;
-use omicron_common::policy::COCKROACHDB_REDUNDANCY;
-use omicron_common::policy::NEXUS_REDUNDANCY;
 use slog_error_chain::InlineErrorChain;
 use uuid::Uuid;
@@ -132,61 +126,8 @@
     ) -> Result<PlanningContext, Error> {
         let creator = self.id.to_string();
         let datastore = self.datastore();
-
-        let sled_rows = datastore
-            .sled_list_all_batched(opctx, SledFilter::Commissioned)
-            .await?;
-        let zpool_rows =
-            datastore.zpool_list_all_external_batched(opctx).await?;
-        let ip_pool_range_rows = {
-            let (authz_service_ip_pool, _) =
-                datastore.ip_pools_service_lookup(opctx).await?;
-            datastore
-                .ip_pool_list_ranges_batched(opctx, &authz_service_ip_pool)
-                .await?
-        };
-        let external_ip_rows =
-            datastore.external_ip_list_service_all_batched(opctx).await?;
-        let service_nic_rows = datastore
-            .service_network_interfaces_all_list_batched(opctx)
-            .await?;
-
-        let internal_dns_version = datastore
-            .dns_group_latest_version(opctx, DnsGroup::Internal)
-            .await
-            .internal_context(
                "fetching internal DNS version for blueprint planning",
-            )?
- .version; - let external_dns_version = datastore - .dns_group_latest_version(opctx, DnsGroup::External) - .await - .internal_context( - "fetching external DNS version for blueprint planning", - )? - .version; - let cockroachdb_settings = - datastore.cockroachdb_settings(opctx).await.internal_context( - "fetching cockroachdb settings for blueprint planning", - )?; - - let planning_input = PlanningInputFromDb { - sled_rows: &sled_rows, - zpool_rows: &zpool_rows, - ip_pool_range_rows: &ip_pool_range_rows, - external_ip_rows: &external_ip_rows, - service_nic_rows: &service_nic_rows, - target_boundary_ntp_zone_count: BOUNDARY_NTP_REDUNDANCY, - target_nexus_zone_count: NEXUS_REDUNDANCY, - target_cockroachdb_zone_count: COCKROACHDB_REDUNDANCY, - target_cockroachdb_cluster_version: - CockroachDbClusterVersion::POLICY, - log: &opctx.log, - internal_dns_version, - external_dns_version, - cockroachdb_settings: &cockroachdb_settings, - } - .build()?; + let planning_input = + PlanningInputFromDb::assemble(opctx, datastore).await?; // The choice of which inventory collection to use here is not // necessarily trivial. Inventory collections may be incomplete due to