From 148d8d10d97ddca4ff7bbb249fd635b1545bcdec Mon Sep 17 00:00:00 2001 From: Augustus Mayo Date: Thu, 11 Jul 2024 17:07:03 -0500 Subject: [PATCH 01/27] Fix maghemite auto-updates (#5949) Running the maghemite update script creates changes in the package-manifest.toml file, but that file is not being tracked in the auto update script. * Commit package manifest during updates --- .github/workflows/update-maghemite.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/update-maghemite.yml b/.github/workflows/update-maghemite.yml index 7ced0adf5e..9065a54e04 100644 --- a/.github/workflows/update-maghemite.yml +++ b/.github/workflows/update-maghemite.yml @@ -55,7 +55,7 @@ jobs: run: | . ./tools/reflector/helpers.sh - PATHS=("tools/maghemite_ddm_openapi_version" "tools/maghemite_mg_openapi_version" "tools/maghemite_mgd_checksums") + PATHS=("tools/maghemite_ddm_openapi_version" "tools/maghemite_mg_openapi_version" "tools/maghemite_mgd_checksums" "package-manifest.toml") CHANGES=() commit $TARGET_BRANCH $INT_BRANCH ${{ inputs.reflector_user_id }} PATHS CHANGES From 87e166328df630a17bd9f6317ac1425d914ebf6f Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 11 Jul 2024 15:16:59 -0700 Subject: [PATCH 02/27] Update Rust crate serde_with to 3.8.3 (#6049) --- Cargo.lock | 10 +++++----- Cargo.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c99c8fc6d..4680813d1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4026,7 +4026,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -8644,9 +8644,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.8.1" +version = "3.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ad483d2ab0149d5a5ebcd9972a3852711e0153d863bf5a5d0391d28883c4a20" +checksum = "e73139bc5ec2d45e6c5fd85be5a46949c1c39a4c18e56915f5eb4c12f975e377" dependencies = [ "base64 0.22.1", "chrono", @@ -8662,9 +8662,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.8.1" +version = "3.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65569b702f41443e8bc8bbb1c5779bd0450bbe723b56198980e80ec45780bce2" +checksum = "b80d3d6b56b64335c0180e5ffde23b3c5e08c14c585b51a15bd0e95393f46703" dependencies = [ "darling", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 36da57b9c4..4eff3e13c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -442,7 +442,7 @@ serde_json = "1.0.117" serde_path_to_error = "0.1.16" serde_tokenstream = "0.2" serde_urlencoded = "0.7.1" -serde_with = "3.7.0" +serde_with = "3.8.3" sha2 = "0.10.8" sha3 = "0.10.8" shell-words = "1.1.0" From 468e699bacc2b5d0b76e4990e9edfba191049247 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Thu, 11 Jul 2024 15:17:26 -0700 Subject: [PATCH 03/27] Update Rust crate serde_json to 1.0.120 (#6031) --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4eff3e13c0..8a5bd29974 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -438,7 +438,7 @@ secrecy = "0.8.0" semver = { version = "1.0.23", features = ["std", "serde"] } serde = { version = "1.0", default-features = false, features = [ "derive", "rc" ] } 
serde_human_bytes = { git = "https://github.com/oxidecomputer/serde_human_bytes", branch = "main" } -serde_json = "1.0.117" +serde_json = "1.0.120" serde_path_to_error = "0.1.16" serde_tokenstream = "0.2" serde_urlencoded = "0.7.1" From 2cb9d5e8baf2ffd7f9b32a66a5bda35911c01dad Mon Sep 17 00:00:00 2001 From: Rain Date: Thu, 11 Jul 2024 16:25:26 -0700 Subject: [PATCH 04/27] [meta] update dropshot (#6056) Main change here is the error type for `ApiDescription::register` error changing to a more structured one. --- Cargo.lock | 4 ++-- cockroach-admin/src/http_entrypoints.rs | 5 ++++- gateway/src/http_entrypoints.rs | 3 ++- installinator-artifactd/src/http_entrypoints.rs | 8 ++++---- nexus/src/external_api/http_entrypoints.rs | 10 ++++++---- nexus/src/internal_api/http_entrypoints.rs | 5 ++++- sled-agent/src/bootstrap/http_entrypoints.rs | 3 ++- sled-agent/src/http_entrypoints.rs | 12 +++++++----- sled-agent/src/sim/http_entrypoints.rs | 6 ++++-- sled-agent/src/sim/http_entrypoints_pantry.rs | 7 ++++--- sled-agent/src/sim/http_entrypoints_storage.rs | 7 ++++--- wicketd/src/http_entrypoints.rs | 3 ++- 12 files changed, 45 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4680813d1d..6665f3da41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2028,7 +2028,7 @@ dependencies = [ [[package]] name = "dropshot" version = "0.10.2-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#6a3f84ca5fd8d0c5c010cfe837efbe6b5d117d9d" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#9fef3961c0b89aa8ab8e186dc0c89f8f4f811eea" dependencies = [ "async-stream", "async-trait", @@ -2074,7 +2074,7 @@ dependencies = [ [[package]] name = "dropshot_endpoint" version = "0.10.2-dev" -source = "git+https://github.com/oxidecomputer/dropshot?branch=main#6a3f84ca5fd8d0c5c010cfe837efbe6b5d117d9d" +source = "git+https://github.com/oxidecomputer/dropshot?branch=main#9fef3961c0b89aa8ab8e186dc0c89f8f4f811eea" dependencies = [ "heck 0.5.0", "proc-macro2", diff --git a/cockroach-admin/src/http_entrypoints.rs b/cockroach-admin/src/http_entrypoints.rs index 1c02d23ae2..45957df0df 100644 --- a/cockroach-admin/src/http_entrypoints.rs +++ b/cockroach-admin/src/http_entrypoints.rs @@ -6,6 +6,7 @@ use crate::cockroach_cli::NodeDecommission; use crate::cockroach_cli::NodeStatus; use crate::context::ServerContext; use dropshot::endpoint; +use dropshot::ApiDescriptionRegisterError; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::RequestContext; @@ -19,7 +20,9 @@ use std::sync::Arc; type CrdbApiDescription = dropshot::ApiDescription>; pub fn api() -> CrdbApiDescription { - fn register_endpoints(api: &mut CrdbApiDescription) -> Result<(), String> { + fn register_endpoints( + api: &mut CrdbApiDescription, + ) -> Result<(), ApiDescriptionRegisterError> { api.register(local_node_id)?; api.register(node_status)?; api.register(node_decommission)?; diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 7e1c8a991e..fa91bcebf5 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -17,6 +17,7 @@ use crate::ServerContext; use base64::Engine; use dropshot::endpoint; use dropshot::ApiDescription; +use dropshot::ApiDescriptionRegisterError; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::HttpResponseUpdatedNoContent; @@ -1677,7 +1678,7 @@ type GatewayApiDescription = ApiDescription>; pub fn api() -> GatewayApiDescription { fn register_endpoints( api: &mut GatewayApiDescription, - ) -> Result<(), 
String> { + ) -> Result<(), ApiDescriptionRegisterError> { api.register(sp_get)?; api.register(sp_startup_options_get)?; api.register(sp_startup_options_set)?; diff --git a/installinator-artifactd/src/http_entrypoints.rs b/installinator-artifactd/src/http_entrypoints.rs index 8360fc9e35..13163e007b 100644 --- a/installinator-artifactd/src/http_entrypoints.rs +++ b/installinator-artifactd/src/http_entrypoints.rs @@ -5,9 +5,9 @@ // Copyright 2022 Oxide Computer Company use dropshot::{ - endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseHeaders, - HttpResponseOk, HttpResponseUpdatedNoContent, Path, RequestContext, - TypedBody, + endpoint, ApiDescription, ApiDescriptionRegisterError, FreeformBody, + HttpError, HttpResponseHeaders, HttpResponseOk, + HttpResponseUpdatedNoContent, Path, RequestContext, TypedBody, }; use hyper::{header, Body, StatusCode}; use installinator_common::EventReport; @@ -24,7 +24,7 @@ type ArtifactServerApiDesc = ApiDescription; pub fn api() -> ArtifactServerApiDesc { fn register_endpoints( api: &mut ArtifactServerApiDesc, - ) -> Result<(), String> { + ) -> Result<(), ApiDescriptionRegisterError> { api.register(get_artifact_by_hash)?; api.register(report_progress)?; Ok(()) diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 1e11004191..1f185ae820 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -14,7 +14,6 @@ use super::{ }, }; use crate::{context::ApiContext, external_api::shared}; -use dropshot::HttpError; use dropshot::HttpResponseAccepted; use dropshot::HttpResponseCreated; use dropshot::HttpResponseDeleted; @@ -32,6 +31,7 @@ use dropshot::{ channel, endpoint, WebsocketChannelResult, WebsocketConnection, }; use dropshot::{ApiDescription, StreamingBody}; +use dropshot::{ApiDescriptionRegisterError, HttpError}; use dropshot::{ApiEndpoint, EmptyScanParams}; use ipnetwork::IpNetwork; use nexus_db_queries::db; @@ -100,7 +100,9 @@ type NexusApiDescription = ApiDescription; /// Returns a description of the external nexus API pub(crate) fn external_api() -> NexusApiDescription { - fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> { + fn register_endpoints( + api: &mut NexusApiDescription, + ) -> Result<(), ApiDescriptionRegisterError> { api.register(ping)?; api.register(system_policy_view)?; @@ -368,7 +370,7 @@ pub(crate) fn external_api() -> NexusApiDescription { fn register_experimental( api: &mut NexusApiDescription, endpoint: T, - ) -> Result<(), String> + ) -> Result<(), ApiDescriptionRegisterError> where T: Into>, { @@ -381,7 +383,7 @@ pub(crate) fn external_api() -> NexusApiDescription { fn register_experimental_endpoints( api: &mut NexusApiDescription, - ) -> Result<(), String> { + ) -> Result<(), ApiDescriptionRegisterError> { register_experimental(api, probe_list)?; register_experimental(api, probe_view)?; register_experimental(api, probe_create)?; diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 582f7cb608..8e7b39c111 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -8,6 +8,7 @@ use super::params::{OximeterInfo, RackInitializationRequest}; use crate::context::ApiContext; use dropshot::endpoint; use dropshot::ApiDescription; +use dropshot::ApiDescriptionRegisterError; use dropshot::FreeformBody; use dropshot::HttpError; use dropshot::HttpResponseCreated; @@ -68,7 +69,9 @@ type NexusApiDescription = 
ApiDescription; /// Returns a description of the internal nexus API pub(crate) fn internal_api() -> NexusApiDescription { - fn register_endpoints(api: &mut NexusApiDescription) -> Result<(), String> { + fn register_endpoints( + api: &mut NexusApiDescription, + ) -> Result<(), ApiDescriptionRegisterError> { api.register(sled_agent_get)?; api.register(sled_agent_put)?; api.register(sled_firewall_rules_request)?; diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index 68098f431e..2fa0b83f1d 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -15,6 +15,7 @@ use crate::bootstrap::rack_ops::{RackInitId, RackResetId}; use crate::updates::ConfigUpdates; use crate::updates::{Component, UpdateManager}; use bootstore::schemes::v0 as bootstore; +use dropshot::ApiDescriptionRegisterError; use dropshot::{ endpoint, ApiDescription, HttpError, HttpResponseOk, HttpResponseUpdatedNoContent, RequestContext, TypedBody, @@ -63,7 +64,7 @@ type BootstrapApiDescription = ApiDescription; pub(crate) fn api() -> BootstrapApiDescription { fn register_endpoints( api: &mut BootstrapApiDescription, - ) -> Result<(), String> { + ) -> Result<(), ApiDescriptionRegisterError> { api.register(baseboard_get)?; api.register(components_get)?; api.register(rack_initialization_status)?; diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 2d41e2860a..a21c278699 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -20,10 +20,10 @@ use bootstore::schemes::v0::NetworkConfig; use camino::Utf8PathBuf; use display_error_chain::DisplayErrorChain; use dropshot::{ - endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseCreated, - HttpResponseDeleted, HttpResponseHeaders, HttpResponseOk, - HttpResponseUpdatedNoContent, Path, Query, RequestContext, StreamingBody, - TypedBody, + endpoint, ApiDescription, ApiDescriptionRegisterError, FreeformBody, + HttpError, HttpResponseCreated, HttpResponseDeleted, HttpResponseHeaders, + HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, + StreamingBody, TypedBody, }; use illumos_utils::opte::params::VirtualNetworkInterfaceHost; use installinator_common::M2Slot; @@ -46,7 +46,9 @@ type SledApiDescription = ApiDescription; /// Returns a description of the sled agent API pub fn api() -> SledApiDescription { - fn register_endpoints(api: &mut SledApiDescription) -> Result<(), String> { + fn register_endpoints( + api: &mut SledApiDescription, + ) -> Result<(), ApiDescriptionRegisterError> { api.register(disk_put)?; api.register(cockroachdb_init)?; api.register(instance_issue_disk_snapshot_request)?; diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index cfafaeea22..78d48be0ff 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -12,7 +12,6 @@ use crate::params::{ InstancePutStateResponse, InstanceUnregisterResponse, Inventory, OmicronPhysicalDisksConfig, OmicronZonesConfig, VpcFirewallRulesEnsureBody, }; -use dropshot::endpoint; use dropshot::ApiDescription; use dropshot::HttpError; use dropshot::HttpResponseOk; @@ -20,6 +19,7 @@ use dropshot::HttpResponseUpdatedNoContent; use dropshot::Path; use dropshot::RequestContext; use dropshot::TypedBody; +use dropshot::{endpoint, ApiDescriptionRegisterError}; use illumos_utils::opte::params::VirtualNetworkInterfaceHost; use 
omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; @@ -40,7 +40,9 @@ type SledApiDescription = ApiDescription>; /// Returns a description of the sled agent API pub fn api() -> SledApiDescription { - fn register_endpoints(api: &mut SledApiDescription) -> Result<(), String> { + fn register_endpoints( + api: &mut SledApiDescription, + ) -> Result<(), ApiDescriptionRegisterError> { api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; api.register(instance_get_state)?; diff --git a/sled-agent/src/sim/http_entrypoints_pantry.rs b/sled-agent/src/sim/http_entrypoints_pantry.rs index 13882deabc..a93cb6fca9 100644 --- a/sled-agent/src/sim/http_entrypoints_pantry.rs +++ b/sled-agent/src/sim/http_entrypoints_pantry.rs @@ -5,8 +5,9 @@ //! HTTP entrypoint functions for simulating the crucible pantry API. use dropshot::{ - endpoint, ApiDescription, HttpError, HttpResponseDeleted, HttpResponseOk, - HttpResponseUpdatedNoContent, Path as TypedPath, RequestContext, TypedBody, + endpoint, ApiDescription, ApiDescriptionRegisterError, HttpError, + HttpResponseDeleted, HttpResponseOk, HttpResponseUpdatedNoContent, + Path as TypedPath, RequestContext, TypedBody, }; use propolis_client::types::VolumeConstructionRequest; use schemars::JsonSchema; @@ -21,7 +22,7 @@ type CruciblePantryApiDescription = ApiDescription>; pub fn api() -> CruciblePantryApiDescription { fn register_endpoints( api: &mut CruciblePantryApiDescription, - ) -> Result<(), String> { + ) -> Result<(), ApiDescriptionRegisterError> { api.register(attach)?; api.register(is_job_finished)?; api.register(job_result_ok)?; diff --git a/sled-agent/src/sim/http_entrypoints_storage.rs b/sled-agent/src/sim/http_entrypoints_storage.rs index dcc449b61a..99817755bf 100644 --- a/sled-agent/src/sim/http_entrypoints_storage.rs +++ b/sled-agent/src/sim/http_entrypoints_storage.rs @@ -9,8 +9,9 @@ use crucible_agent_client::types::{ Snapshot, }; use dropshot::{ - endpoint, ApiDescription, HttpError, HttpResponseDeleted, HttpResponseOk, - Path as TypedPath, RequestContext, TypedBody, + endpoint, ApiDescription, ApiDescriptionRegisterError, HttpError, + HttpResponseDeleted, HttpResponseOk, Path as TypedPath, RequestContext, + TypedBody, }; use schemars::JsonSchema; use serde::Deserialize; @@ -24,7 +25,7 @@ type CrucibleAgentApiDescription = ApiDescription>; pub fn api() -> CrucibleAgentApiDescription { fn register_endpoints( api: &mut CrucibleAgentApiDescription, - ) -> Result<(), String> { + ) -> Result<(), ApiDescriptionRegisterError> { api.register(region_list)?; api.register(region_create)?; api.register(region_get)?; diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs index 001974e085..4a4374b312 100644 --- a/wicketd/src/http_entrypoints.rs +++ b/wicketd/src/http_entrypoints.rs @@ -18,6 +18,7 @@ use bootstrap_agent_client::types::RackOperationStatus; use bootstrap_agent_client::types::RackResetId; use dropshot::endpoint; use dropshot::ApiDescription; +use dropshot::ApiDescriptionRegisterError; use dropshot::HttpError; use dropshot::HttpResponseOk; use dropshot::HttpResponseUpdatedNoContent; @@ -59,7 +60,7 @@ type WicketdApiDescription = ApiDescription; pub fn api() -> WicketdApiDescription { fn register_endpoints( api: &mut WicketdApiDescription, - ) -> Result<(), String> { + ) -> Result<(), ApiDescriptionRegisterError> { api.register(get_bootstrap_sleds)?; api.register(get_rss_config)?; api.register(put_rss_config)?; From 
0695368e62f7b6af662a1d049bfd51713fdebc4b Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Thu, 11 Jul 2024 17:44:52 -0700 Subject: [PATCH 05/27] bump maghemite (#6058) --- Cargo.lock | 4 ++-- Cargo.toml | 4 ++-- package-manifest.toml | 8 ++++---- tools/maghemite_ddm_openapi_version | 2 +- tools/maghemite_mg_openapi_version | 2 +- tools/maghemite_mgd_checksums | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6665f3da41..8b074da20c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1614,7 +1614,7 @@ dependencies = [ [[package]] name = "ddm-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=d1686c86f92ead77e07ddc6024837dee4a401d6d#d1686c86f92ead77e07ddc6024837dee4a401d6d" +source = "git+https://github.com/oxidecomputer/maghemite?rev=1b385990e8648b221fd11f018f2a7ec425461c6c#1b385990e8648b221fd11f018f2a7ec425461c6c" dependencies = [ "oxnet", "percent-encoding", @@ -4313,7 +4313,7 @@ dependencies = [ [[package]] name = "mg-admin-client" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/maghemite?rev=d1686c86f92ead77e07ddc6024837dee4a401d6d#d1686c86f92ead77e07ddc6024837dee4a401d6d" +source = "git+https://github.com/oxidecomputer/maghemite?rev=1b385990e8648b221fd11f018f2a7ec425461c6c#1b385990e8648b221fd11f018f2a7ec425461c6c" dependencies = [ "anyhow", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 8a5bd29974..04202a3b77 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -333,8 +333,8 @@ macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" mockall = "0.12" newtype_derive = "0.1.6" -mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "d1686c86f92ead77e07ddc6024837dee4a401d6d" } -ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "d1686c86f92ead77e07ddc6024837dee4a401d6d" } +mg-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "1b385990e8648b221fd11f018f2a7ec425461c6c" } +ddm-admin-client = { git = "https://github.com/oxidecomputer/maghemite", rev = "1b385990e8648b221fd11f018f2a7ec425461c6c" } multimap = "0.10.0" nexus-auth = { path = "nexus/auth" } nexus-client = { path = "clients/nexus-client" } diff --git a/package-manifest.toml b/package-manifest.toml index 7f8154b8af..561a61ec4c 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -563,7 +563,7 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "d1686c86f92ead77e07ddc6024837dee4a401d6d" +source.commit = "1b385990e8648b221fd11f018f2a7ec425461c6c" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm-gz.sha256.txt source.sha256 = "280bd6e5c30d8f1076bac9b8dbbdbc45379e76259aa6319da257192fcbf64a54" @@ -579,7 +579,7 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). 
-source.commit = "d1686c86f92ead77e07ddc6024837dee4a401d6d" +source.commit = "1b385990e8648b221fd11f018f2a7ec425461c6c" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mg-ddm.sha256.txt source.sha256 = "f15f8bb0e13b1a9372c895775dae96b68ff1cc5e395e6bad4389c2a97957354e" @@ -594,10 +594,10 @@ source.repo = "maghemite" # `tools/maghemite_openapi_version`. Failing to do so will cause a failure when # building `ddm-admin-client` (which will instruct you to update # `tools/maghemite_openapi_version`). -source.commit = "d1686c86f92ead77e07ddc6024837dee4a401d6d" +source.commit = "1b385990e8648b221fd11f018f2a7ec425461c6c" # The SHA256 digest is automatically posted to: # https://buildomat.eng.oxide.computer/public/file/oxidecomputer/maghemite/image//mgd.sha256.txt -source.sha256 = "b0223e0aad4c22bf980da17084caf6704d0428bad1b3e5daf54e7d415ce82d3e" +source.sha256 = "d4f2aaca20b312b6716206c335165442d6625b929eb00f0fd23f551e38216ace" output.type = "zone" output.intermediate_only = true diff --git a/tools/maghemite_ddm_openapi_version b/tools/maghemite_ddm_openapi_version index 980081379e..f5848af24f 100644 --- a/tools/maghemite_ddm_openapi_version +++ b/tools/maghemite_ddm_openapi_version @@ -1,2 +1,2 @@ -COMMIT="d1686c86f92ead77e07ddc6024837dee4a401d6d" +COMMIT="1b385990e8648b221fd11f018f2a7ec425461c6c" SHA2="007bfb717ccbc077c0250dee3121aeb0c5bb0d1c16795429a514fa4f8635a5ef" diff --git a/tools/maghemite_mg_openapi_version b/tools/maghemite_mg_openapi_version index a0f7ac10ec..a5f027ec9f 100644 --- a/tools/maghemite_mg_openapi_version +++ b/tools/maghemite_mg_openapi_version @@ -1,2 +1,2 @@ -COMMIT="d1686c86f92ead77e07ddc6024837dee4a401d6d" +COMMIT="1b385990e8648b221fd11f018f2a7ec425461c6c" SHA2="e4b42ab9daad90f0c561a830b62a9d17e294b4d0da0a6d44b4030929b0c37b7e" diff --git a/tools/maghemite_mgd_checksums b/tools/maghemite_mgd_checksums index 2e7d8a863c..9c4f94c6cd 100644 --- a/tools/maghemite_mgd_checksums +++ b/tools/maghemite_mgd_checksums @@ -1,2 +1,2 @@ -CIDL_SHA256="b0223e0aad4c22bf980da17084caf6704d0428bad1b3e5daf54e7d415ce82d3e" -MGD_LINUX_SHA256="776f18e9e7fc905d5a2f33d1a1bdd8863ed988bb2965a222217ec06790a3f452" \ No newline at end of file +CIDL_SHA256="d4f2aaca20b312b6716206c335165442d6625b929eb00f0fd23f551e38216ace" +MGD_LINUX_SHA256="2a9484345e6cba6587f71c1ee75048e2ee45a18a6628a7d88ccb9b9fb7b07faf" \ No newline at end of file From 32f353d2104cc4c60943da1f527c0d2c01460011 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 04:20:38 +0000 Subject: [PATCH 06/27] Update taiki-e/install-action digest to 0256b3e (#6060) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`ef2fb5a` -> `0256b3e`](https://togithub.com/taiki-e/install-action/compare/ef2fb5a...0256b3e) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. â™» **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. 
--- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 7cd3b69c58..a9beb49ed5 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@ef2fb5af7d19da8885ee368c6bde2ae6d0758e3d # v2 + uses: taiki-e/install-action@0256b3ea9ae3d751755a35cbb0608979a842f1d2 # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From bedb2382b0b5f473a07b5ff29d16720aede7c4b6 Mon Sep 17 00:00:00 2001 From: Rain Date: Fri, 12 Jul 2024 10:37:09 -0700 Subject: [PATCH 07/27] [nexus] switch nexus internal API to a trait, add scaffolding to manage it (#5844) Part of the work on [RFD 479 Dropshot API traits](https://rfd.shared.oxide.computer/rfd/0479). As a first, impactful example, this PR switches over the Nexus internal API to being defined by a trait. Some minor refactoring was required but overall it took me around half an hour to do this. Included also is a tool, `cargo xtask openapi`, which is used to manage this and future documents, as well as instructions that folks can hopefully follow. --- Cargo.lock | 43 + Cargo.toml | 7 + README.adoc | 25 +- dev-tools/openapi-manager/Cargo.toml | 25 + dev-tools/openapi-manager/README.adoc | 103 + dev-tools/openapi-manager/src/check.rs | 185 ++ dev-tools/openapi-manager/src/dispatch.rs | 99 + dev-tools/openapi-manager/src/generate.rs | 128 ++ dev-tools/openapi-manager/src/lib.rs | 21 + dev-tools/openapi-manager/src/list.rs | 127 ++ dev-tools/openapi-manager/src/main.rs | 13 + dev-tools/openapi-manager/src/output.rs | 253 +++ dev-tools/openapi-manager/src/spec.rs | 260 +++ dev-tools/xtask/src/main.rs | 6 + nexus/Cargo.toml | 1 + nexus/db-model/src/external_ip.rs | 23 + nexus/db-model/src/ipv4_nat_entry.rs | 18 +- .../src/db/datastore/ipv4_nat_entry.rs | 2 +- nexus/db-queries/src/db/datastore/mod.rs | 1 - nexus/db-queries/src/db/datastore/probe.rs | 35 +- nexus/internal-api/Cargo.toml | 18 + nexus/internal-api/src/lib.rs | 591 ++++++ nexus/src/app/probe.rs | 2 +- nexus/src/bin/nexus.rs | 12 - nexus/src/external_api/http_entrypoints.rs | 6 +- nexus/src/internal_api/http_entrypoints.rs | 1721 +++++++---------- nexus/src/lib.rs | 10 - nexus/tests/integration_tests/commands.rs | 16 - nexus/tests/integration_tests/probe.rs | 3 +- nexus/tests/output/cmd-nexus-noargs-stderr | 5 +- nexus/types/src/external_api/shared.rs | 26 + nexus/types/src/internal_api/views.rs | 17 + openapi/nexus-internal.json | 24 +- openapi/nexus.json | 18 +- workspace-hack/Cargo.toml | 10 +- 35 files changed, 2733 insertions(+), 1121 deletions(-) create mode 100644 dev-tools/openapi-manager/Cargo.toml create mode 100644 dev-tools/openapi-manager/README.adoc create mode 100644 dev-tools/openapi-manager/src/check.rs create mode 100644 dev-tools/openapi-manager/src/dispatch.rs create mode 100644 dev-tools/openapi-manager/src/generate.rs create mode 100644 dev-tools/openapi-manager/src/lib.rs create mode 100644 dev-tools/openapi-manager/src/list.rs create mode 100644 dev-tools/openapi-manager/src/main.rs create mode 100644 dev-tools/openapi-manager/src/output.rs create mode 100644 
dev-tools/openapi-manager/src/spec.rs create mode 100644 nexus/internal-api/Cargo.toml create mode 100644 nexus/internal-api/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8b074da20c..84669a13e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3531,6 +3531,12 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "indent_write" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cfe9645a18782869361d9c8732246be7b410ad4e919d3609ebabdac00ba12c3" + [[package]] name = "indexmap" version = "1.9.3" @@ -4720,6 +4726,20 @@ dependencies = [ "serde_json", ] +[[package]] +name = "nexus-internal-api" +version = "0.1.0" +dependencies = [ + "dropshot", + "nexus-types", + "omicron-common", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "schemars", + "serde", + "uuid", +] + [[package]] name = "nexus-inventory" version = "0.1.0" @@ -5522,6 +5542,7 @@ dependencies = [ "nexus-db-model", "nexus-db-queries", "nexus-defaults", + "nexus-internal-api", "nexus-inventory", "nexus-metrics-producer-gc", "nexus-networking", @@ -5908,6 +5929,7 @@ dependencies = [ "bit-vec", "bitflags 1.3.2", "bitflags 2.5.0", + "bstr 0.2.17", "bstr 1.9.1", "byteorder", "bytes", @@ -6076,6 +6098,27 @@ dependencies = [ "regex", ] +[[package]] +name = "openapi-manager" +version = "0.1.0" +dependencies = [ + "anyhow", + "atomicwrites", + "camino", + "clap", + "dropshot", + "fs-err", + "indent_write", + "nexus-internal-api", + "omicron-workspace-hack", + "openapi-lint", + "openapiv3", + "owo-colors", + "serde_json", + "similar", + "supports-color", +] + [[package]] name = "openapiv3" version = "2.0.0" diff --git a/Cargo.toml b/Cargo.toml index 04202a3b77..e5783b39eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ members = [ "dev-tools/crdb-seed", "dev-tools/omdb", "dev-tools/omicron-dev", + "dev-tools/openapi-manager", "dev-tools/oxlog", "dev-tools/reconfigurator-cli", "dev-tools/releng", @@ -46,6 +47,7 @@ members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/internal-api", "nexus/inventory", "nexus/macros-common", "nexus/metrics-producer-gc", @@ -109,6 +111,7 @@ default-members = [ "dev-tools/crdb-seed", "dev-tools/omdb", "dev-tools/omicron-dev", + "dev-tools/openapi-manager", "dev-tools/oxlog", "dev-tools/reconfigurator-cli", "dev-tools/releng", @@ -138,6 +141,7 @@ default-members = [ "nexus/db-model", "nexus/db-queries", "nexus/defaults", + "nexus/internal-api", "nexus/inventory", "nexus/macros-common", "nexus/metrics-producer-gc", @@ -310,6 +314,7 @@ hyper = "0.14" hyper-rustls = "0.26.0" hyper-staticfile = "0.9.5" illumos-utils = { path = "illumos-utils" } +indent_write = "2.2.0" indexmap = "2.2.6" indicatif = { version = "0.17.8", features = ["rayon"] } installinator = { path = "installinator" } @@ -344,6 +349,7 @@ nexus-db-model = { path = "nexus/db-model" } nexus-db-queries = { path = "nexus/db-queries" } nexus-defaults = { path = "nexus/defaults" } nexus-inventory = { path = "nexus/inventory" } +nexus-internal-api = { path = "nexus/internal-api" } nexus-macros-common = { path = "nexus/macros-common" } nexus-metrics-producer-gc = { path = "nexus/metrics-producer-gc" } nexus-networking = { path = "nexus/networking" } @@ -449,6 +455,7 @@ shell-words = "1.1.0" signal-hook = "0.3" signal-hook-tokio = { version = "0.3", features = [ "futures-v0_3" ] } sigpipe = "0.1.3" +similar = { version = "2.5.0", features = ["bytes"] } similar-asserts = "1.5.0" # Don't change sled's version on accident; sled's on-disk format is not yet # stable and requires 
manual migrations. In the limit this won't matter because diff --git a/README.adoc b/README.adoc index f0e3a88343..1ef4bd8601 100644 --- a/README.adoc +++ b/README.adoc @@ -181,16 +181,23 @@ By default, Cargo does not operate on the tests. Cargo's check/build/clippy com Each service is a Dropshot server that presents an HTTP API. The description of that API is serialized as an https://github.com/OAI/OpenAPI-Specification[OpenAPI] document which we store -in link:./openapi[`omicron/openapi`] and check in to this repo. In order to -ensure that changes to those APIs are made intentionally, each service contains -a test that validates that the current API matches. This allows us 1. to catch -accidental changes as test failures and 2. to explicitly observe API changes -during code review (and in the git history). +in link:./openapi[`omicron/openapi`] and check in to this repo. Checking in +these generated files allows us: + +. To catch accidental changes as test failures. +. To explicitly observe API changes during code review (and in the git history). We also use these OpenAPI documents as the source for the clients we generate using https://github.com/oxidecomputer/progenitor[Progenitor]. Clients are automatically updated when the coresponding OpenAPI document is modified. +There are currently two kinds of services based on how their corresponding documents are generated: *managed* and *unmanaged*. Eventually, all services within Omicron will transition to being managed. + +* A *managed* service is tracked by the `cargo xtask openapi` command, using Dropshot's relatively new API trait functionality. +* An *unmanaged* service is defined the traditional way, by gluing together a set of implementation functions, and is tracked by an independent test. + +To check whether your document is managed, run `cargo xtask openapi list`: it will list out all managed OpenAPI documents. If your document is not on the list, it is unmanaged. + Note that Omicron contains a nominally circular dependency: * Nexus depends on the Sled Agent client @@ -201,7 +208,13 @@ Note that Omicron contains a nominally circular dependency: We effectively "break" this circular dependency by virtue of the OpenAPI documents being checked in. -In general, changes any service API **require the following set of build steps**: +==== Updating Managed Services + +See the documentation in link:./dev-tools/openapi-manager[`dev-tools/openapi-manager`] for more information. + +==== Updating Unmanaged Services + +In general, changes to unmanaged service APIs **require the following set of build steps**: . Make changes to the service API. . 
Update the OpenAPI document by running the relevant test with overwrite set: diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml new file mode 100644 index 0000000000..b50aeec69f --- /dev/null +++ b/dev-tools/openapi-manager/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "openapi-manager" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +anyhow.workspace = true +atomicwrites.workspace = true +camino.workspace = true +clap.workspace = true +dropshot.workspace = true +fs-err.workspace = true +indent_write.workspace = true +nexus-internal-api.workspace = true +omicron-workspace-hack.workspace = true +openapiv3.workspace = true +openapi-lint.workspace = true +owo-colors.workspace = true +serde_json.workspace = true +similar.workspace = true +supports-color.workspace = true diff --git a/dev-tools/openapi-manager/README.adoc b/dev-tools/openapi-manager/README.adoc new file mode 100644 index 0000000000..1aadaa2c0c --- /dev/null +++ b/dev-tools/openapi-manager/README.adoc @@ -0,0 +1,103 @@ += OpenAPI manager + +This tool manages the OpenAPI documents (JSON files) checked into Omicron's `openapi` directory, using Dropshot's support for *API traits*. + +NOTE: For more information about API traits, see https://rfd.shared.oxide.computer/rfd/0479[RFD 479]. + +Currently, a subset of OpenAPI documents is managed by this tool. Eventually, all of the OpenAPI documents in Omicron will be managed by this tool; work to make that happen is ongoing. + +To check whether your document is managed, run `cargo xtask openapi list`: it will list out all managed OpenAPI documents. If your document is not on the list, it is unmanaged. + +== Basic usage + +The OpenAPI manager is meant to be invoked via `cargo xtask openapi`. Currently, three commands are provided: + +* `cargo xtask openapi list`: List information about currently-managed documents. +* `cargo xtask openapi check`: Check that all of the managed documents are up-to-date. +* `cargo xtask openapi generate`: Update and generate OpenAPI documents. + +There is also a test which makes sure that all managed documents are up-to-date, and tells you to run `cargo xtask openapi generate` if they aren't. + +=== API crates [[api_crates]] + +The OpenAPI manager has dependencies on a set of *API crates*. An API crate is a Rust library that consists of the API trait, and possibly supporting types. Each OpenAPI document should have a separate API crate. + +To keep compile times down, ensure that the API crate has as few dependencies as possible. In particular, *strongly avoid any dependencies on Diesel or other database logic*. + +The ideal set of dependencies is: + +* Common crates within omicron: `omicron-common`, perhaps `omicron-uuid-kinds` if typed UUIDs are in use, and a `types` crate for your service. +* Core external crates: `dropshot`, `serde`, `schemars`, and `uuid`. + +For an archetypal way to organize code, see the dependency graph in https://rfd.shared.oxide.computer/rfd/0479#functions_vs_traits[RFD 479's _Choosing between functions and traits_]. + +== Managing OpenAPI documents + +For OpenAPI documents to be managed by this tool, the corresponding interfaces must be defined via *API traits* rather than traditional Dropshot function-based servers. + +TIP: For examples within Omicron, search the repo for `dropshot::api_description`. + +=== Adding new documents + +If you're defining a new service fronted by OpenAPI, first create an API crate (see <> above). + +. 
Add the API crate to the workspace's `Cargo.toml`: `members` and `default-members`, and a reference in `[workspace.dependencies]`. +. Following the example in https://rfd.shared.oxide.computer/rfd/0479#guide_trait_definition[RFD 479's _Trait definition_], define the API trait. + +In the implementation crate: + +. Add a dependency on the API crate. +. Following the example in https://rfd.shared.oxide.computer/rfd/0479#guide_api_implementation[RFD 479's _API implementation_], provide an implementation of the trait. + +Once the API crate is defined, perform the steps in <> below. + +=== Converting existing documents + +Existing, unmanaged documents are generated via *function-based servers*: a set of functions that some code combines into a Dropshot `ApiDescription`. (There is also likely an expectorate test which ensures that the document is up-to-date.) + +The first step is to convert the function-based server into an API trait. To do so, create an API crate (see <> above). + +. Add the API crate to the workspace's `Cargo.toml`: `members` and `default-members`, and a reference in `[workspace.dependencies]`. +. Follow the instructions in https://rfd.shared.oxide.computer/rfd/0479#guide_converting_functions_to_traits[RFD 479's _Converting functions to API traits_] for the API crate. + +In the implementation crate: + +. Continue following the instructions in https://rfd.shared.oxide.computer/rfd/0479#guide_converting_functions_to_traits[RFD 479's _Converting functions to API traits_] for where the endpoint functions are currently defined. +. Find the test which currently manages the document (try searching the repo for `openapi_lint::validate`). If it performs any checks on the document beyond `openapi_lint::validate` or `openapi_lint::validate_external`, see <>. + +Next, perform the steps in <> below. + +Finally, remove: + +. The test which used to manage the document. The OpenAPI manager includes a test that will automatically run in CI. +. The binary subcommand (typically called `openapi`) that generated the OpenAPI document. The test was the only practical use of this subcommand. + +=== Adding the API crate to the manager [[add_to_manager]] + +Once the API crate is defined, inform the OpenAPI manager of its existence. Within this directory: + +. In `Cargo.toml`, add a dependency on the API crate. +. In `src/spec.rs`, add the crate to the `all_apis` function. (Please keep the list sorted by filename.) + +To ensure everything works well, run `cargo xtask openapi generate`. + +* Your OpenAPI document should be generated on disk and listed in the output. +* If you're converting an existing API, the only changes should be the ones you might have introduced as part of the refactor. If there are significant changes, something's gone wrong--maybe you missed an endpoint? + +==== Performing extra validation [[extra_validation]] + +By default, the OpenAPI manager does basic validation on the generated document. Some documents require extra validation steps. + +It's best to put extra validation next to the trait, within the API crate. + +. In the API crate, add dependencies on `anyhow` and `openapiv3`. +. Define a function with signature `fn extra_validation(openapi: &openapiv3::OpenAPI) -> anyhow::Result<()>` which performs the extra validation steps. +. In `all_apis`, set the `extra_validation` field to this function. + +== Design notes + +The OpenAPI manager uses the new support for Dropshot API traits described in https://rfd.shared.oxide.computer/rfd/0479[RFD 479]. 
+ +With traditional function-based Dropshot servers, generating the OpenAPI document requires the implementation to be compiled. With API traits, that is no longer necessary. The OpenAPI manager leverages this to provide a fast and easy way to regenerate API documents. + +This does mean that the OpenAPI manager requires the use of API traits, and that eventually all of Omicron's Dropshot APIs should be switched over to traits. diff --git a/dev-tools/openapi-manager/src/check.rs b/dev-tools/openapi-manager/src/check.rs new file mode 100644 index 0000000000..182ed9fb19 --- /dev/null +++ b/dev-tools/openapi-manager/src/check.rs @@ -0,0 +1,185 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::{io::Write, process::ExitCode}; + +use anyhow::Result; +use camino::Utf8Path; +use indent_write::io::IndentWriter; +use owo_colors::OwoColorize; +use similar::TextDiff; + +use crate::{ + output::{ + display_api_spec, display_error, display_summary, headers::*, plural, + write_diff, OutputOpts, Styles, + }, + spec::{all_apis, CheckStatus}, + FAILURE_EXIT_CODE, NEEDS_UPDATE_EXIT_CODE, +}; + +#[derive(Clone, Copy, Debug)] +pub(crate) enum CheckResult { + Success, + NeedsUpdate, + Failures, +} + +impl CheckResult { + pub(crate) fn to_exit_code(self) -> ExitCode { + match self { + CheckResult::Success => ExitCode::SUCCESS, + CheckResult::NeedsUpdate => NEEDS_UPDATE_EXIT_CODE.into(), + CheckResult::Failures => FAILURE_EXIT_CODE.into(), + } + } +} + +pub(crate) fn check_impl( + dir: &Utf8Path, + output: &OutputOpts, +) -> Result { + let mut styles = Styles::default(); + if output.use_color(supports_color::Stream::Stderr) { + styles.colorize(); + } + + let all_apis = all_apis(); + let total = all_apis.len(); + let count_width = total.to_string().len(); + let continued_indent = continued_indent(count_width); + + eprintln!("{:>HEADER_WIDTH$}", SEPARATOR); + + eprintln!( + "{:>HEADER_WIDTH$} {} OpenAPI {}...", + CHECKING.style(styles.success_header), + total.style(styles.bold), + plural::documents(total), + ); + let mut num_up_to_date = 0; + let mut num_stale = 0; + let mut num_missing = 0; + let mut num_failed = 0; + + for (ix, spec) in all_apis.iter().enumerate() { + let count = ix + 1; + + match spec.check(&dir) { + Ok(status) => match status { + CheckStatus::Ok(summary) => { + eprintln!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {}", + UP_TO_DATE.style(styles.success_header), + display_api_spec(spec, &styles), + display_summary(&summary, &styles), + ); + + num_up_to_date += 1; + } + CheckStatus::Stale { full_path, actual, expected } => { + eprintln!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", + STALE.style(styles.warning_header), + display_api_spec(spec, &styles), + ); + + let diff = TextDiff::from_lines(&actual, &expected); + write_diff( + &diff, + &full_path, + &styles, + // Add an indent to align diff with the status message. 
+ &mut IndentWriter::new( + &continued_indent, + std::io::stderr(), + ), + )?; + + num_stale += 1; + } + CheckStatus::Missing => { + eprintln!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", + MISSING.style(styles.warning_header), + display_api_spec(spec, &styles), + ); + + num_missing += 1; + } + }, + Err(error) => { + eprint!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", + FAILURE.style(styles.failure_header), + display_api_spec(spec, &styles), + ); + let display = display_error(&error, styles.failure); + write!( + IndentWriter::new(&continued_indent, std::io::stderr()), + "{}", + display, + )?; + + num_failed += 1; + } + }; + } + + eprintln!("{:>HEADER_WIDTH$}", SEPARATOR); + + let status_header = if num_failed > 0 { + FAILURE.style(styles.failure_header) + } else if num_stale > 0 { + STALE.style(styles.warning_header) + } else { + SUCCESS.style(styles.success_header) + }; + + eprintln!( + "{:>HEADER_WIDTH$} {} {} checked: {} up-to-date, {} stale, {} missing, {} failed", + status_header, + total.style(styles.bold), + plural::documents(total), + num_up_to_date.style(styles.bold), + num_stale.style(styles.bold), + num_missing.style(styles.bold), + num_failed.style(styles.bold), + ); + if num_failed > 0 { + eprintln!( + "{:>HEADER_WIDTH$} (fix failures, then run {} to update)", + "", + "cargo xtask openapi generate".style(styles.bold) + ); + Ok(CheckResult::Failures) + } else if num_stale > 0 { + eprintln!( + "{:>HEADER_WIDTH$} (run {} to update)", + "", + "cargo xtask openapi generate".style(styles.bold) + ); + Ok(CheckResult::NeedsUpdate) + } else { + Ok(CheckResult::Success) + } +} + +#[cfg(test)] +mod tests { + use std::process::ExitCode; + + use crate::spec::find_openapi_dir; + + use super::*; + + #[test] + fn check_apis_up_to_date() -> Result { + let output = OutputOpts { color: clap::ColorChoice::Auto }; + let dir = find_openapi_dir()?; + + let result = check_impl(&dir, &output)?; + Ok(result.to_exit_code()) + } +} diff --git a/dev-tools/openapi-manager/src/dispatch.rs b/dev-tools/openapi-manager/src/dispatch.rs new file mode 100644 index 0000000000..937a8b485f --- /dev/null +++ b/dev-tools/openapi-manager/src/dispatch.rs @@ -0,0 +1,99 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::process::ExitCode; + +use anyhow::Result; +use camino::Utf8PathBuf; +use clap::{Args, Parser, Subcommand}; + +use crate::{ + check::check_impl, generate::generate_impl, list::list_impl, + output::OutputOpts, spec::openapi_dir, +}; + +/// Manage OpenAPI specifications. +/// +/// For more information, see dev-tools/openapi-manager/README.adoc. +#[derive(Debug, Parser)] +pub struct App { + #[clap(flatten)] + output_opts: OutputOpts, + + #[clap(subcommand)] + command: Command, +} + +impl App { + pub fn exec(self) -> Result { + match self.command { + Command::List(args) => args.exec(&self.output_opts), + Command::Generate(args) => args.exec(&self.output_opts), + Command::Check(args) => args.exec(&self.output_opts), + } + } +} + +#[derive(Debug, Subcommand)] +pub enum Command { + /// List managed APIs. + /// + /// Returns information purely from code without consulting JSON files on + /// disk. To compare against files on disk, use the `check` command. + List(ListArgs), + + /// Generate APIs. + Generate(GenerateArgs), + + /// Check that APIs are up-to-date. 
+ Check(CheckArgs), +} + +#[derive(Debug, Args)] +pub struct ListArgs { + /// Show verbose output including descriptions. + #[clap(long, short)] + verbose: bool, +} + +impl ListArgs { + fn exec(self, output: &OutputOpts) -> anyhow::Result { + list_impl(self.verbose, output)?; + Ok(ExitCode::SUCCESS) + } +} + +#[derive(Debug, Args)] +pub struct GenerateArgs { + /// The directory to write generated APIs to (default: workspace root/openapi) + #[clap(long)] + dir: Option, +} + +impl GenerateArgs { + fn exec(self, output: &OutputOpts) -> anyhow::Result { + let dir = openapi_dir(self.dir)?; + Ok(generate_impl(&dir, output)?.to_exit_code()) + } +} + +#[derive(Debug, Args)] +pub struct CheckArgs { + /// The directory to read generated APIs from. + #[clap(long)] + dir: Option, +} + +impl CheckArgs { + fn exec(self, output: &OutputOpts) -> anyhow::Result { + let dir = openapi_dir(self.dir)?; + Ok(check_impl(&dir, output)?.to_exit_code()) + } +} + +// This code is not 0 or 1 (general anyhow errors) and indicates out-of-date. +pub(crate) const NEEDS_UPDATE_EXIT_CODE: u8 = 2; + +// This code indicates failures during generation, e.g. validation errors. +pub(crate) const FAILURE_EXIT_CODE: u8 = 100; diff --git a/dev-tools/openapi-manager/src/generate.rs b/dev-tools/openapi-manager/src/generate.rs new file mode 100644 index 0000000000..f776ff2709 --- /dev/null +++ b/dev-tools/openapi-manager/src/generate.rs @@ -0,0 +1,128 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::{io::Write, process::ExitCode}; + +use anyhow::Result; +use camino::Utf8Path; +use indent_write::io::IndentWriter; +use owo_colors::OwoColorize; + +use crate::{ + output::{ + display_api_spec, display_error, display_summary, headers::*, plural, + OutputOpts, Styles, + }, + spec::{all_apis, OverwriteStatus}, + FAILURE_EXIT_CODE, +}; + +#[derive(Clone, Copy, Debug)] +pub(crate) enum GenerateResult { + Success, + Failures, +} + +impl GenerateResult { + pub(crate) fn to_exit_code(self) -> ExitCode { + match self { + GenerateResult::Success => ExitCode::SUCCESS, + GenerateResult::Failures => FAILURE_EXIT_CODE.into(), + } + } +} + +pub(crate) fn generate_impl( + dir: &Utf8Path, + output: &OutputOpts, +) -> Result { + let mut styles = Styles::default(); + if output.use_color(supports_color::Stream::Stderr) { + styles.colorize(); + } + + let all_apis = all_apis(); + let total = all_apis.len(); + let count_width = total.to_string().len(); + let continued_indent = continued_indent(count_width); + + eprintln!("{:>HEADER_WIDTH$}", SEPARATOR); + + eprintln!( + "{:>HEADER_WIDTH$} {} OpenAPI {}...", + GENERATING.style(styles.success_header), + total.style(styles.bold), + plural::documents(total), + ); + let mut num_updated = 0; + let mut num_unchanged = 0; + let mut num_failed = 0; + + for (ix, spec) in all_apis.iter().enumerate() { + let count = ix + 1; + + match spec.overwrite(&dir) { + Ok((status, summary)) => match status { + OverwriteStatus::Updated => { + eprintln!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {}", + UPDATED.style(styles.success_header), + display_api_spec(spec, &styles), + display_summary(&summary, &styles), + ); + num_updated += 1; + } + OverwriteStatus::Unchanged => { + eprintln!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}: {}", + UNCHANGED.style(styles.unchanged_header), + display_api_spec(spec, &styles), + display_summary(&summary, 
&styles), + ); + num_unchanged += 1; + } + }, + Err(err) => { + eprintln!( + "{:>HEADER_WIDTH$} [{count:>count_width$}/{total}] {}", + FAILURE.style(styles.failure_header), + display_api_spec(spec, &styles), + ); + let display = display_error(&err, styles.failure); + write!( + IndentWriter::new(&continued_indent, std::io::stderr()), + "{}", + display, + )?; + + num_failed += 1; + } + }; + } + + eprintln!("{:>HEADER_WIDTH$}", SEPARATOR); + + let status_header = if num_failed > 0 { + FAILURE.style(styles.failure_header) + } else { + SUCCESS.style(styles.success_header) + }; + + eprintln!( + "{:>HEADER_WIDTH$} {} {} generated: \ + {} updated, {} unchanged, {} failed", + status_header, + total.style(styles.bold), + plural::documents(total), + num_updated.style(styles.bold), + num_unchanged.style(styles.bold), + num_failed.style(styles.bold), + ); + + if num_failed > 0 { + Ok(GenerateResult::Failures) + } else { + Ok(GenerateResult::Success) + } +} diff --git a/dev-tools/openapi-manager/src/lib.rs b/dev-tools/openapi-manager/src/lib.rs new file mode 100644 index 0000000000..0f79c5f9f4 --- /dev/null +++ b/dev-tools/openapi-manager/src/lib.rs @@ -0,0 +1,21 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! OpenAPI manager for Omicron. +//! +//! This tool generates and checks OpenAPI specifications for Omicron OpenAPI +//! documents. In the future, all OpenAPI documents will be generated by this +//! tool, but work to make that happen is ongoing. +//! +//! This is meant to be invoked as `cargo xtask openapi`, but is a separate +//! binary to avoid compiling a bunch of extra code when running `cargo xtask`. + +mod check; +mod dispatch; +mod generate; +mod list; +mod output; +mod spec; + +pub use dispatch::*; diff --git a/dev-tools/openapi-manager/src/list.rs b/dev-tools/openapi-manager/src/list.rs new file mode 100644 index 0000000000..bf1920c69d --- /dev/null +++ b/dev-tools/openapi-manager/src/list.rs @@ -0,0 +1,127 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::io::Write; + +use indent_write::io::IndentWriter; +use owo_colors::OwoColorize; + +use crate::{ + output::{display_api_spec, display_error, OutputOpts, Styles}, + spec::all_apis, +}; + +pub(crate) fn list_impl( + verbose: bool, + output: &OutputOpts, +) -> anyhow::Result<()> { + let mut styles = Styles::default(); + if output.use_color(supports_color::Stream::Stdout) { + styles.colorize(); + } + let mut out = std::io::stdout(); + + let all_apis = all_apis(); + let total = all_apis.len(); + let count_width = total.to_string().len(); + + if verbose { + // A string for verbose indentation. +1 for the closing ), and +2 for + // further indentation. + let initial_indent = " ".repeat(count_width + 1 + 2); + // This plus 4 more for continued indentation. 
+ let continued_indent = " ".repeat(count_width + 1 + 2 + 4); + + for (ix, api) in all_apis.iter().enumerate() { + let count = ix + 1; + + writeln!( + &mut out, + "{count:count_width$}) {}", + api.filename.style(styles.bold), + )?; + + writeln!( + &mut out, + "{initial_indent} {}: {} v{}", + "title".style(styles.header), + api.title, + api.version, + )?; + + write!( + &mut out, + "{initial_indent} {}: ", + "description".style(styles.header) + )?; + writeln!( + IndentWriter::new_skip_initial(&continued_indent, &mut out), + "{}", + api.description, + )?; + + writeln!( + &mut out, + "{initial_indent} {}: {}", + "boundary".style(styles.header), + api.boundary, + )?; + + match api.to_openapi_doc() { + Ok(openapi) => { + let num_paths = openapi.paths.paths.len(); + let num_schemas = openapi.components.map_or_else( + || "(data missing)".to_owned(), + |c| c.schemas.len().to_string(), + ); + writeln!( + &mut out, + "{initial_indent} {}: {} paths, {} schemas", + "details".style(styles.header), + num_paths.style(styles.bold), + num_schemas.style(styles.bold), + )?; + } + Err(error) => { + write!( + &mut out, + "{initial_indent} {}: ", + "error".style(styles.failure), + )?; + let display = display_error(&error, styles.failure); + write!( + IndentWriter::new_skip_initial( + &continued_indent, + std::io::stderr(), + ), + "{}", + display, + )?; + } + }; + + if ix + 1 < total { + writeln!(&mut out)?; + } + } + } else { + for (ix, spec) in all_apis.iter().enumerate() { + let count = ix + 1; + + writeln!( + &mut out, + "{count:count_width$}) {}", + display_api_spec(spec, &styles), + )?; + } + + writeln!( + &mut out, + "note: run with {} for more information", + "-v".style(styles.bold), + )?; + } + + Ok(()) +} diff --git a/dev-tools/openapi-manager/src/main.rs b/dev-tools/openapi-manager/src/main.rs new file mode 100644 index 0000000000..422a1553b0 --- /dev/null +++ b/dev-tools/openapi-manager/src/main.rs @@ -0,0 +1,13 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::process::ExitCode; + +use clap::Parser; +use openapi_manager::App; + +fn main() -> ExitCode { + let app = App::parse(); + app.exec().unwrap() +} diff --git a/dev-tools/openapi-manager/src/output.rs b/dev-tools/openapi-manager/src/output.rs new file mode 100644 index 0000000000..6cd578e778 --- /dev/null +++ b/dev-tools/openapi-manager/src/output.rs @@ -0,0 +1,253 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::{fmt, fmt::Write, io}; + +use camino::Utf8Path; +use clap::{Args, ColorChoice}; +use indent_write::fmt::IndentWriter; +use owo_colors::{OwoColorize, Style}; +use similar::{ChangeTag, DiffableStr, TextDiff}; + +use crate::spec::{ApiSpec, DocumentSummary}; + +#[derive(Debug, Args)] +#[clap(next_help_heading = "Global options")] +pub struct OutputOpts { + /// Color output + #[clap(long, value_enum, global = true, default_value_t)] + pub(crate) color: ColorChoice, +} + +impl OutputOpts { + /// Returns true if color should be used for the stream. 
+ pub(crate) fn use_color(&self, stream: supports_color::Stream) -> bool { + match self.color { + ColorChoice::Auto => supports_color::on_cached(stream).is_some(), + ColorChoice::Always => true, + ColorChoice::Never => false, + } + } +} + +#[derive(Clone, Debug, Default)] +pub(crate) struct Styles { + pub(crate) bold: Style, + pub(crate) header: Style, + pub(crate) success_header: Style, + pub(crate) failure: Style, + pub(crate) failure_header: Style, + pub(crate) warning_header: Style, + pub(crate) unchanged_header: Style, + pub(crate) filename: Style, + pub(crate) diff_before: Style, + pub(crate) diff_after: Style, +} + +impl Styles { + pub(crate) fn colorize(&mut self) { + self.bold = Style::new().bold(); + self.header = Style::new().purple(); + self.success_header = Style::new().green().bold(); + self.failure = Style::new().red(); + self.failure_header = Style::new().red().bold(); + self.unchanged_header = Style::new().blue().bold(); + self.warning_header = Style::new().yellow().bold(); + self.filename = Style::new().cyan(); + self.diff_before = Style::new().red(); + self.diff_after = Style::new().green(); + } +} + +// This is copied from similar's UnifiedDiff::to_writer, except with colorized +// output. +pub(crate) fn write_diff<'diff, 'old, 'new, 'bufs>( + diff: &'diff TextDiff<'old, 'new, 'bufs, [u8]>, + full_path: &Utf8Path, + styles: &Styles, + out: &mut dyn io::Write, +) -> io::Result<()> +where + 'diff: 'old + 'new + 'bufs, +{ + // The "a/" (/ courtesy full_path) and "b/" make it feel more like git diff. + writeln!( + out, + "{}", + format!("--- a{}", full_path).style(styles.diff_before) + )?; + writeln!( + out, + "{}", + format!("+++ b/generated/{}", full_path.file_name().unwrap()) + .style(styles.diff_after) + )?; + + let udiff = diff.unified_diff(); + for hunk in udiff.iter_hunks() { + for (idx, change) in hunk.iter_changes().enumerate() { + if idx == 0 { + writeln!(out, "{}", hunk.header())?; + } + let style = match change.tag() { + ChangeTag::Delete => styles.diff_before, + ChangeTag::Insert => styles.diff_after, + ChangeTag::Equal => Style::new(), + }; + + write!(out, "{}", change.tag().style(style))?; + write!(out, "{}", change.value().to_string_lossy().style(style))?; + if !diff.newline_terminated() { + writeln!(out)?; + } + if diff.newline_terminated() && change.missing_newline() { + writeln!( + out, + "{}", + MissingNewlineHint(hunk.missing_newline_hint()) + )?; + } + } + } + + Ok(()) +} + +pub(crate) fn display_api_spec(spec: &ApiSpec, styles: &Styles) -> String { + format!( + "{} ({} v{})", + spec.filename.style(styles.filename), + spec.title, + spec.version, + ) +} + +pub(crate) fn display_summary( + summary: &DocumentSummary, + styles: &Styles, +) -> String { + let mut summary_str = format!( + "{} {}", + summary.path_count.style(styles.bold), + plural::paths(summary.path_count), + ); + + if let Some(schema_count) = summary.schema_count { + summary_str.push_str(&format!( + ", {} {}", + schema_count.style(styles.bold), + plural::schemas(schema_count), + )); + } else { + summary_str.push_str(&format!( + ", {} for schemas", + "data missing".style(styles.failure) + )); + } + + summary_str +} + +pub(crate) fn display_error( + error: &anyhow::Error, + failure_style: Style, +) -> impl fmt::Display + '_ { + struct DisplayError<'a> { + error: &'a anyhow::Error, + failure_style: Style, + } + + impl fmt::Display for DisplayError<'_> { + fn fmt(&self, mut f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "{}", self.error.style(self.failure_style))?; + + let mut source 
= self.error.source(); + while let Some(curr) = source { + write!(f, "-> ")?; + writeln!( + IndentWriter::new_skip_initial(" ", &mut f), + "{}", + curr.style(self.failure_style), + )?; + source = curr.source(); + } + + Ok(()) + } + } + + DisplayError { error, failure_style } +} + +struct MissingNewlineHint(bool); + +impl fmt::Display for MissingNewlineHint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.0 { + write!(f, "\n\\ No newline at end of file")?; + } + Ok(()) + } +} + +/// Output headers. +pub(crate) mod headers { + // Same width as Cargo's output. + pub(crate) const HEADER_WIDTH: usize = 12; + + pub(crate) static SEPARATOR: &str = "-------"; + + pub(crate) static CHECKING: &str = "Checking"; + pub(crate) static GENERATING: &str = "Generating"; + + pub(crate) static UP_TO_DATE: &str = "Up-to-date"; + pub(crate) static STALE: &str = "Stale"; + pub(crate) static MISSING: &str = "Missing"; + + pub(crate) static UPDATED: &str = "Updated"; + pub(crate) static UNCHANGED: &str = "Unchanged"; + + pub(crate) static SUCCESS: &str = "Success"; + pub(crate) static FAILURE: &str = "Failure"; + + pub(crate) fn continued_indent(count_width: usize) -> String { + // Status strings are of the form: + // + // Generated [ 1/12] api.json: 1 path, 1 schema + // + // So the continued indent is: + // + // HEADER_WIDTH for the status string + // + (count_width * 2) for current and total counts + // + 3 for '[/]' + // + 2 for spaces on either side. + " ".repeat(HEADER_WIDTH + count_width * 2 + 3 + 2) + } +} + +pub(crate) mod plural { + pub(crate) fn documents(count: usize) -> &'static str { + if count == 1 { + "document" + } else { + "documents" + } + } + + pub(crate) fn paths(count: usize) -> &'static str { + if count == 1 { + "path" + } else { + "paths" + } + } + + pub(crate) fn schemas(count: usize) -> &'static str { + if count == 1 { + "schema" + } else { + "schemas" + } + } +} diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs new file mode 100644 index 0000000000..37330d6922 --- /dev/null +++ b/dev-tools/openapi-manager/src/spec.rs @@ -0,0 +1,260 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::{fmt, io::Write}; + +use anyhow::{Context, Result}; +use atomicwrites::AtomicFile; +use camino::{Utf8Path, Utf8PathBuf}; +use dropshot::{ApiDescription, ApiDescriptionBuildErrors, StubContext}; +use fs_err as fs; +use openapiv3::OpenAPI; + +/// All APIs managed by openapi-manager. +pub fn all_apis() -> Vec { + vec![ + ApiSpec { + title: "Nexus internal API".to_string(), + version: "0.0.1".to_string(), + description: "Nexus internal API".to_string(), + boundary: ApiBoundary::Internal, + api_description: + nexus_internal_api::nexus_internal_api_mod::stub_api_description, + filename: "nexus-internal.json".to_string(), + extra_validation: None, + }, + // Add your APIs here! Please keep this list sorted by filename. + ] +} + +pub struct ApiSpec { + /// The title. + pub title: String, + + /// The version. + pub version: String, + + /// The description string. + pub description: String, + + /// Whether this API is internal or external. + pub boundary: ApiBoundary, + + /// The API description function, typically a reference to + /// `stub_api_description`. + pub api_description: + fn() -> Result, ApiDescriptionBuildErrors>, + + /// The JSON filename to write the API description to. 
+ pub filename: String, + + /// Extra validation to perform on the OpenAPI spec, if any. + pub extra_validation: Option anyhow::Result<()>>, +} + +impl ApiSpec { + pub(crate) fn overwrite( + &self, + dir: &Utf8Path, + ) -> Result<(OverwriteStatus, DocumentSummary)> { + let contents = self.to_json_bytes()?; + + let summary = self + .validate_json(&contents) + .context("OpenAPI document validation failed")?; + + let full_path = dir.join(&self.filename); + let status = overwrite_file(&full_path, &contents)?; + + Ok((status, summary)) + } + + pub(crate) fn check(&self, dir: &Utf8Path) -> Result { + let contents = self.to_json_bytes()?; + let summary = self + .validate_json(&contents) + .context("OpenAPI document validation failed")?; + + let full_path = dir.join(&self.filename); + let existing_contents = + read_opt(&full_path).context("failed to read contents on disk")?; + + match existing_contents { + Some(existing_contents) if existing_contents == contents => { + Ok(CheckStatus::Ok(summary)) + } + Some(existing_contents) => Ok(CheckStatus::Stale { + full_path, + actual: existing_contents, + expected: contents, + }), + None => Ok(CheckStatus::Missing), + } + } + + pub(crate) fn to_openapi_doc(&self) -> Result { + // It's a bit weird to first convert to bytes and then back to OpenAPI, + // but this is the easiest way to do so (currently, Dropshot doesn't + // return the OpenAPI type directly). It is also consistent with the + // other code paths. + let contents = self.to_json_bytes()?; + contents_to_openapi(&contents) + } + + fn to_json_bytes(&self) -> Result> { + let description = (self.api_description)().map_err(|error| { + // ApiDescriptionBuildError is actually a list of errors so it + // doesn't implement std::error::Error itself. Its Display + // impl formats the errors appropriately. + anyhow::anyhow!("{}", error) + })?; + let mut openapi_def = description.openapi(&self.title, &self.version); + openapi_def + .description(&self.description) + .contact_url("https://oxide.computer") + .contact_email("api@oxide.computer"); + + // Use write because it's the most reliable way to get the canonical + // JSON order. The `json` method returns a serde_json::Value which may + // or may not have preserve_order enabled. + let mut contents = Vec::new(); + openapi_def.write(&mut contents)?; + Ok(contents) + } + + fn validate_json(&self, contents: &[u8]) -> Result { + let openapi_doc = contents_to_openapi(contents) + .context("JSON returned by ApiDescription is not valid OpenAPI")?; + + // Check for lint errors. + let errors = match self.boundary { + ApiBoundary::Internal => openapi_lint::validate(&openapi_doc), + ApiBoundary::External => { + openapi_lint::validate_external(&openapi_doc) + } + }; + if !errors.is_empty() { + return Err(anyhow::anyhow!("{}", errors.join("\n\n"))); + } + + if let Some(extra_validation) = self.extra_validation { + extra_validation(&openapi_doc)?; + } + + Ok(DocumentSummary::new(&openapi_doc)) + } +} + +fn contents_to_openapi(contents: &[u8]) -> Result { + serde_json::from_slice(&contents) + .context("JSON returned by ApiDescription is not valid OpenAPI") +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ApiBoundary { + Internal, + #[allow(dead_code)] // Remove this when we start managing an external API. 
+ External, +} + +impl fmt::Display for ApiBoundary { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ApiBoundary::Internal => write!(f, "internal"), + ApiBoundary::External => write!(f, "external"), + } + } +} + +#[derive(Debug)] +#[must_use] +pub(crate) enum OverwriteStatus { + Updated, + Unchanged, +} + +#[derive(Debug)] +#[must_use] +pub(crate) enum CheckStatus { + Ok(DocumentSummary), + Stale { full_path: Utf8PathBuf, actual: Vec, expected: Vec }, + Missing, +} + +#[derive(Debug)] +#[must_use] +pub(crate) struct DocumentSummary { + pub(crate) path_count: usize, + // None if data is missing. + pub(crate) schema_count: Option, +} + +impl DocumentSummary { + fn new(doc: &OpenAPI) -> Self { + Self { + path_count: doc.paths.paths.len(), + schema_count: doc + .components + .as_ref() + .map_or(None, |c| Some(c.schemas.len())), + } + } +} + +pub(crate) fn openapi_dir(dir: Option) -> Result { + match dir { + Some(dir) => Ok(dir.canonicalize_utf8().with_context(|| { + format!("failed to canonicalize directory: {}", dir) + })?), + None => find_openapi_dir().context("failed to find openapi directory"), + } +} + +pub(crate) fn find_openapi_dir() -> Result { + let mut root = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // This crate is two levels down from the root of omicron, so go up twice. + root.pop(); + root.pop(); + + root.push("openapi"); + let root = root.canonicalize_utf8().with_context(|| { + format!("failed to canonicalize openapi directory: {}", root) + })?; + + if !root.is_dir() { + anyhow::bail!("openapi root is not a directory: {}", root); + } + + Ok(root) +} + +/// Overwrite a file with new contents, if the contents are different. +/// +/// The file is left unchanged if the contents are the same. That's to avoid +/// mtime-based recompilations. +fn overwrite_file(path: &Utf8Path, contents: &[u8]) -> Result { + // Only overwrite the file if the contents are actually different. + let existing_contents = + read_opt(path).context("failed to read contents on disk")?; + + // None means the file doesn't exist, in which case we always want to write + // the new contents. + if existing_contents.as_deref() == Some(contents) { + return Ok(OverwriteStatus::Unchanged); + } + + AtomicFile::new(path, atomicwrites::OverwriteBehavior::AllowOverwrite) + .write(|f| f.write_all(contents)) + .with_context(|| format!("failed to write to `{}`", path))?; + + Ok(OverwriteStatus::Updated) +} + +fn read_opt(path: &Utf8Path) -> std::io::Result>> { + match fs::read(path) { + Ok(contents) => Ok(Some(contents)), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(err) => return Err(err), + } +} diff --git a/dev-tools/xtask/src/main.rs b/dev-tools/xtask/src/main.rs index 3d8acceb3d..d0a61272a9 100644 --- a/dev-tools/xtask/src/main.rs +++ b/dev-tools/xtask/src/main.rs @@ -46,6 +46,11 @@ enum Cmds { /// Download binaries, OpenAPI specs, and other out-of-repo utilities. Download(download::DownloadArgs), + /// Manage OpenAPI specifications. + /// + /// For more information, see dev-tools/openapi-manager/README.adoc. 
+ Openapi(external::External), + #[cfg(target_os = "illumos")] /// Build a TUF repo Releng(external::External), @@ -88,6 +93,7 @@ async fn main() -> Result<()> { Cmds::Clippy(args) => clippy::run_cmd(args), Cmds::CheckWorkspaceDeps => check_workspace_deps::run_cmd(), Cmds::Download(args) => download::run_cmd(args).await, + Cmds::Openapi(external) => external.exec_bin("openapi-manager"), #[cfg(target_os = "illumos")] Cmds::Releng(external) => { external.cargo_args(["--release"]).exec_bin("omicron-releng") diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index fb6a07969d..359ea616d4 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -45,6 +45,7 @@ macaddr.workspace = true # integration tests. nexus-client.workspace = true nexus-config.workspace = true +nexus-internal-api.workspace = true nexus-networking.workspace = true nexus-test-interface.workspace = true num-integer.workspace = true diff --git a/nexus/db-model/src/external_ip.rs b/nexus/db-model/src/external_ip.rs index 2a68b4d7d0..8226f8293e 100644 --- a/nexus/db-model/src/external_ip.rs +++ b/nexus/db-model/src/external_ip.rs @@ -22,6 +22,8 @@ use nexus_types::deployment::OmicronZoneExternalIp; use nexus_types::deployment::OmicronZoneExternalSnatIp; use nexus_types::external_api::params; use nexus_types::external_api::shared; +use nexus_types::external_api::shared::ProbeExternalIp; +use nexus_types::external_api::shared::ProbeExternalIpKind; use nexus_types::external_api::views; use nexus_types::inventory::SourceNatConfig; use omicron_common::api::external::Error; @@ -191,6 +193,27 @@ impl TryFrom<&'_ ExternalIp> for OmicronZoneExternalIp { } } +impl From for ProbeExternalIp { + fn from(value: ExternalIp) -> Self { + Self { + ip: value.ip.ip(), + first_port: value.first_port.0, + last_port: value.last_port.0, + kind: value.kind.into(), + } + } +} + +impl From for ProbeExternalIpKind { + fn from(value: IpKind) -> Self { + match value { + IpKind::SNat => ProbeExternalIpKind::Snat, + IpKind::Ephemeral => ProbeExternalIpKind::Ephemeral, + IpKind::Floating => ProbeExternalIpKind::Floating, + } + } +} + /// A view type constructed from `ExternalIp` used to represent Floating IP /// objects in user-facing APIs. 
/// diff --git a/nexus/db-model/src/ipv4_nat_entry.rs b/nexus/db-model/src/ipv4_nat_entry.rs index 4ff1ee9171..c60c37a0bf 100644 --- a/nexus/db-model/src/ipv4_nat_entry.rs +++ b/nexus/db-model/src/ipv4_nat_entry.rs @@ -1,13 +1,10 @@ -use std::net::{Ipv4Addr, Ipv6Addr}; - use super::MacAddr; use crate::{ schema::ipv4_nat_changes, schema::ipv4_nat_entry, Ipv4Net, Ipv6Net, SqlU16, Vni, }; use chrono::{DateTime, Utc}; -use omicron_common::api::external; -use schemars::JsonSchema; +use nexus_types::internal_api::views::Ipv4NatEntryView; use serde::Deserialize; use serde::Serialize; use uuid::Uuid; @@ -65,19 +62,6 @@ pub struct Ipv4NatChange { pub deleted: bool, } -/// NAT Record -#[derive(Clone, Debug, Serialize, JsonSchema)] -pub struct Ipv4NatEntryView { - pub external_address: Ipv4Addr, - pub first_port: u16, - pub last_port: u16, - pub sled_address: Ipv6Addr, - pub vni: external::Vni, - pub mac: external::MacAddr, - pub gen: i64, - pub deleted: bool, -} - impl From for Ipv4NatEntryView { fn from(value: Ipv4NatChange) -> Self { Self { diff --git a/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs b/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs index 5b370f27a9..0a514f55dc 100644 --- a/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs +++ b/nexus/db-queries/src/db/datastore/ipv4_nat_entry.rs @@ -10,7 +10,7 @@ use diesel::prelude::*; use diesel::sql_types::BigInt; use nexus_db_model::ExternalIp; use nexus_db_model::Ipv4NatChange; -use nexus_db_model::Ipv4NatEntryView; +use nexus_types::internal_api::views::Ipv4NatEntryView; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index ca7c76c0ae..461e71d88a 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -112,7 +112,6 @@ pub use dns::DnsVersionUpdateBuilder; pub use instance::InstanceAndActiveVmm; pub use inventory::DataStoreInventoryTest; use nexus_db_model::AllSchemaVersions; -pub use probe::ProbeInfo; pub use rack::RackInit; pub use rack::SledUnderlayAllocationResult; pub use region::RegionAllocationFor; diff --git a/nexus/db-queries/src/db/datastore/probe.rs b/nexus/db-queries/src/db/datastore/probe.rs index a96f857163..f3e0614552 100644 --- a/nexus/db-queries/src/db/datastore/probe.rs +++ b/nexus/db-queries/src/db/datastore/probe.rs @@ -1,5 +1,3 @@ -use std::net::IpAddr; - use crate::authz; use crate::context::OpContext; use crate::db; @@ -15,6 +13,7 @@ use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use nexus_db_model::IncompleteNetworkInterface; use nexus_db_model::Probe; use nexus_db_model::VpcSubnet; +use nexus_types::external_api::shared::ProbeInfo; use nexus_types::identity::Resource; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; @@ -32,34 +31,6 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use uuid::Uuid; -#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] -pub struct ProbeInfo { - pub id: Uuid, - pub name: Name, - sled: Uuid, - pub external_ips: Vec, - pub interface: NetworkInterface, -} - -#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] -pub struct ProbeExternalIp { - ip: IpAddr, - first_port: u16, - last_port: u16, - kind: IpKind, -} - -impl From for ProbeExternalIp { - fn from(value: nexus_db_model::ExternalIp) -> Self { - Self { - ip: value.ip.ip(), - first_port: 
value.first_port.0, - last_port: value.last_port.0, - kind: value.kind.into(), - } - } -} - #[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum IpKind { @@ -141,7 +112,7 @@ impl super::DataStore { result.push(ProbeInfo { id: probe.id(), - name: probe.name().clone().into(), + name: probe.name().clone(), sled: probe.sled, interface, external_ips, @@ -184,7 +155,7 @@ impl super::DataStore { Ok(ProbeInfo { id: probe.id(), - name: probe.name().clone().into(), + name: probe.name().clone(), sled: probe.sled, interface, external_ips, diff --git a/nexus/internal-api/Cargo.toml b/nexus/internal-api/Cargo.toml new file mode 100644 index 0000000000..76fa6bd59a --- /dev/null +++ b/nexus/internal-api/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "nexus-internal-api" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +dropshot.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +omicron-workspace-hack.workspace = true +serde.workspace = true +schemars.workspace = true +uuid.workspace = true diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs new file mode 100644 index 0000000000..b2d68036bb --- /dev/null +++ b/nexus/internal-api/src/lib.rs @@ -0,0 +1,591 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use std::collections::{BTreeMap, BTreeSet}; + +use dropshot::{ + FreeformBody, HttpError, HttpResponseCreated, HttpResponseDeleted, + HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, + ResultsPage, TypedBody, +}; +use nexus_types::{ + deployment::{ + Blueprint, BlueprintMetadata, BlueprintTarget, BlueprintTargetSet, + }, + external_api::{ + params::{SledSelector, UninitializedSledId}, + shared::{ProbeInfo, UninitializedSled}, + views::SledPolicy, + }, + internal_api::{ + params::{ + OximeterInfo, RackInitializationRequest, SledAgentInfo, + SwitchPutRequest, SwitchPutResponse, + }, + views::{BackgroundTask, Ipv4NatEntryView, Saga}, + }, +}; +use omicron_common::{ + api::{ + external::http_pagination::PaginatedById, + internal::nexus::{ + DiskRuntimeState, DownstairsClientStopRequest, + DownstairsClientStopped, ProducerEndpoint, + ProducerRegistrationResponse, RepairFinishInfo, RepairProgress, + RepairStartInfo, SledInstanceState, + }, + }, + update::ArtifactId, +}; +use omicron_uuid_kinds::{ + DownstairsKind, SledUuid, TypedUuid, UpstairsKind, UpstairsRepairKind, +}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +#[dropshot::api_description { + // Named something different to let 'import nexus_internal_api::*;' work. + module = "nexus_internal_api_mod", +}] +pub trait NexusInternalApi { + type Context; + + /// Return information about the given sled agent + #[endpoint { + method = GET, + path = "/sled-agents/{sled_id}", + }] + async fn sled_agent_get( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Report that the sled agent for the specified sled has come online. + #[endpoint { + method = POST, + path = "/sled-agents/{sled_id}", + }] + async fn sled_agent_put( + rqctx: RequestContext, + path_params: Path, + sled_info: TypedBody, + ) -> Result; + + /// Request a new set of firewall rules for a sled. 
+ /// + /// This causes Nexus to read the latest set of rules for the sled, + /// and call a Sled endpoint which applies the rules to all OPTE ports + /// that happen to exist. + #[endpoint { + method = POST, + path = "/sled-agents/{sled_id}/firewall-rules-update", + }] + async fn sled_firewall_rules_request( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + + /// Report that the Rack Setup Service initialization is complete + /// + /// See RFD 278 for more details. + #[endpoint { + method = PUT, + path = "/racks/{rack_id}/initialization-complete", + }] + async fn rack_initialization_complete( + rqctx: RequestContext, + path_params: Path, + info: TypedBody, + ) -> Result; + + #[endpoint { + method = PUT, + path = "/switch/{switch_id}", + }] + async fn switch_put( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; + + /// Report updated state for an instance. + #[endpoint { + method = PUT, + path = "/instances/{instance_id}", + }] + async fn cpapi_instances_put( + rqctx: RequestContext, + path_params: Path, + new_runtime_state: TypedBody, + ) -> Result; + + /// Report updated state for a disk. + #[endpoint { + method = PUT, + path = "/disks/{disk_id}", + }] + async fn cpapi_disks_put( + rqctx: RequestContext, + path_params: Path, + new_runtime_state: TypedBody, + ) -> Result; + + /// Request removal of a read_only_parent from a volume. + /// + /// A volume can be created with the source data for that volume being another + /// volume that attached as a "read_only_parent". In the background there + /// exists a scrubber that will copy the data from the read_only_parent + /// into the volume. When that scrubber has completed copying the data, this + /// endpoint can be called to update the database that the read_only_parent + /// is no longer needed for a volume and future attachments of this volume + /// should not include that read_only_parent. + #[endpoint { + method = POST, + path = "/volume/{volume_id}/remove-read-only-parent", + }] + async fn cpapi_volume_remove_read_only_parent( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + + /// Request removal of a read_only_parent from a disk. + /// + /// This is a thin wrapper around the volume_remove_read_only_parent saga. + /// All we are doing here is, given a disk UUID, figure out what the + /// volume_id is for that disk, then use that to call the + /// disk_remove_read_only_parent saga on it. + #[endpoint { + method = POST, + path = "/disk/{disk_id}/remove-read-only-parent", + }] + async fn cpapi_disk_remove_read_only_parent( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + + /// Accept a registration from a new metric producer + #[endpoint { + method = POST, + path = "/metrics/producers", + }] + async fn cpapi_producers_post( + request_context: RequestContext, + producer_info: TypedBody, + ) -> Result, HttpError>; + + /// List all metric producers assigned to an oximeter collector. + #[endpoint { + method = GET, + path = "/metrics/collectors/{collector_id}/producers", + }] + async fn cpapi_assigned_producers_list( + request_context: RequestContext, + path_params: Path, + query_params: Query, + ) -> Result>, HttpError>; + + /// Accept a notification of a new oximeter collection server. + #[endpoint { + method = POST, + path = "/metrics/collectors", + }] + async fn cpapi_collectors_post( + request_context: RequestContext, + oximeter_info: TypedBody, + ) -> Result; + + /// Endpoint used by Sled Agents to download cached artifacts. 
+ #[endpoint { + method = GET, + path = "/artifacts/{kind}/{name}/{version}", + }] + async fn cpapi_artifact_download( + request_context: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// An Upstairs will notify this endpoint when a repair starts + #[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/repair-start", + }] + async fn cpapi_upstairs_repair_start( + rqctx: RequestContext, + path_params: Path, + repair_start_info: TypedBody, + ) -> Result; + + /// An Upstairs will notify this endpoint when a repair finishes. + #[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/repair-finish", + }] + async fn cpapi_upstairs_repair_finish( + rqctx: RequestContext, + path_params: Path, + repair_finish_info: TypedBody, + ) -> Result; + + /// An Upstairs will update this endpoint with the progress of a repair + #[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress", + }] + async fn cpapi_upstairs_repair_progress( + rqctx: RequestContext, + path_params: Path, + repair_progress: TypedBody, + ) -> Result; + + /// An Upstairs will update this endpoint if a Downstairs client task is + /// requested to stop + #[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request", + }] + async fn cpapi_downstairs_client_stop_request( + rqctx: RequestContext, + path_params: Path, + downstairs_client_stop_request: TypedBody, + ) -> Result; + + /// An Upstairs will update this endpoint if a Downstairs client task stops for + /// any reason (not just after being requested to) + #[endpoint { + method = POST, + path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped", + }] + async fn cpapi_downstairs_client_stopped( + rqctx: RequestContext, + path_params: Path, + downstairs_client_stopped: TypedBody, + ) -> Result; + + // Sagas + + /// List sagas + #[endpoint { + method = GET, + path = "/sagas", + }] + async fn saga_list( + rqctx: RequestContext, + query_params: Query, + ) -> Result>, HttpError>; + + /// Fetch a saga + #[endpoint { + method = GET, + path = "/sagas/{saga_id}", + }] + async fn saga_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + // Background Tasks + + /// List background tasks + /// + /// This is a list of discrete background activities that Nexus carries out. + /// This is exposed for support and debugging. + #[endpoint { + method = GET, + path = "/bgtasks", + }] + async fn bgtask_list( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// Fetch status of one background task + /// + /// This is exposed for support and debugging. + #[endpoint { + method = GET, + path = "/bgtasks/view/{bgtask_name}", + }] + async fn bgtask_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Activates one or more background tasks, causing them to be run immediately + /// if idle, or scheduled to run again as soon as possible if already running. + #[endpoint { + method = POST, + path = "/bgtasks/activate", + }] + async fn bgtask_activate( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + // NAT RPW internal APIs + + /// Fetch NAT ChangeSet + /// + /// Caller provides their generation as `from_gen`, along with a query + /// parameter for the page size (`limit`). Endpoint will return changes + /// that have occured since the caller's generation number up to the latest + /// change or until the `limit` is reached. 
If there are no changes, an + /// empty vec is returned. + #[endpoint { + method = GET, + path = "/nat/ipv4/changeset/{from_gen}" + }] + async fn ipv4_nat_changeset( + rqctx: RequestContext, + path_params: Path, + query_params: Query, + ) -> Result>, HttpError>; + + // APIs for managing blueprints + // + // These are not (yet) intended for use by any other programs. Eventually, we + // will want this functionality part of the public API. But we don't want to + // commit to any of this yet. These properly belong in an RFD 399-style + // "Service and Support API". Absent that, we stick them here. + + /// Lists blueprints + #[endpoint { + method = GET, + path = "/deployment/blueprints/all", + }] + async fn blueprint_list( + rqctx: RequestContext, + query_params: Query, + ) -> Result>, HttpError>; + + /// Fetches one blueprint + #[endpoint { + method = GET, + path = "/deployment/blueprints/all/{blueprint_id}", + }] + async fn blueprint_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError>; + + /// Deletes one blueprint + #[endpoint { + method = DELETE, + path = "/deployment/blueprints/all/{blueprint_id}", + }] + async fn blueprint_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result; + + // Managing the current target blueprint + + /// Fetches the current target blueprint, if any + #[endpoint { + method = GET, + path = "/deployment/blueprints/target", + }] + async fn blueprint_target_view( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Make the specified blueprint the new target + #[endpoint { + method = POST, + path = "/deployment/blueprints/target", + }] + async fn blueprint_target_set( + rqctx: RequestContext, + target: TypedBody, + ) -> Result, HttpError>; + + /// Set the `enabled` field of the current target blueprint + #[endpoint { + method = PUT, + path = "/deployment/blueprints/target/enabled", + }] + async fn blueprint_target_set_enabled( + rqctx: RequestContext, + target: TypedBody, + ) -> Result, HttpError>; + + // Generating blueprints + + /// Generates a new blueprint for the current system, re-evaluating anything + /// that's changed since the last one was generated + #[endpoint { + method = POST, + path = "/deployment/blueprints/regenerate", + }] + async fn blueprint_regenerate( + rqctx: RequestContext, + ) -> Result, HttpError>; + + /// Imports a client-provided blueprint + /// + /// This is intended for development and support, not end users or operators. + #[endpoint { + method = POST, + path = "/deployment/blueprints/import", + }] + async fn blueprint_import( + rqctx: RequestContext, + blueprint: TypedBody, + ) -> Result; + + /// List uninitialized sleds + #[endpoint { + method = GET, + path = "/sleds/uninitialized", + }] + async fn sled_list_uninitialized( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// Add sled to initialized rack + // + // TODO: In the future this should really be a PUT request, once we resolve + // https://github.com/oxidecomputer/omicron/issues/4494. It should also + // explicitly be tied to a rack via a `rack_id` path param. For now we assume + // we are only operating on single rack systems. + #[endpoint { + method = POST, + path = "/sleds/add", + }] + async fn sled_add( + rqctx: RequestContext, + sled: TypedBody, + ) -> Result, HttpError>; + + /// Mark a sled as expunged + /// + /// This is an irreversible process! It should only be called after + /// sufficient warning to the operator. + /// + /// This is idempotent, and it returns the old policy of the sled. 
+ #[endpoint { + method = POST, + path = "/sleds/expunge", + }] + async fn sled_expunge( + rqctx: RequestContext, + sled: TypedBody, + ) -> Result, HttpError>; + + /// Get all the probes associated with a given sled. + #[endpoint { + method = GET, + path = "/probes/{sled}" + }] + async fn probes_get( + rqctx: RequestContext, + path_params: Path, + query_params: Query, + ) -> Result>, HttpError>; +} + +/// Path parameters for Sled Agent requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct SledAgentPathParam { + pub sled_id: Uuid, +} + +/// Path parameters for Disk requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct DiskPathParam { + pub disk_id: Uuid, +} + +/// Path parameters for Volume requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct VolumePathParam { + pub volume_id: Uuid, +} + +/// Path parameters for Rack requests. +#[derive(Deserialize, JsonSchema)] +pub struct RackPathParam { + pub rack_id: Uuid, +} + +/// Path parameters for Switch requests. +#[derive(Deserialize, JsonSchema)] +pub struct SwitchPathParam { + pub switch_id: Uuid, +} + +/// Path parameters for Instance requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct InstancePathParam { + pub instance_id: Uuid, +} + +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] +pub struct CollectorIdPathParams { + /// The ID of the oximeter collector. + pub collector_id: Uuid, +} + +/// Path parameters for Upstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct UpstairsPathParam { + pub upstairs_id: TypedUuid, +} + +/// Path parameters for Upstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct UpstairsRepairPathParam { + pub upstairs_id: TypedUuid, + pub repair_id: TypedUuid, +} + +/// Path parameters for Downstairs requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct UpstairsDownstairsPathParam { + pub upstairs_id: TypedUuid, + pub downstairs_id: TypedUuid, +} + +/// Path parameters for Saga requests +#[derive(Deserialize, JsonSchema)] +pub struct SagaPathParam { + #[serde(rename = "saga_id")] + pub saga_id: Uuid, +} + +/// Path parameters for Background Task requests +#[derive(Deserialize, JsonSchema)] +pub struct BackgroundTaskPathParam { + pub bgtask_name: String, +} + +/// Query parameters for Background Task activation requests. 
+#[derive(Deserialize, JsonSchema)] +pub struct BackgroundTasksActivateRequest { + pub bgtask_names: BTreeSet, +} + +/// Path parameters for NAT ChangeSet +#[derive(Deserialize, JsonSchema)] +pub struct RpwNatPathParam { + /// which change number to start generating + /// the change set from + pub from_gen: i64, +} + +/// Query parameters for NAT ChangeSet +#[derive(Deserialize, JsonSchema)] +pub struct RpwNatQueryParam { + pub limit: u32, +} + +#[derive(Clone, Debug, Serialize, JsonSchema)] +pub struct SledId { + pub id: SledUuid, +} + +/// Path parameters for probes +#[derive(Deserialize, JsonSchema)] +pub struct ProbePathParam { + pub sled: Uuid, +} diff --git a/nexus/src/app/probe.rs b/nexus/src/app/probe.rs index 41ea4eece2..67673d8e00 100644 --- a/nexus/src/app/probe.rs +++ b/nexus/src/app/probe.rs @@ -1,9 +1,9 @@ use nexus_db_model::Probe; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::datastore::ProbeInfo; use nexus_db_queries::db::lookup; use nexus_types::external_api::params; +use nexus_types::external_api::shared::ProbeInfo; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::external::{ diff --git a/nexus/src/bin/nexus.rs b/nexus/src/bin/nexus.rs index 452e033ce6..33870b39e3 100644 --- a/nexus/src/bin/nexus.rs +++ b/nexus/src/bin/nexus.rs @@ -17,7 +17,6 @@ use nexus_config::NexusConfig; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; use omicron_nexus::run_openapi_external; -use omicron_nexus::run_openapi_internal; use omicron_nexus::run_server; #[derive(Debug, Parser)] @@ -27,19 +26,10 @@ struct Args { short = 'O', long = "openapi", help = "Print the external OpenAPI Spec document and exit", - conflicts_with = "openapi_internal", action )] openapi: bool, - #[clap( - short = 'I', - long = "openapi-internal", - help = "Print the internal OpenAPI Spec document and exit", - action - )] - openapi_internal: bool, - #[clap(name = "CONFIG_FILE_PATH", action)] config_file_path: Option, } @@ -56,8 +46,6 @@ async fn do_run() -> Result<(), CmdError> { if args.openapi { run_openapi_external().map_err(|err| CmdError::Failure(anyhow!(err))) - } else if args.openapi_internal { - run_openapi_internal().map_err(|err| CmdError::Failure(anyhow!(err))) } else { let config_path = match args.config_file_path { Some(path) => path, diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index 1f185ae820..d23f0d035a 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -34,13 +34,13 @@ use dropshot::{ApiDescription, StreamingBody}; use dropshot::{ApiDescriptionRegisterError, HttpError}; use dropshot::{ApiEndpoint, EmptyScanParams}; use ipnetwork::IpNetwork; +use nexus_db_queries::authz; use nexus_db_queries::db; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::ImageLookup; use nexus_db_queries::db::lookup::ImageParentLookup; use nexus_db_queries::db::model::Name; -use nexus_db_queries::{authz, db::datastore::ProbeInfo}; -use nexus_types::external_api::shared::BfdStatus; +use nexus_types::external_api::shared::{BfdStatus, ProbeInfo}; use omicron_common::api::external::http_pagination::marker_for_name; use omicron_common::api::external::http_pagination::marker_for_name_or_id; use omicron_common::api::external::http_pagination::name_or_id_pagination; @@ -7023,7 +7023,7 @@ async fn probe_list( probes, &|_, p: &ProbeInfo| match paginated_by { 
PaginatedBy::Id(_) => NameOrId::Id(p.id), - PaginatedBy::Name(_) => NameOrId::Name(p.name.clone().into()), + PaginatedBy::Name(_) => NameOrId::Name(p.name.clone()), }, )?)) }; diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 8e7b39c111..f324ea787d 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -6,9 +6,7 @@ use super::params::{OximeterInfo, RackInitializationRequest}; use crate::context::ApiContext; -use dropshot::endpoint; use dropshot::ApiDescription; -use dropshot::ApiDescriptionRegisterError; use dropshot::FreeformBody; use dropshot::HttpError; use dropshot::HttpResponseCreated; @@ -21,14 +19,14 @@ use dropshot::RequestContext; use dropshot::ResultsPage; use dropshot::TypedBody; use hyper::Body; -use nexus_db_model::Ipv4NatEntryView; -use nexus_db_queries::db::datastore::ProbeInfo; +use nexus_internal_api::*; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; use nexus_types::external_api::params::SledSelector; use nexus_types::external_api::params::UninitializedSledId; +use nexus_types::external_api::shared::ProbeInfo; use nexus_types::external_api::shared::UninitializedSled; use nexus_types::external_api::views::SledPolicy; use nexus_types::internal_api::params::SledAgentInfo; @@ -36,6 +34,7 @@ use nexus_types::internal_api::params::SwitchPutRequest; use nexus_types::internal_api::params::SwitchPutResponse; use nexus_types::internal_api::views::to_list; use nexus_types::internal_api::views::BackgroundTask; +use nexus_types::internal_api::views::Ipv4NatEntryView; use nexus_types::internal_api::views::Saga; use omicron_common::api::external::http_pagination::data_page_params_for; use omicron_common::api::external::http_pagination::PaginatedById; @@ -51,1067 +50,805 @@ use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::update::ArtifactId; -use omicron_uuid_kinds::DownstairsKind; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; -use omicron_uuid_kinds::SledUuid; -use omicron_uuid_kinds::TypedUuid; -use omicron_uuid_kinds::UpstairsKind; -use omicron_uuid_kinds::UpstairsRepairKind; -use schemars::JsonSchema; -use serde::Deserialize; -use serde::Serialize; use std::collections::BTreeMap; -use std::collections::BTreeSet; -use uuid::Uuid; type NexusApiDescription = ApiDescription; /// Returns a description of the internal nexus API pub(crate) fn internal_api() -> NexusApiDescription { - fn register_endpoints( - api: &mut NexusApiDescription, - ) -> Result<(), ApiDescriptionRegisterError> { - api.register(sled_agent_get)?; - api.register(sled_agent_put)?; - api.register(sled_firewall_rules_request)?; - api.register(switch_put)?; - api.register(rack_initialization_complete)?; - api.register(cpapi_instances_put)?; - api.register(cpapi_disks_put)?; - api.register(cpapi_volume_remove_read_only_parent)?; - api.register(cpapi_disk_remove_read_only_parent)?; - api.register(cpapi_producers_post)?; - api.register(cpapi_assigned_producers_list)?; - api.register(cpapi_collectors_post)?; - api.register(cpapi_artifact_download)?; - - api.register(cpapi_upstairs_repair_start)?; - api.register(cpapi_upstairs_repair_finish)?; - api.register(cpapi_upstairs_repair_progress)?; - 
api.register(cpapi_downstairs_client_stop_request)?; - api.register(cpapi_downstairs_client_stopped)?; - - api.register(saga_list)?; - api.register(saga_view)?; - - api.register(ipv4_nat_changeset)?; - - api.register(bgtask_list)?; - api.register(bgtask_view)?; - api.register(bgtask_activate)?; - - api.register(blueprint_list)?; - api.register(blueprint_view)?; - api.register(blueprint_delete)?; - api.register(blueprint_target_view)?; - api.register(blueprint_target_set)?; - api.register(blueprint_target_set_enabled)?; - api.register(blueprint_regenerate)?; - api.register(blueprint_import)?; + nexus_internal_api_mod::api_description::() + .expect("registered API endpoints successfully") +} - api.register(sled_list_uninitialized)?; - api.register(sled_add)?; - api.register(sled_expunge)?; +enum NexusInternalApiImpl {} - api.register(probes_get)?; +impl NexusInternalApi for NexusInternalApiImpl { + type Context = ApiContext; - Ok(()) + async fn sled_agent_get( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let path = path_params.into_inner(); + let sled_id = &path.sled_id; + let handler = async { + let (.., sled) = + nexus.sled_lookup(&opctx, sled_id)?.fetch().await?; + Ok(HttpResponseOk(sled.into())) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await } - let mut api = NexusApiDescription::new(); - if let Err(err) = register_endpoints(&mut api) { - panic!("failed to register entrypoints: {}", err); + async fn sled_agent_put( + rqctx: RequestContext, + path_params: Path, + sled_info: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let path = path_params.into_inner(); + let info = sled_info.into_inner(); + let sled_id = &path.sled_id; + let handler = async { + nexus.upsert_sled(&opctx, *sled_id, info).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await } - api -} - -/// Path parameters for Sled Agent requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct SledAgentPathParam { - sled_id: Uuid, -} - -/// Return information about the given sled agent -#[endpoint { - method = GET, - path = "/sled-agents/{sled_id}", - }] -async fn sled_agent_get( - rqctx: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let path = path_params.into_inner(); - let sled_id = &path.sled_id; - let handler = async { - let (.., sled) = nexus.sled_lookup(&opctx, sled_id)?.fetch().await?; - Ok(HttpResponseOk(sled.into())) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Report that the sled agent for the specified sled has come online. 
-#[endpoint { - method = POST, - path = "/sled-agents/{sled_id}", - }] -async fn sled_agent_put( - rqctx: RequestContext, - path_params: Path, - sled_info: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let path = path_params.into_inner(); - let info = sled_info.into_inner(); - let sled_id = &path.sled_id; - let handler = async { - nexus.upsert_sled(&opctx, *sled_id, info).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Request a new set of firewall rules for a sled. -/// -/// This causes Nexus to read the latest set of rules for the sled, -/// and call a Sled endpoint which applies the rules to all OPTE ports -/// that happen to exist. -#[endpoint { - method = POST, - path = "/sled-agents/{sled_id}/firewall-rules-update", - }] -async fn sled_firewall_rules_request( - rqctx: RequestContext, - path_params: Path, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let path = path_params.into_inner(); - let sled_id = &path.sled_id; - let handler = async { - nexus.sled_request_firewall_rules(&opctx, *sled_id).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Path parameters for Rack requests. -#[derive(Deserialize, JsonSchema)] -struct RackPathParam { - rack_id: Uuid, -} - -/// Report that the Rack Setup Service initialization is complete -/// -/// See RFD 278 for more details. -#[endpoint { - method = PUT, - path = "/racks/{rack_id}/initialization-complete", - }] -async fn rack_initialization_complete( - rqctx: RequestContext, - path_params: Path, - info: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - let request = info.into_inner(); - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - - nexus.rack_initialize(&opctx, path.rack_id, request).await?; - - Ok(HttpResponseUpdatedNoContent()) -} -/// Path parameters for Switch requests. 
-#[derive(Deserialize, JsonSchema)] -struct SwitchPathParam { - switch_id: Uuid, -} + async fn sled_firewall_rules_request( + rqctx: RequestContext, + path_params: Path, + ) -> Result { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + let path = path_params.into_inner(); + let sled_id = &path.sled_id; + let handler = async { + nexus.sled_request_firewall_rules(&opctx, *sled_id).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -#[endpoint { - method = PUT, - path = "/switch/{switch_id}", -}] -async fn switch_put( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { + async fn rack_initialization_complete( + rqctx: RequestContext, + path_params: Path, + info: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; let nexus = &apictx.nexus; let path = path_params.into_inner(); - let switch = body.into_inner(); - nexus.switch_upsert(path.switch_id, switch).await?; - Ok(HttpResponseOk(SwitchPutResponse {})) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + let request = info.into_inner(); + let opctx = crate::context::op_context_for_internal_api(&rqctx).await; -/// Path parameters for Instance requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct InstancePathParam { - instance_id: Uuid, -} + nexus.rack_initialize(&opctx, path.rack_id, request).await?; -/// Report updated state for an instance. -#[endpoint { - method = PUT, - path = "/instances/{instance_id}", - }] -async fn cpapi_instances_put( - rqctx: RequestContext, - path_params: Path, - new_runtime_state: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - let new_state = new_runtime_state.into_inner(); - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let handler = async { - nexus - .notify_instance_updated( - &opctx, - &InstanceUuid::from_untyped_uuid(path.instance_id), - &new_state, - ) - .await?; Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + } -/// Path parameters for Disk requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct DiskPathParam { - disk_id: Uuid, -} + async fn switch_put( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let switch = body.into_inner(); + nexus.switch_upsert(path.switch_id, switch).await?; + Ok(HttpResponseOk(SwitchPutResponse {})) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Report updated state for a disk. 
-#[endpoint { - method = PUT, - path = "/disks/{disk_id}", - }] -async fn cpapi_disks_put( - rqctx: RequestContext, - path_params: Path, - new_runtime_state: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - let new_state = new_runtime_state.into_inner(); - let handler = async { + async fn cpapi_instances_put( + rqctx: RequestContext, + path_params: Path, + new_runtime_state: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let new_state = new_runtime_state.into_inner(); let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus.notify_disk_updated(&opctx, path.disk_id, &new_state).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Path parameters for Volume requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct VolumePathParam { - volume_id: Uuid, -} + let handler = async { + nexus + .notify_instance_updated( + &opctx, + &InstanceUuid::from_untyped_uuid(path.instance_id), + &new_state, + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Request removal of a read_only_parent from a volume -/// A volume can be created with the source data for that volume being another -/// volume that attached as a "read_only_parent". In the background there -/// exists a scrubber that will copy the data from the read_only_parent -/// into the volume. When that scrubber has completed copying the data, this -/// endpoint can be called to update the database that the read_only_parent -/// is no longer needed for a volume and future attachments of this volume -/// should not include that read_only_parent. -#[endpoint { - method = POST, - path = "/volume/{volume_id}/remove-read-only-parent", - }] -async fn cpapi_volume_remove_read_only_parent( - rqctx: RequestContext, - path_params: Path, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); + async fn cpapi_disks_put( + rqctx: RequestContext, + path_params: Path, + new_runtime_state: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let new_state = new_runtime_state.into_inner(); + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus.notify_disk_updated(&opctx, path.disk_id, &new_state).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus.volume_remove_read_only_parent(&opctx, path.volume_id).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn cpapi_volume_remove_read_only_parent( + rqctx: RequestContext, + path_params: Path, + ) -> Result { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); -/// Request removal of a read_only_parent from a disk -/// This is a thin wrapper around the volume_remove_read_only_parent saga. 
-/// All we are doing here is, given a disk UUID, figure out what the -/// volume_id is for that disk, then use that to call the -/// volume_remove_read_only_parent saga on it. -#[endpoint { - method = POST, - path = "/disk/{disk_id}/remove-read-only-parent", - }] -async fn cpapi_disk_remove_read_only_parent( - rqctx: RequestContext, - path_params: Path, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .volume_remove_read_only_parent(&opctx, path.volume_id) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus.disk_remove_read_only_parent(&opctx, path.disk_id).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn cpapi_disk_remove_read_only_parent( + rqctx: RequestContext, + path_params: Path, + ) -> Result { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); -/// Accept a registration from a new metric producer -#[endpoint { - method = POST, - path = "/metrics/producers", - }] -async fn cpapi_producers_post( - request_context: RequestContext, - producer_info: TypedBody, -) -> Result, HttpError> { - let context = &request_context.context().context; - let handler = async { - let nexus = &context.nexus; - let producer_info = producer_info.into_inner(); - let opctx = - crate::context::op_context_for_internal_api(&request_context).await; - nexus - .assign_producer(&opctx, producer_info) + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus.disk_remove_read_only_parent(&opctx, path.disk_id).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) .await - .map_err(HttpError::from) - .map(|_| { - HttpResponseCreated(ProducerRegistrationResponse { - lease_duration: - crate::app::oximeter::PRODUCER_LEASE_DURATION, + } + + async fn cpapi_producers_post( + request_context: RequestContext, + producer_info: TypedBody, + ) -> Result, HttpError> + { + let context = &request_context.context().context; + let handler = async { + let nexus = &context.nexus; + let producer_info = producer_info.into_inner(); + let opctx = + crate::context::op_context_for_internal_api(&request_context) + .await; + nexus + .assign_producer(&opctx, producer_info) + .await + .map_err(HttpError::from) + .map(|_| { + HttpResponseCreated(ProducerRegistrationResponse { + lease_duration: + crate::app::oximeter::PRODUCER_LEASE_DURATION, + }) }) - }) - }; - context - .internal_latencies - .instrument_dropshot_handler(&request_context, handler) - .await -} + }; + context + .internal_latencies + .instrument_dropshot_handler(&request_context, handler) + .await + } -#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] -pub struct CollectorIdPathParams { - /// The ID of the oximeter collector. 
- pub collector_id: Uuid, -} + async fn cpapi_assigned_producers_list( + request_context: RequestContext, + path_params: Path, + query_params: Query, + ) -> Result>, HttpError> { + let context = &request_context.context().context; + let handler = async { + let nexus = &context.nexus; + let collector_id = path_params.into_inner().collector_id; + let query = query_params.into_inner(); + let pagparams = data_page_params_for(&request_context, &query)?; + let opctx = + crate::context::op_context_for_internal_api(&request_context) + .await; + let producers = nexus + .list_assigned_producers(&opctx, collector_id, &pagparams) + .await?; + Ok(HttpResponseOk(ScanById::results_page( + &query, + producers, + &|_, producer: &ProducerEndpoint| producer.id, + )?)) + }; + context + .internal_latencies + .instrument_dropshot_handler(&request_context, handler) + .await + } -/// List all metric producers assigned to an oximeter collector. -#[endpoint { - method = GET, - path = "/metrics/collectors/{collector_id}/producers", - }] -async fn cpapi_assigned_producers_list( - request_context: RequestContext, - path_params: Path, - query_params: Query, -) -> Result>, HttpError> { - let context = &request_context.context().context; - let handler = async { - let nexus = &context.nexus; - let collector_id = path_params.into_inner().collector_id; - let query = query_params.into_inner(); - let pagparams = data_page_params_for(&request_context, &query)?; - let opctx = - crate::context::op_context_for_internal_api(&request_context).await; - let producers = nexus - .list_assigned_producers(&opctx, collector_id, &pagparams) - .await?; - Ok(HttpResponseOk(ScanById::results_page( - &query, - producers, - &|_, producer: &ProducerEndpoint| producer.id, - )?)) - }; - context - .internal_latencies - .instrument_dropshot_handler(&request_context, handler) - .await -} + async fn cpapi_collectors_post( + request_context: RequestContext, + oximeter_info: TypedBody, + ) -> Result { + let context = &request_context.context().context; + let handler = async { + let nexus = &context.nexus; + let oximeter_info = oximeter_info.into_inner(); + let opctx = + crate::context::op_context_for_internal_api(&request_context) + .await; + nexus.upsert_oximeter_collector(&opctx, &oximeter_info).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + context + .internal_latencies + .instrument_dropshot_handler(&request_context, handler) + .await + } -/// Accept a notification of a new oximeter collection server. -#[endpoint { - method = POST, - path = "/metrics/collectors", - }] -async fn cpapi_collectors_post( - request_context: RequestContext, - oximeter_info: TypedBody, -) -> Result { - let context = &request_context.context().context; - let handler = async { + async fn cpapi_artifact_download( + request_context: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let context = &request_context.context().context; let nexus = &context.nexus; - let oximeter_info = oximeter_info.into_inner(); let opctx = crate::context::op_context_for_internal_api(&request_context).await; - nexus.upsert_oximeter_collector(&opctx, &oximeter_info).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - context - .internal_latencies - .instrument_dropshot_handler(&request_context, handler) - .await -} - -/// Endpoint used by Sled Agents to download cached artifacts. 
-#[endpoint { - method = GET, - path = "/artifacts/{kind}/{name}/{version}", -}] -async fn cpapi_artifact_download( - request_context: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let context = &request_context.context().context; - let nexus = &context.nexus; - let opctx = - crate::context::op_context_for_internal_api(&request_context).await; - // TODO: return 404 if the error we get here says that the record isn't found - let body = nexus - .updates_download_artifact(&opctx, path_params.into_inner()) - .await?; - - Ok(HttpResponseOk(Body::from(body).into())) -} - -/// Path parameters for Upstairs requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct UpstairsPathParam { - upstairs_id: TypedUuid, -} - -/// An Upstairs will notify this endpoint when a repair starts -#[endpoint { - method = POST, - path = "/crucible/0/upstairs/{upstairs_id}/repair-start", - }] -async fn cpapi_upstairs_repair_start( - rqctx: RequestContext, - path_params: Path, - repair_start_info: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus - .upstairs_repair_start( - &opctx, - path.upstairs_id, - repair_start_info.into_inner(), - ) - .await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// An Upstairs will notify this endpoint when a repair finishes. -#[endpoint { - method = POST, - path = "/crucible/0/upstairs/{upstairs_id}/repair-finish", - }] -async fn cpapi_upstairs_repair_finish( - rqctx: RequestContext, - path_params: Path, - repair_finish_info: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus - .upstairs_repair_finish( - &opctx, - path.upstairs_id, - repair_finish_info.into_inner(), - ) - .await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Path parameters for Upstairs requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct UpstairsRepairPathParam { - upstairs_id: TypedUuid, - repair_id: TypedUuid, -} - -/// An Upstairs will update this endpoint with the progress of a repair -#[endpoint { - method = POST, - path = "/crucible/0/upstairs/{upstairs_id}/repair/{repair_id}/progress", - }] -async fn cpapi_upstairs_repair_progress( - rqctx: RequestContext, - path_params: Path, - repair_progress: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus - .upstairs_repair_progress( - &opctx, - path.upstairs_id, - path.repair_id, - repair_progress.into_inner(), - ) + // TODO: return 404 if the error we get here says that the record isn't found + let body = nexus + .updates_download_artifact(&opctx, path_params.into_inner()) .await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} -/// Path parameters for Downstairs requests (internal API) -#[derive(Deserialize, JsonSchema)] -struct UpstairsDownstairsPathParam { - upstairs_id: TypedUuid, - downstairs_id: TypedUuid, -} - -/// 
An Upstairs will update this endpoint if a Downstairs client task is -/// requested to stop -#[endpoint { - method = POST, - path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stop-request", - }] -async fn cpapi_downstairs_client_stop_request( - rqctx: RequestContext, - path_params: Path, - downstairs_client_stop_request: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus - .downstairs_client_stop_request_notification( - &opctx, - path.upstairs_id, - path.downstairs_id, - downstairs_client_stop_request.into_inner(), - ) - .await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// An Upstairs will update this endpoint if a Downstairs client task stops for -/// any reason (not just after being requested to) -#[endpoint { - method = POST, - path = "/crucible/0/upstairs/{upstairs_id}/downstairs/{downstairs_id}/stopped", - }] -async fn cpapi_downstairs_client_stopped( - rqctx: RequestContext, - path_params: Path, - downstairs_client_stopped: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - nexus - .downstairs_client_stopped_notification( - &opctx, - path.upstairs_id, - path.downstairs_id, - downstairs_client_stopped.into_inner(), - ) - .await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -// Sagas + Ok(HttpResponseOk(Body::from(body).into())) + } -/// List sagas -#[endpoint { - method = GET, - path = "/sagas", -}] -async fn saga_list( - rqctx: RequestContext, - query_params: Query, -) -> Result>, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { + async fn cpapi_upstairs_repair_start( + rqctx: RequestContext, + path_params: Path, + repair_start_info: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; let nexus = &apictx.nexus; - let query = query_params.into_inner(); - let pagparams = data_page_params_for(&rqctx, &query)?; - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let saga_stream = nexus.sagas_list(&opctx, &pagparams).await?; - let view_list = to_list(saga_stream).await; - Ok(HttpResponseOk(ScanById::results_page( - &query, - view_list, - &|_, saga: &Saga| saga.id, - )?)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + let path = path_params.into_inner(); -/// Path parameters for Saga requests -#[derive(Deserialize, JsonSchema)] -struct SagaPathParam { - saga_id: Uuid, -} + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .upstairs_repair_start( + &opctx, + path.upstairs_id, + repair_start_info.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Fetch a saga -#[endpoint { - method = GET, - path = "/sagas/{saga_id}", -}] -async fn saga_view( - rqctx: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + async fn 
cpapi_upstairs_repair_finish( + rqctx: RequestContext, + path_params: Path, + repair_finish_info: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; let nexus = &apictx.nexus; let path = path_params.into_inner(); - let saga = nexus.saga_get(&opctx, path.saga_id).await?; - Ok(HttpResponseOk(saga)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} -// Background Tasks + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .upstairs_repair_finish( + &opctx, + path.upstairs_id, + repair_finish_info.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// List background tasks -/// -/// This is a list of discrete background activities that Nexus carries out. -/// This is exposed for support and debugging. -#[endpoint { - method = GET, - path = "/bgtasks", -}] -async fn bgtask_list( - rqctx: RequestContext, -) -> Result>, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { + async fn cpapi_upstairs_repair_progress( + rqctx: RequestContext, + path_params: Path, + repair_progress: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; let nexus = &apictx.nexus; - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let bgtask_list = nexus.bgtasks_list(&opctx).await?; - Ok(HttpResponseOk(bgtask_list)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Path parameters for Background Task requests -#[derive(Deserialize, JsonSchema)] -struct BackgroundTaskPathParam { - bgtask_name: String, -} + let path = path_params.into_inner(); -/// Query parameters for Background Task activation requests. -#[derive(Deserialize, JsonSchema)] -struct BackgroundTasksActivateRequest { - bgtask_names: BTreeSet, -} + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .upstairs_repair_progress( + &opctx, + path.upstairs_id, + path.repair_id, + repair_progress.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Fetch status of one background task -/// -/// This is exposed for support and debugging. -#[endpoint { - method = GET, - path = "/bgtasks/view/{bgtask_name}", -}] -async fn bgtask_view( - rqctx: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + async fn cpapi_downstairs_client_stop_request( + rqctx: RequestContext, + path_params: Path, + downstairs_client_stop_request: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; let nexus = &apictx.nexus; let path = path_params.into_inner(); - let bgtask = nexus.bgtask_status(&opctx, &path.bgtask_name).await?; - Ok(HttpResponseOk(bgtask)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -/// Activates one or more background tasks, causing them to be run immediately -/// if idle, or scheduled to run again as soon as possible if already running. 
-#[endpoint { - method = POST, - path = "/bgtasks/activate", -}] -async fn bgtask_activate( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let body = body.into_inner(); - nexus.bgtask_activate(&opctx, body.bgtask_names).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} - -// NAT RPW internal APIs -/// Path parameters for NAT ChangeSet -#[derive(Deserialize, JsonSchema)] -struct RpwNatPathParam { - /// which change number to start generating - /// the change set from - from_gen: i64, -} - -/// Query parameters for NAT ChangeSet -#[derive(Deserialize, JsonSchema)] -struct RpwNatQueryParam { - limit: u32, -} + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .downstairs_client_stop_request_notification( + &opctx, + path.upstairs_id, + path.downstairs_id, + downstairs_client_stop_request.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Fetch NAT ChangeSet -/// -/// Caller provides their generation as `from_gen`, along with a query -/// parameter for the page size (`limit`). Endpoint will return changes -/// that have occured since the caller's generation number up to the latest -/// change or until the `limit` is reached. If there are no changes, an -/// empty vec is returned. -#[endpoint { - method = GET, - path = "/nat/ipv4/changeset/{from_gen}" -}] -async fn ipv4_nat_changeset( - rqctx: RequestContext, - path_params: Path, - query_params: Query, -) -> Result>, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; + async fn cpapi_downstairs_client_stopped( + rqctx: RequestContext, + path_params: Path, + downstairs_client_stopped: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; let nexus = &apictx.nexus; let path = path_params.into_inner(); - let query = query_params.into_inner(); - let mut changeset = nexus - .datastore() - .ipv4_nat_changeset(&opctx, path.from_gen, query.limit) - .await?; - changeset.sort_by_key(|e| e.gen); - Ok(HttpResponseOk(changeset)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} -// APIs for managing blueprints -// -// These are not (yet) intended for use by any other programs. Eventually, we -// will want this functionality part of the public API. But we don't want to -// commit to any of this yet. These properly belong in an RFD 399-style -// "Service and Support API". Absent that, we stick them here. 
+ let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus + .downstairs_client_stopped_notification( + &opctx, + path.upstairs_id, + path.downstairs_id, + downstairs_client_stopped.into_inner(), + ) + .await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Lists blueprints -#[endpoint { - method = GET, - path = "/deployment/blueprints/all", -}] -async fn blueprint_list( - rqctx: RequestContext, - query_params: Query, -) -> Result>, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let nexus = &apictx.nexus; - let query = query_params.into_inner(); - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let pagparams = data_page_params_for(&rqctx, &query)?; - let blueprints = nexus.blueprint_list(&opctx, &pagparams).await?; - Ok(HttpResponseOk(ScanById::results_page( - &query, - blueprints, - &|_, blueprint: &BlueprintMetadata| blueprint.id, - )?)) - }; + // Sagas + + async fn saga_list( + rqctx: RequestContext, + query_params: Query, + ) -> Result>, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let query = query_params.into_inner(); + let pagparams = data_page_params_for(&rqctx, &query)?; + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let saga_stream = nexus.sagas_list(&opctx, &pagparams).await?; + let view_list = to_list(saga_stream).await; + Ok(HttpResponseOk(ScanById::results_page( + &query, + view_list, + &|_, saga: &Saga| saga.id, + )?)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn saga_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let saga = nexus.saga_get(&opctx, path.saga_id).await?; + Ok(HttpResponseOk(saga)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Fetches one blueprint -#[endpoint { - method = GET, - path = "/deployment/blueprints/all/{blueprint_id}", -}] -async fn blueprint_view( - rqctx: RequestContext, - path_params: Path, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - let blueprint = nexus.blueprint_view(&opctx, path.blueprint_id).await?; - Ok(HttpResponseOk(blueprint)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + // Background Tasks + + async fn bgtask_list( + rqctx: RequestContext, + ) -> Result>, HttpError> + { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let bgtask_list = nexus.bgtasks_list(&opctx).await?; + Ok(HttpResponseOk(bgtask_list)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Deletes one blueprint -#[endpoint { - method = DELETE, - path = "/deployment/blueprints/all/{blueprint_id}", -}] -async fn blueprint_delete( - rqctx: 
RequestContext, - path_params: Path, -) -> Result { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let path = path_params.into_inner(); - nexus.blueprint_delete(&opctx, path.blueprint_id).await?; - Ok(HttpResponseDeleted()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn bgtask_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let bgtask = nexus.bgtask_status(&opctx, &path.bgtask_name).await?; + Ok(HttpResponseOk(bgtask)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -// Managing the current target blueprint + async fn bgtask_activate( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let body = body.into_inner(); + nexus.bgtask_activate(&opctx, body.bgtask_names).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Fetches the current target blueprint, if any -#[endpoint { - method = GET, - path = "/deployment/blueprints/target", -}] -async fn blueprint_target_view( - rqctx: RequestContext, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let target = nexus.blueprint_target_view(&opctx).await?; - Ok(HttpResponseOk(target)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + // NAT RPW internal APIs + + async fn ipv4_nat_changeset( + rqctx: RequestContext, + path_params: Path, + query_params: Query, + ) -> Result>, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let query = query_params.into_inner(); + let mut changeset = nexus + .datastore() + .ipv4_nat_changeset(&opctx, path.from_gen, query.limit) + .await?; + changeset.sort_by_key(|e| e.gen); + Ok(HttpResponseOk(changeset)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Make the specified blueprint the new target -#[endpoint { - method = POST, - path = "/deployment/blueprints/target", -}] -async fn blueprint_target_set( - rqctx: RequestContext, - target: TypedBody, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let target = target.into_inner(); - let target = nexus.blueprint_target_set(&opctx, target).await?; - Ok(HttpResponseOk(target)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + // APIs for managing blueprints + async fn blueprint_list( + rqctx: RequestContext, + query_params: Query, + ) -> Result>, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let 
query = query_params.into_inner(); + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let pagparams = data_page_params_for(&rqctx, &query)?; + let blueprints = nexus.blueprint_list(&opctx, &pagparams).await?; + Ok(HttpResponseOk(ScanById::results_page( + &query, + blueprints, + &|_, blueprint: &BlueprintMetadata| blueprint.id, + )?)) + }; + + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Set the `enabled` field of the current target blueprint -#[endpoint { - method = PUT, - path = "/deployment/blueprints/target/enabled", -}] -async fn blueprint_target_set_enabled( - rqctx: RequestContext, - target: TypedBody, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let target = target.into_inner(); - let target = nexus.blueprint_target_set_enabled(&opctx, target).await?; - Ok(HttpResponseOk(target)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + /// Fetches one blueprint + async fn blueprint_view( + rqctx: RequestContext, + path_params: Path, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + let blueprint = + nexus.blueprint_view(&opctx, path.blueprint_id).await?; + Ok(HttpResponseOk(blueprint)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -// Generating blueprints + /// Deletes one blueprint + async fn blueprint_delete( + rqctx: RequestContext, + path_params: Path, + ) -> Result { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let path = path_params.into_inner(); + nexus.blueprint_delete(&opctx, path.blueprint_id).await?; + Ok(HttpResponseDeleted()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Generates a new blueprint for the current system, re-evaluating anything -/// that's changed since the last one was generated -#[endpoint { - method = POST, - path = "/deployment/blueprints/regenerate", -}] -async fn blueprint_regenerate( - rqctx: RequestContext, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let result = nexus.blueprint_create_regenerate(&opctx).await?; - Ok(HttpResponseOk(result)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn blueprint_target_view( + rqctx: RequestContext, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let target = nexus.blueprint_target_view(&opctx).await?; + Ok(HttpResponseOk(target)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Imports a client-provided blueprint -/// -/// This is intended for development and support, not end users or operators. 
-#[endpoint { - method = POST, - path = "/deployment/blueprints/import", -}] -async fn blueprint_import( - rqctx: RequestContext, - blueprint: TypedBody, -) -> Result { - let apictx = &rqctx.context().context; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let nexus = &apictx.nexus; - let blueprint = blueprint.into_inner(); - nexus.blueprint_import(&opctx, blueprint).await?; - Ok(HttpResponseUpdatedNoContent()) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn blueprint_target_set( + rqctx: RequestContext, + target: TypedBody, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let target = target.into_inner(); + let target = nexus.blueprint_target_set(&opctx, target).await?; + Ok(HttpResponseOk(target)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// List uninitialized sleds -#[endpoint { - method = GET, - path = "/sleds/uninitialized", -}] -async fn sled_list_uninitialized( - rqctx: RequestContext, -) -> Result>, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let nexus = &apictx.nexus; - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let sleds = nexus.sled_list_uninitialized(&opctx).await?; - Ok(HttpResponseOk(ResultsPage { items: sleds, next_page: None })) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn blueprint_target_set_enabled( + rqctx: RequestContext, + target: TypedBody, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let target = target.into_inner(); + let target = + nexus.blueprint_target_set_enabled(&opctx, target).await?; + Ok(HttpResponseOk(target)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -#[derive(Clone, Debug, Serialize, JsonSchema)] -pub struct SledId { - pub id: SledUuid, -} + async fn blueprint_regenerate( + rqctx: RequestContext, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let result = nexus.blueprint_create_regenerate(&opctx).await?; + Ok(HttpResponseOk(result)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Add sled to initialized rack -// -// TODO: In the future this should really be a PUT request, once we resolve -// https://github.com/oxidecomputer/omicron/issues/4494. It should also -// explicitly be tied to a rack via a `rack_id` path param. For now we assume -// we are only operating on single rack systems. 
-#[endpoint { - method = POST, - path = "/sleds/add", -}] -async fn sled_add( - rqctx: RequestContext, - sled: TypedBody, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let id = nexus.sled_add(&opctx, sled.into_inner()).await?; - Ok(HttpResponseCreated(SledId { id })) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn blueprint_import( + rqctx: RequestContext, + blueprint: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let nexus = &apictx.nexus; + let blueprint = blueprint.into_inner(); + nexus.blueprint_import(&opctx, blueprint).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Mark a sled as expunged -/// -/// This is an irreversible process! It should only be called after -/// sufficient warning to the operator. -/// -/// This is idempotent, and it returns the old policy of the sled. -#[endpoint { - method = POST, - path = "/sleds/expunge", -}] -async fn sled_expunge( - rqctx: RequestContext, - sled: TypedBody, -) -> Result, HttpError> { - let apictx = &rqctx.context().context; - let nexus = &apictx.nexus; - let handler = async { - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let previous_policy = - nexus.sled_expunge(&opctx, sled.into_inner().sled).await?; - Ok(HttpResponseOk(previous_policy)) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await -} + async fn sled_list_uninitialized( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let nexus = &apictx.nexus; + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let sleds = nexus.sled_list_uninitialized(&opctx).await?; + Ok(HttpResponseOk(ResultsPage { items: sleds, next_page: None })) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Path parameters for probes -#[derive(Deserialize, JsonSchema)] -struct ProbePathParam { - sled: Uuid, -} + async fn sled_add( + rqctx: RequestContext, + sled: TypedBody, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let id = nexus.sled_add(&opctx, sled.into_inner()).await?; + Ok(HttpResponseCreated(SledId { id })) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } -/// Get all the probes associated with a given sled. 
-#[endpoint { - method = GET, - path = "/probes/{sled}" -}] -async fn probes_get( - rqctx: RequestContext, - path_params: Path, - query_params: Query, -) -> Result>, HttpError> { - let apictx = &rqctx.context().context; - let handler = async { - let query = query_params.into_inner(); - let path = path_params.into_inner(); + async fn sled_expunge( + rqctx: RequestContext, + sled: TypedBody, + ) -> Result, HttpError> { + let apictx = &rqctx.context().context; let nexus = &apictx.nexus; - let opctx = crate::context::op_context_for_internal_api(&rqctx).await; - let pagparams = data_page_params_for(&rqctx, &query)?; - Ok(HttpResponseOk( - nexus.probe_list_for_sled(&opctx, &pagparams, path.sled).await?, - )) - }; - apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let previous_policy = + nexus.sled_expunge(&opctx, sled.into_inner().sled).await?; + Ok(HttpResponseOk(previous_policy)) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + + async fn probes_get( + rqctx: RequestContext, + path_params: Path, + query_params: Query, + ) -> Result>, HttpError> { + let apictx = &rqctx.context().context; + let handler = async { + let query = query_params.into_inner(); + let path = path_params.into_inner(); + let nexus = &apictx.nexus; + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + let pagparams = data_page_params_for(&rqctx, &query)?; + Ok(HttpResponseOk( + nexus + .probe_list_for_sled(&opctx, &pagparams, path.sled) + .await?, + )) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } } diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index e48ec83d98..a359ead038 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -65,16 +65,6 @@ pub fn run_openapi_external() -> Result<(), String> { .map_err(|e| e.to_string()) } -pub fn run_openapi_internal() -> Result<(), String> { - internal_api() - .openapi("Nexus internal API", "0.0.1") - .description("Nexus internal API") - .contact_url("https://oxide.computer") - .contact_email("api@oxide.computer") - .write(&mut std::io::stdout()) - .map_err(|e| e.to_string()) -} - /// A partially-initialized Nexus server, which exposes an internal interface, /// but is not ready to receive external requests. pub struct InternalServer { diff --git a/nexus/tests/integration_tests/commands.rs b/nexus/tests/integration_tests/commands.rs index 1a6e717345..3e133e8681 100644 --- a/nexus/tests/integration_tests/commands.rs +++ b/nexus/tests/integration_tests/commands.rs @@ -180,19 +180,3 @@ fn test_nexus_openapi() { // renaming, or changing the tags are what you intend. assert_contents("tests/output/nexus_tags.txt", &tags); } - -#[test] -fn test_nexus_openapi_internal() { - let (stdout_text, _) = run_command_with_arg("--openapi-internal"); - let spec: OpenAPI = serde_json::from_str(&stdout_text) - .expect("stdout was not valid OpenAPI"); - - // Check for lint errors. - let errors = openapi_lint::validate(&spec); - assert!(errors.is_empty(), "{}", errors.join("\n\n")); - - // Confirm that the output hasn't changed. It's expected that we'll change - // this file as the API evolves, but pay attention to the diffs to ensure - // that the changes match your expectations. 
- assert_contents("../openapi/nexus-internal.json", &stdout_text); -} diff --git a/nexus/tests/integration_tests/probe.rs b/nexus/tests/integration_tests/probe.rs index 71a695bf8c..53ad6a3ef9 100644 --- a/nexus/tests/integration_tests/probe.rs +++ b/nexus/tests/integration_tests/probe.rs @@ -1,13 +1,12 @@ use dropshot::HttpErrorResponseBody; use http::{Method, StatusCode}; -use nexus_db_queries::db::datastore::ProbeInfo; use nexus_test_utils::{ http_testing::{AuthnMode, NexusRequest}, resource_helpers::{create_default_ip_pool, create_project}, SLED_AGENT_UUID, }; use nexus_test_utils_macros::nexus_test; -use nexus_types::external_api::params::ProbeCreate; +use nexus_types::external_api::{params::ProbeCreate, shared::ProbeInfo}; use omicron_common::api::external::{IdentityMetadataCreateParams, Probe}; type ControlPlaneTestContext = diff --git a/nexus/tests/output/cmd-nexus-noargs-stderr b/nexus/tests/output/cmd-nexus-noargs-stderr index 8dff679340..385248bd0e 100644 --- a/nexus/tests/output/cmd-nexus-noargs-stderr +++ b/nexus/tests/output/cmd-nexus-noargs-stderr @@ -6,8 +6,7 @@ Arguments: [CONFIG_FILE_PATH] Options: - -O, --openapi Print the external OpenAPI Spec document and exit - -I, --openapi-internal Print the internal OpenAPI Spec document and exit - -h, --help Print help + -O, --openapi Print the external OpenAPI Spec document and exit + -h, --help Print help nexus: CONFIG_FILE_PATH is required diff --git a/nexus/types/src/external_api/shared.rs b/nexus/types/src/external_api/shared.rs index 96843ba6a4..32d8765a54 100644 --- a/nexus/types/src/external_api/shared.rs +++ b/nexus/types/src/external_api/shared.rs @@ -7,6 +7,7 @@ use std::net::IpAddr; use omicron_common::api::external::Name; +use omicron_common::api::internal::shared::NetworkInterface; use parse_display::FromStr; use schemars::JsonSchema; use serde::de::Error as _; @@ -412,3 +413,28 @@ mod test { ); } } + +#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] +pub struct ProbeInfo { + pub id: Uuid, + pub name: Name, + pub sled: Uuid, + pub external_ips: Vec, + pub interface: NetworkInterface, +} + +#[derive(Debug, Clone, JsonSchema, Serialize, Deserialize)] +pub struct ProbeExternalIp { + pub ip: IpAddr, + pub first_port: u16, + pub last_port: u16, + pub kind: ProbeExternalIpKind, +} + +#[derive(Debug, Clone, Copy, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ProbeExternalIpKind { + Snat, + Floating, + Ephemeral, +} diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index fde2d07072..b71fd04779 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -6,9 +6,13 @@ use chrono::DateTime; use chrono::Utc; use futures::future::ready; use futures::stream::StreamExt; +use omicron_common::api::external::MacAddr; use omicron_common::api::external::ObjectStream; +use omicron_common::api::external::Vni; use schemars::JsonSchema; use serde::Serialize; +use std::net::Ipv4Addr; +use std::net::Ipv6Addr; use std::time::Duration; use std::time::Instant; use steno::SagaResultErr; @@ -296,3 +300,16 @@ pub struct LastResultCompleted { /// arbitrary datum emitted by the background task pub details: serde_json::Value, } + +/// NAT Record +#[derive(Clone, Debug, Serialize, JsonSchema)] +pub struct Ipv4NatEntryView { + pub external_address: Ipv4Addr, + pub first_port: u16, + pub last_port: u16, + pub sled_address: Ipv6Addr, + pub vni: Vni, + pub mac: MacAddr, + pub gen: i64, + pub deleted: bool, +} diff --git 
a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 6d380891aa..27430c7599 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -628,8 +628,8 @@ }, "/disk/{disk_id}/remove-read-only-parent": { "post": { - "summary": "Request removal of a read_only_parent from a disk", - "description": "This is a thin wrapper around the volume_remove_read_only_parent saga. All we are doing here is, given a disk UUID, figure out what the volume_id is for that disk, then use that to call the volume_remove_read_only_parent saga on it.", + "summary": "Request removal of a read_only_parent from a disk.", + "description": "This is a thin wrapper around the volume_remove_read_only_parent saga. All we are doing here is, given a disk UUID, figure out what the volume_id is for that disk, then use that to call the disk_remove_read_only_parent saga on it.", "operationId": "cpapi_disk_remove_read_only_parent", "parameters": [ { @@ -1347,7 +1347,7 @@ }, "/volume/{volume_id}/remove-read-only-parent": { "post": { - "summary": "Request removal of a read_only_parent from a volume", + "summary": "Request removal of a read_only_parent from a volume.", "description": "A volume can be created with the source data for that volume being another volume that attached as a \"read_only_parent\". In the background there exists a scrubber that will copy the data from the read_only_parent into the volume. When that scrubber has completed copying the data, this endpoint can be called to update the database that the read_only_parent is no longer needed for a volume and future attachments of this volume should not include that read_only_parent.", "operationId": "cpapi_volume_remove_read_only_parent", "parameters": [ @@ -3201,14 +3201,6 @@ "time_updated" ] }, - "IpKind": { - "type": "string", - "enum": [ - "snat", - "floating", - "ephemeral" - ] - }, "IpNet": { "x-rust-type": { "crate": "oxnet", @@ -3940,7 +3932,7 @@ "format": "ip" }, "kind": { - "$ref": "#/components/schemas/IpKind" + "$ref": "#/components/schemas/ProbeExternalIpKind" }, "last_port": { "type": "integer", @@ -3955,6 +3947,14 @@ "last_port" ] }, + "ProbeExternalIpKind": { + "type": "string", + "enum": [ + "snat", + "floating", + "ephemeral" + ] + }, "ProbeInfo": { "type": "object", "properties": { diff --git a/openapi/nexus.json b/openapi/nexus.json index 51278f3f6d..c9d85a8ee3 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -15638,14 +15638,6 @@ } ] }, - "IpKind": { - "type": "string", - "enum": [ - "snat", - "floating", - "ephemeral" - ] - }, "IpNet": { "x-rust-type": { "crate": "oxnet", @@ -16871,7 +16863,7 @@ "format": "ip" }, "kind": { - "$ref": "#/components/schemas/IpKind" + "$ref": "#/components/schemas/ProbeExternalIpKind" }, "last_port": { "type": "integer", @@ -16886,6 +16878,14 @@ "last_port" ] }, + "ProbeExternalIpKind": { + "type": "string", + "enum": [ + "snat", + "floating", + "ephemeral" + ] + }, "ProbeInfo": { "type": "object", "properties": { diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index cd76fd2611..796cf0bf63 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -25,7 +25,8 @@ bit-set = { version = "0.5.3" } bit-vec = { version = "0.6.3" } bitflags-dff4ba8e3ae991db = { package = "bitflags", version = "1.3.2" } bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.5.0", default-features = false, features = ["serde", "std"] } -bstr = { version = "1.9.1" } +bstr-6f8ce4dd05d13bba = { package = "bstr", version = "0.2.17" } +bstr-dff4ba8e3ae991db = { 
package = "bstr", version = "1.9.1" } byteorder = { version = "1.5.0" } bytes = { version = "1.6.0", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } @@ -95,7 +96,7 @@ semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.204", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.120", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } -similar = { version = "2.5.0", features = ["inline", "unicode"] } +similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } spin = { version = "0.9.8" } @@ -129,7 +130,8 @@ bit-set = { version = "0.5.3" } bit-vec = { version = "0.6.3" } bitflags-dff4ba8e3ae991db = { package = "bitflags", version = "1.3.2" } bitflags-f595c2ba2a3f28df = { package = "bitflags", version = "2.5.0", default-features = false, features = ["serde", "std"] } -bstr = { version = "1.9.1" } +bstr-6f8ce4dd05d13bba = { package = "bstr", version = "0.2.17" } +bstr-dff4ba8e3ae991db = { package = "bstr", version = "1.9.1" } byteorder = { version = "1.5.0" } bytes = { version = "1.6.0", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } @@ -199,7 +201,7 @@ semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.204", features = ["alloc", "derive", "rc"] } serde_json = { version = "1.0.120", features = ["raw_value", "unbounded_depth"] } sha2 = { version = "0.10.8", features = ["oid"] } -similar = { version = "2.5.0", features = ["inline", "unicode"] } +similar = { version = "2.5.0", features = ["bytes", "inline", "unicode"] } slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] } smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] } spin = { version = "0.9.8" } From fe60eb9c20b105c023e68d27cdb6605e4211fe2c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 12 Jul 2024 11:23:46 -0700 Subject: [PATCH 08/27] reconfigurator: Ensure all durable datasets, not just crucible (#6065) Reconfigurator currently ensures a `Dataset` row exists for all crucible zones, but fails to do so for other zones with durable datasets (notably Cockroach, which we now support adding and expunging!). This PR fixes that. I think we also need reconfigurator to learn how to _delete_ dataset rows for expunged zones, right? That will be a followup PR. 
--- .../execution/src/cockroachdb.rs | 3 + .../reconfigurator/execution/src/datasets.rs | 204 ++++++++---------- nexus/reconfigurator/execution/src/dns.rs | 4 + nexus/reconfigurator/execution/src/lib.rs | 85 +++++++- .../planning/src/blueprint_builder/builder.rs | 4 +- .../background/tasks/blueprint_execution.rs | 23 +- nexus/types/src/deployment.rs | 1 + nexus/types/src/deployment/zone_type.rs | 60 ++++-- 8 files changed, 242 insertions(+), 142 deletions(-) diff --git a/nexus/reconfigurator/execution/src/cockroachdb.rs b/nexus/reconfigurator/execution/src/cockroachdb.rs index 6bd72955c7..5a8710a1c5 100644 --- a/nexus/reconfigurator/execution/src/cockroachdb.rs +++ b/nexus/reconfigurator/execution/src/cockroachdb.rs @@ -88,6 +88,9 @@ mod test { settings.preserve_downgrade, CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string() ); + // Record the zpools so we don't fail to ensure datasets (unrelated to + // crdb settings) during blueprint execution. + crate::tests::insert_zpool_records(datastore, &opctx, &blueprint).await; // Execute the initial blueprint. let overrides = Overridables::for_test(cptestctx); crate::realize_blueprint_with_overrides( diff --git a/nexus/reconfigurator/execution/src/datasets.rs b/nexus/reconfigurator/execution/src/datasets.rs index e007c2528e..51ac45c9df 100644 --- a/nexus/reconfigurator/execution/src/datasets.rs +++ b/nexus/reconfigurator/execution/src/datasets.rs @@ -6,12 +6,10 @@ use anyhow::Context; use nexus_db_model::Dataset; -use nexus_db_model::DatasetKind; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneType; +use nexus_types::deployment::DurableDataset; use nexus_types::identity::Asset; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -19,12 +17,12 @@ use slog::info; use slog::warn; use std::collections::BTreeSet; -/// For each crucible zone in `all_omicron_zones`, ensure that a corresponding -/// dataset record exists in `datastore` +/// For each zone in `all_omicron_zones` that has an associated durable dataset, +/// ensure that a corresponding dataset record exists in `datastore`. /// /// Does not modify any existing dataset records. Returns the number of /// datasets inserted. -pub(crate) async fn ensure_crucible_dataset_records_exist( +pub(crate) async fn ensure_dataset_records_exist( opctx: &OpContext, datastore: &DataStore, all_omicron_zones: impl Iterator, @@ -32,15 +30,14 @@ pub(crate) async fn ensure_crucible_dataset_records_exist( // Before attempting to insert any datasets, first query for any existing // dataset records so we can filter them out. This looks like a typical // TOCTOU issue, but it is purely a performance optimization. We expect - // almost all executions of this function to do nothing: new crucible - // datasets are created very rarely relative to how frequently blueprint - // realization happens. We could remove this check and filter and instead - // run the below "insert if not exists" query on every crucible zone, and - // the behavior would still be correct. However, that would issue far more - // queries than necessary in the very common case of "we don't need to do - // anything at all". 
- let mut crucible_datasets = datastore - .dataset_list_all_batched(opctx, Some(DatasetKind::Crucible)) + // almost all executions of this function to do nothing: new datasets are + // created very rarely relative to how frequently blueprint realization + // happens. We could remove this check and filter and instead run the below + // "insert if not exists" query on every zone, and the behavior would still + // be correct. However, that would issue far more queries than necessary in + // the very common case of "we don't need to do anything at all". + let mut existing_datasets = datastore + .dataset_list_all_batched(opctx, None) .await .context("failed to list all datasets")? .into_iter() @@ -51,10 +48,8 @@ pub(crate) async fn ensure_crucible_dataset_records_exist( let mut num_already_exist = 0; for zone in all_omicron_zones { - let BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { - address, - dataset, - }) = &zone.zone_type + let Some(DurableDataset { dataset, kind, address }) = + zone.zone_type.durable_dataset() else { continue; }; @@ -62,7 +57,7 @@ pub(crate) async fn ensure_crucible_dataset_records_exist( let id = zone.id; // If already present in the datastore, move on. - if crucible_datasets.remove(&id) { + if existing_datasets.remove(&id) { num_already_exist += 1; continue; } @@ -71,8 +66,8 @@ pub(crate) async fn ensure_crucible_dataset_records_exist( let dataset = Dataset::new( id.into_untyped_uuid(), pool_id.into_untyped_uuid(), - *address, - DatasetKind::Crucible, + address, + kind.into(), ); let maybe_inserted = datastore .dataset_insert_if_not_exists(dataset) @@ -87,8 +82,9 @@ pub(crate) async fn ensure_crucible_dataset_records_exist( if maybe_inserted.is_some() { info!( opctx.log, - "inserted new dataset for crucible zone"; + "inserted new dataset for Omicron zone"; "id" => %id, + "kind" => ?kind, ); num_inserted += 1; } else { @@ -99,18 +95,18 @@ pub(crate) async fn ensure_crucible_dataset_records_exist( // We don't currently support removing datasets, so this would be // surprising: the database contains dataset records that are no longer in // our blueprint. We can't do anything about this, so just warn. 
- if !crucible_datasets.is_empty() { + if !existing_datasets.is_empty() { warn!( opctx.log, - "database contains {} unexpected crucible datasets", - crucible_datasets.len(); - "dataset_ids" => ?crucible_datasets, + "database contains {} unexpected datasets", + existing_datasets.len(); + "dataset_ids" => ?existing_datasets, ); } info!( opctx.log, - "ensured all crucible zones have dataset records"; + "ensured all Omicron zones have dataset records"; "num_inserted" => num_inserted, "num_already_existed" => num_already_exist, ); @@ -121,30 +117,27 @@ pub(crate) async fn ensure_crucible_dataset_records_exist( #[cfg(test)] mod tests { use super::*; - use nexus_db_model::Generation; - use nexus_db_model::SledBaseboard; - use nexus_db_model::SledSystemHardware; - use nexus_db_model::SledUpdate; use nexus_db_model::Zpool; use nexus_reconfigurator_planning::example::example; use nexus_test_utils_macros::nexus_test; + use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; + use nexus_types::deployment::BlueprintZoneType; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::ZpoolUuid; use sled_agent_client::types::OmicronZoneDataset; - use sled_agent_client::types::OmicronZoneType; use uuid::Uuid; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; #[nexus_test] - async fn test_ensure_crucible_dataset_records_exist( + async fn test_ensure_dataset_records_exist( cptestctx: &ControlPlaneTestContext, ) { - const TEST_NAME: &str = "test_ensure_crucible_dataset_records_exist"; + const TEST_NAME: &str = "test_ensure_dataset_records_exist"; // Set up. let nexus = &cptestctx.server.server_context().nexus; @@ -158,55 +151,14 @@ mod tests { // Use the standard example system. let (collection, _, blueprint) = example(&opctx.log, TEST_NAME, 5); - // Record the sleds and zpools contained in this collection. - let rack_id = Uuid::new_v4(); - for (&sled_id, config) in &collection.omicron_zones { - let sled = SledUpdate::new( - sled_id.into_untyped_uuid(), - "[::1]:0".parse().unwrap(), - SledBaseboard { - serial_number: format!("test-{sled_id}"), - part_number: "test-sled".to_string(), - revision: 0, - }, - SledSystemHardware { - is_scrimlet: false, - usable_hardware_threads: 128, - usable_physical_ram: (64 << 30).try_into().unwrap(), - reservoir_size: (16 << 30).try_into().unwrap(), - }, - rack_id, - Generation::new(), - ); - datastore.sled_upsert(sled).await.expect("failed to upsert sled"); - - for zone in &config.zones.zones { - let OmicronZoneType::Crucible { dataset, .. } = &zone.zone_type - else { - continue; - }; - let zpool = Zpool::new( - dataset.pool_name.id().into_untyped_uuid(), - sled_id.into_untyped_uuid(), - Uuid::new_v4(), // physical_disk_id - ); - datastore - .zpool_insert(opctx, zpool) - .await - .expect("failed to upsert zpool"); - } - } - - // How many crucible zones are there? - let ncrucible_zones = collection - .all_omicron_zones() - .filter(|z| matches!(z.zone_type, OmicronZoneType::Crucible { .. })) - .count(); + // Record the sleds and zpools. + crate::tests::insert_sled_records(datastore, &blueprint).await; + crate::tests::insert_zpool_records(datastore, opctx, &blueprint).await; // Prior to ensuring datasets exist, there should be none. 
assert_eq!( datastore - .dataset_list_all_batched(opctx, Some(DatasetKind::Crucible)) + .dataset_list_all_batched(opctx, None) .await .unwrap() .len(), @@ -219,46 +171,52 @@ mod tests { .map(|(_, zone)| zone) .collect::>(); - let ndatasets_inserted = ensure_crucible_dataset_records_exist( + // How many zones are there with durable datasets? + let nzones_with_durable_datasets = all_omicron_zones + .iter() + .filter(|z| z.zone_type.durable_dataset().is_some()) + .count(); + + let ndatasets_inserted = ensure_dataset_records_exist( opctx, datastore, all_omicron_zones.iter().copied(), ) .await - .expect("failed to ensure crucible datasets"); + .expect("failed to ensure datasets"); - // We should have inserted a dataset for each crucible zone. - assert_eq!(ncrucible_zones, ndatasets_inserted); + // We should have inserted a dataset for each zone with a durable + // dataset. + assert_eq!(nzones_with_durable_datasets, ndatasets_inserted); assert_eq!( datastore - .dataset_list_all_batched(opctx, Some(DatasetKind::Crucible)) + .dataset_list_all_batched(opctx, None) .await .unwrap() .len(), - ncrucible_zones, + nzones_with_durable_datasets, ); - // Ensuring the same crucible datasets again should insert no new - // records. - let ndatasets_inserted = ensure_crucible_dataset_records_exist( + // Ensuring the same datasets again should insert no new records. + let ndatasets_inserted = ensure_dataset_records_exist( opctx, datastore, all_omicron_zones.iter().copied(), ) .await - .expect("failed to ensure crucible datasets"); + .expect("failed to ensure datasets"); assert_eq!(0, ndatasets_inserted); assert_eq!( datastore - .dataset_list_all_batched(opctx, Some(DatasetKind::Crucible)) + .dataset_list_all_batched(opctx, None) .await .unwrap() .len(), - ncrucible_zones, + nzones_with_durable_datasets, ); - // Create another zpool on one of the sleds, so we can add a new - // crucible zone that uses it. + // Create another zpool on one of the sleds, so we can add new + // zones that use it. let new_zpool_id = ZpoolUuid::new_v4(); for &sled_id in collection.omicron_zones.keys().take(1) { let zpool = Zpool::new( @@ -272,37 +230,53 @@ mod tests { .expect("failed to upsert zpool"); } - // Call `ensure_crucible_dataset_records_exist` again, adding a new - // crucible zone. It should insert only this new zone. - let new_zone = BlueprintZoneConfig { - disposition: BlueprintZoneDisposition::InService, - id: OmicronZoneUuid::new_v4(), - underlay_address: "::1".parse().unwrap(), - filesystem_pool: Some(ZpoolName::new_external(new_zpool_id)), - zone_type: BlueprintZoneType::Crucible( - blueprint_zone_type::Crucible { - address: "[::1]:0".parse().unwrap(), - dataset: OmicronZoneDataset { - pool_name: ZpoolName::new_external(new_zpool_id), + // Call `ensure_dataset_records_exist` again, adding new crucible and + // cockroach zones. It should insert only these new zones. 
+ let new_zones = [ + BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id: OmicronZoneUuid::new_v4(), + underlay_address: "::1".parse().unwrap(), + filesystem_pool: Some(ZpoolName::new_external(new_zpool_id)), + zone_type: BlueprintZoneType::Crucible( + blueprint_zone_type::Crucible { + address: "[::1]:0".parse().unwrap(), + dataset: OmicronZoneDataset { + pool_name: ZpoolName::new_external(new_zpool_id), + }, }, - }, - ), - }; - let ndatasets_inserted = ensure_crucible_dataset_records_exist( + ), + }, + BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id: OmicronZoneUuid::new_v4(), + underlay_address: "::1".parse().unwrap(), + filesystem_pool: Some(ZpoolName::new_external(new_zpool_id)), + zone_type: BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { + address: "[::1]:0".parse().unwrap(), + dataset: OmicronZoneDataset { + pool_name: ZpoolName::new_external(new_zpool_id), + }, + }, + ), + }, + ]; + let ndatasets_inserted = ensure_dataset_records_exist( opctx, datastore, - all_omicron_zones.iter().copied().chain(std::iter::once(&new_zone)), + all_omicron_zones.iter().copied().chain(&new_zones), ) .await - .expect("failed to ensure crucible datasets"); - assert_eq!(ndatasets_inserted, 1); + .expect("failed to ensure datasets"); + assert_eq!(ndatasets_inserted, 2); assert_eq!( datastore - .dataset_list_all_batched(opctx, Some(DatasetKind::Crucible)) + .dataset_list_all_batched(opctx, None) .await .unwrap() .len(), - ncrucible_zones + 1, + nzones_with_durable_datasets + 2, ); } } diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index f3b718ee54..f3210a12aa 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1180,6 +1180,10 @@ mod test { blueprint.cockroachdb_setting_preserve_downgrade = CockroachDbPreserveDowngrade::DoNotModify; + // Record the zpools so we don't fail to ensure datasets (unrelated to + // DNS) during blueprint execution. + crate::tests::insert_zpool_records(datastore, &opctx, &blueprint).await; + // Now, execute the initial blueprint. let overrides = Overridables::for_test(cptestctx); crate::realize_blueprint_with_overrides( diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 0e9ab394f1..8cdbd46265 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -187,7 +187,7 @@ where .context("failed to plumb service firewall rules to sleds") .map_err(|err| vec![err])?; - datasets::ensure_crucible_dataset_records_exist( + datasets::ensure_dataset_records_exist( &opctx, datastore, blueprint @@ -236,3 +236,86 @@ where Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use nexus_db_model::Generation; + use nexus_db_model::SledBaseboard; + use nexus_db_model::SledSystemHardware; + use nexus_db_model::SledUpdate; + use nexus_db_model::Zpool; + use std::collections::BTreeSet; + use uuid::Uuid; + + // Helper function to insert sled records from an initial blueprint. Some + // tests expect to be able to realize the the blueprint created from an + // initial collection, and ensuring the zones' datasets exist requires first + // inserting the sled and zpool records. 
+ pub(crate) async fn insert_sled_records( + datastore: &DataStore, + blueprint: &Blueprint, + ) { + let rack_id = Uuid::new_v4(); + let mut sleds_inserted = BTreeSet::new(); + + for sled_id in blueprint.blueprint_zones.keys().copied() { + if sleds_inserted.insert(sled_id) { + let sled = SledUpdate::new( + sled_id.into_untyped_uuid(), + "[::1]:0".parse().unwrap(), + SledBaseboard { + serial_number: format!("test-{sled_id}"), + part_number: "test-sled".to_string(), + revision: 0, + }, + SledSystemHardware { + is_scrimlet: false, + usable_hardware_threads: 128, + usable_physical_ram: (64 << 30).try_into().unwrap(), + reservoir_size: (16 << 30).try_into().unwrap(), + }, + rack_id, + Generation::new(), + ); + datastore + .sled_upsert(sled) + .await + .expect("failed to upsert sled"); + } + } + } + + // Helper function to insert zpool records from an initial blueprint. Some + // tests expect to be able to realize the the blueprint created from an + // initial collection, and ensuring the zones' datasets exist requires first + // inserting the sled and zpool records. + pub(crate) async fn insert_zpool_records( + datastore: &DataStore, + opctx: &OpContext, + blueprint: &Blueprint, + ) { + let mut pool_inserted = BTreeSet::new(); + + for (sled_id, config) in + blueprint.all_omicron_zones(BlueprintZoneFilter::All) + { + let Some(dataset) = config.zone_type.durable_dataset() else { + continue; + }; + + let pool_id = dataset.dataset.pool_name.id(); + if pool_inserted.insert(pool_id) { + let zpool = Zpool::new( + pool_id.into_untyped_uuid(), + sled_id.into_untyped_uuid(), + Uuid::new_v4(), // physical_disk_id + ); + datastore + .zpool_insert(opctx, zpool) + .await + .expect("failed to upsert zpool"); + } + } + } +} diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 4177d4884f..93400a3708 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1317,7 +1317,7 @@ pub mod test { if let Some(dataset) = zone.zone_type.durable_dataset() { let kind = zone.zone_type.kind(); if let Some(previous) = kinds_by_zpool - .entry(dataset.pool_name.id()) + .entry(dataset.dataset.pool_name.id()) .or_default() .insert(kind, zone.id) { @@ -1325,7 +1325,7 @@ pub mod test { "zpool {} has two zones of kind {kind:?}: {} and {}\ \n\n\ blueprint: {}", - dataset.pool_name, + dataset.dataset.pool_name, zone.id, previous, blueprint.display(), diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 16bf872f2a..f5d15eab3d 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -120,11 +120,12 @@ mod test { use httptest::responders::status_code; use httptest::Expectation; use nexus_db_model::{ - ByteCount, SledBaseboard, SledSystemHardware, SledUpdate, + ByteCount, SledBaseboard, SledSystemHardware, SledUpdate, Zpool, }; use nexus_db_queries::authn; use nexus_db_queries::context::OpContext; use nexus_test_utils_macros::nexus_test; + use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::{ blueprint_zone_type, Blueprint, BlueprintPhysicalDisksConfig, BlueprintTarget, BlueprintZoneConfig, BlueprintZoneDisposition, @@ -307,6 +308,26 @@ mod test { generation, ); + // Insert records for the zpools backing the datasets in these zones. 
+ for (sled_id, config) in + blueprint.1.all_omicron_zones(BlueprintZoneFilter::All) + { + let Some(dataset) = config.zone_type.durable_dataset() else { + continue; + }; + + let pool_id = dataset.dataset.pool_name.id(); + let zpool = Zpool::new( + pool_id.into_untyped_uuid(), + sled_id.into_untyped_uuid(), + Uuid::new_v4(), // physical_disk_id + ); + datastore + .zpool_insert(&opctx, zpool) + .await + .expect("failed to upsert zpool"); + } + blueprint_tx.send(Some(Arc::new(blueprint.clone()))).unwrap(); // Make sure that requests get made to the sled agent. This is not a diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 4e655a1ed0..6f6c10a9c2 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -77,6 +77,7 @@ pub use planning_input::ZpoolFilter; pub use sled_agent_client::ZoneKind; pub use zone_type::blueprint_zone_type; pub use zone_type::BlueprintZoneType; +pub use zone_type::DurableDataset; use blueprint_display::{ constants::*, BpDiffState, BpGeneration, BpOmicronZonesSubtableSchema, diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index 4c40bfc1de..eb0b2dc126 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -9,6 +9,7 @@ //! that is not needed by sled-agent. use super::OmicronZoneExternalIp; +use crate::internal_api::params::DatasetKind; use omicron_common::api::internal::shared::NetworkInterface; use schemars::JsonSchema; use serde::Deserialize; @@ -16,6 +17,7 @@ use serde::Serialize; use sled_agent_client::types::OmicronZoneDataset; use sled_agent_client::types::OmicronZoneType; use sled_agent_client::ZoneKind; +use std::net::SocketAddrV6; #[derive(Debug, Clone, Eq, PartialEq, JsonSchema, Deserialize, Serialize)] #[serde(tag = "type", rename_all = "snake_case")] @@ -38,7 +40,7 @@ impl BlueprintZoneType { pub fn durable_zpool( &self, ) -> Option<&omicron_common::zpool_name::ZpoolName> { - self.durable_dataset().map(|dataset| &dataset.pool_name) + self.durable_dataset().map(|dataset| &dataset.dataset.pool_name) } pub fn external_networking( @@ -118,38 +120,50 @@ impl BlueprintZoneType { } } - /// Returns a durable dataset associated with this zone, if any exists. - pub fn durable_dataset(&self) -> Option<&OmicronZoneDataset> { - match self { + /// Returns the durable dataset associated with this zone, if any exists. + pub fn durable_dataset(&self) -> Option> { + let (dataset, kind, &address) = match self { BlueprintZoneType::Clickhouse( - blueprint_zone_type::Clickhouse { dataset, .. }, - ) - | BlueprintZoneType::ClickhouseKeeper( - blueprint_zone_type::ClickhouseKeeper { dataset, .. }, - ) - | BlueprintZoneType::CockroachDb( - blueprint_zone_type::CockroachDb { dataset, .. }, - ) - | BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { + blueprint_zone_type::Clickhouse { dataset, address }, + ) => (dataset, DatasetKind::Clickhouse, address), + BlueprintZoneType::ClickhouseKeeper( + blueprint_zone_type::ClickhouseKeeper { dataset, address }, + ) => (dataset, DatasetKind::ClickhouseKeeper, address), + BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { dataset, address }, + ) => (dataset, DatasetKind::Cockroach, address), + BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { dataset, - .. - }) - | BlueprintZoneType::ExternalDns( - blueprint_zone_type::ExternalDns { dataset, .. }, - ) - | BlueprintZoneType::InternalDns( - blueprint_zone_type::InternalDns { dataset, .. 
}, - ) => Some(dataset), + address, + }) => (dataset, DatasetKind::Crucible, address), + BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { + dataset, http_address, .. + }, + ) => (dataset, DatasetKind::ExternalDns, http_address), + BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + dataset, http_address, .. + }, + ) => (dataset, DatasetKind::InternalDns, http_address), // Transient-dataset-only zones BlueprintZoneType::BoundaryNtp(_) | BlueprintZoneType::CruciblePantry(_) | BlueprintZoneType::InternalNtp(_) | BlueprintZoneType::Nexus(_) - | BlueprintZoneType::Oximeter(_) => None, - } + | BlueprintZoneType::Oximeter(_) => return None, + }; + + Some(DurableDataset { dataset, kind, address }) } } +pub struct DurableDataset<'a> { + pub dataset: &'a OmicronZoneDataset, + pub kind: DatasetKind, + pub address: SocketAddrV6, +} + impl From for OmicronZoneType { fn from(zone_type: BlueprintZoneType) -> Self { match zone_type { From e4bcfeeef8b73d60fd880a4bce3cd2465cb11c65 Mon Sep 17 00:00:00 2001 From: Ryan Goodfellow Date: Fri, 12 Jul 2024 15:26:48 -0700 Subject: [PATCH 09/27] fix query selector for bgp filters and communities (#6072) --- nexus/db-queries/src/db/datastore/bgp.rs | 14 ++++++++------ .../background/tasks/sync_switch_configuration.rs | 7 ++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/bgp.rs b/nexus/db-queries/src/db/datastore/bgp.rs index d73e7ff327..1244184c1d 100644 --- a/nexus/db-queries/src/db/datastore/bgp.rs +++ b/nexus/db-queries/src/db/datastore/bgp.rs @@ -572,14 +572,14 @@ impl DataStore { &self, opctx: &OpContext, port_settings_id: Uuid, - interface_name: &String, + interface_name: &str, addr: IpNetwork, ) -> ListResultVec { use db::schema::switch_port_settings_bgp_peer_config_communities::dsl; let results = dsl::switch_port_settings_bgp_peer_config_communities .filter(dsl::port_settings_id.eq(port_settings_id)) - .filter(dsl::interface_name.eq(interface_name.clone())) + .filter(dsl::interface_name.eq(interface_name.to_owned())) .filter(dsl::addr.eq(addr)) .load_async(&*self.pool_connection_authorized(opctx).await?) 
.await @@ -592,7 +592,7 @@ impl DataStore { &self, opctx: &OpContext, port_settings_id: Uuid, - interface_name: &String, + interface_name: &str, addr: IpNetwork, ) -> LookupResult>> { use db::schema::switch_port_settings_bgp_peer_config as db_peer; @@ -619,7 +619,8 @@ impl DataStore { dsl::switch_port_settings_bgp_peer_config_allow_export .filter(db_allow::port_settings_id.eq(port_settings_id)) .filter( - db_allow::interface_name.eq(interface_name.clone()), + db_allow::interface_name + .eq(interface_name.to_owned()), ) .filter(db_allow::addr.eq(addr)) .load_async(&conn) @@ -637,7 +638,7 @@ impl DataStore { &self, opctx: &OpContext, port_settings_id: Uuid, - interface_name: &String, + interface_name: &str, addr: IpNetwork, ) -> LookupResult>> { use db::schema::switch_port_settings_bgp_peer_config as db_peer; @@ -664,7 +665,8 @@ impl DataStore { dsl::switch_port_settings_bgp_peer_config_allow_import .filter(db_allow::port_settings_id.eq(port_settings_id)) .filter( - db_allow::interface_name.eq(interface_name.clone()), + db_allow::interface_name + .eq(interface_name.to_owned()), ) .filter(db_allow::addr.eq(addr)) .load_async(&conn) diff --git a/nexus/src/app/background/tasks/sync_switch_configuration.rs b/nexus/src/app/background/tasks/sync_switch_configuration.rs index e8f07726a5..20a12d1127 100644 --- a/nexus/src/app/background/tasks/sync_switch_configuration.rs +++ b/nexus/src/app/background/tasks/sync_switch_configuration.rs @@ -63,6 +63,7 @@ use std::{ }; const DPD_TAG: Option<&'static str> = Some(OMICRON_DPD_TAG); +const PHY0: &str = "phy0"; // This is more of an implementation detail of the BGP implementation. It // defines the maximum time the peering engine will wait for external messages @@ -999,7 +1000,7 @@ impl BackgroundTask for SwitchPortSettingsManager { .communities_for_peer( opctx, port.port_settings_id.unwrap(), - &peer.port, + PHY0, //TODO https://github.com/oxidecomputer/omicron/issues/3062 IpNetwork::from(IpAddr::from(peer.addr)) ).await { Ok(cs) => cs.iter().map(|c| c.community.0).collect(), @@ -1017,7 +1018,7 @@ impl BackgroundTask for SwitchPortSettingsManager { let allow_import = match self.datastore.allow_import_for_peer( opctx, port.port_settings_id.unwrap(), - &peer.port, + PHY0, //TODO https://github.com/oxidecomputer/omicron/issues/3062 IpNetwork::from(IpAddr::from(peer.addr)), ).await { Ok(cs) => cs, @@ -1041,7 +1042,7 @@ impl BackgroundTask for SwitchPortSettingsManager { let allow_export = match self.datastore.allow_export_for_peer( opctx, port.port_settings_id.unwrap(), - &peer.port, + PHY0, //TODO https://github.com/oxidecomputer/omicron/issues/3062 IpNetwork::from(IpAddr::from(peer.addr)), ).await { Ok(cs) => cs, From b7accd3e8894d37e8308f9ebdfa5d4fe96e8e4dc Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sun, 14 Jul 2024 20:46:41 -0700 Subject: [PATCH 10/27] Update Rust crate clap to v4.5.9 (#5969) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 12 ++++++------ workspace-hack/Cargo.toml | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 84669a13e7..e243958276 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1018,9 +1018,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.4" +version = "4.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" +checksum = 
"64acc1846d54c1fe936a78dc189c34e28d3f5afc348403f28ecf53660b9b8462" dependencies = [ "clap_builder", "clap_derive", @@ -1028,9 +1028,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.2" +version = "4.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +checksum = "6fb8393d67ba2e7bfaf28a23458e4e2b543cc73a99595511eb207fdb8aede942" dependencies = [ "anstream", "anstyle", @@ -1041,9 +1041,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.4" +version = "4.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" +checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" dependencies = [ "heck 0.5.0", "proc-macro2", diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 796cf0bf63..cc12f6d032 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -31,8 +31,8 @@ byteorder = { version = "1.5.0" } bytes = { version = "1.6.0", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } -clap = { version = "4.5.4", features = ["cargo", "derive", "env", "wrap_help"] } -clap_builder = { version = "4.5.2", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } +clap = { version = "4.5.9", features = ["cargo", "derive", "env", "wrap_help"] } +clap_builder = { version = "4.5.9", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } @@ -136,8 +136,8 @@ byteorder = { version = "1.5.0" } bytes = { version = "1.6.0", features = ["serde"] } chrono = { version = "0.4.38", features = ["serde"] } cipher = { version = "0.4.4", default-features = false, features = ["block-padding", "zeroize"] } -clap = { version = "4.5.4", features = ["cargo", "derive", "env", "wrap_help"] } -clap_builder = { version = "4.5.2", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } +clap = { version = "4.5.9", features = ["cargo", "derive", "env", "wrap_help"] } +clap_builder = { version = "4.5.9", default-features = false, features = ["cargo", "color", "env", "std", "suggestions", "usage", "wrap_help"] } console = { version = "0.15.8" } const-oid = { version = "0.9.6", default-features = false, features = ["db", "std"] } crossbeam-epoch = { version = "0.9.18" } From d993746ecf3205bdf591c47a1e30bed065d2ca64 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Sun, 14 Jul 2024 20:47:11 -0700 Subject: [PATCH 11/27] Update Rust crate sqlformat to 0.2.4 (#6061) Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- Cargo.lock | 5 ++--- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e243958276..87a9467e45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9287,11 +9287,10 @@ dependencies = [ [[package]] name = "sqlformat" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ce81b7bd7c4493975347ef60d8c7e8b742d4694f4c49f93e0a12ea263938176c" +checksum = "f895e3734318cc55f1fe66258926c9b910c124d47520339efecbb6c59cec7c1f" dependencies = [ - "itertools 0.12.1", "nom", "unicode_categories", ] diff --git a/Cargo.toml b/Cargo.toml index e5783b39eb..fea5a44bce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -480,7 +480,7 @@ sp-sim = { path = "sp-sim" } sprockets-common = { git = "https://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } sprockets-host = { git = "https://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } sprockets-rot = { git = "https://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } -sqlformat = "0.2.3" +sqlformat = "0.2.4" sqlparser = { version = "0.45.0", features = [ "visitor" ] } static_assertions = "1.1.0" # Please do not change the Steno version to a Git dependency. It makes it From 8fc8312e91fc078957fddbe0d5d043518156fed9 Mon Sep 17 00:00:00 2001 From: Rain Date: Sun, 14 Jul 2024 21:52:14 -0700 Subject: [PATCH 12/27] [dns-server] convert DNS server API into a trait (#6079) Straightforward, and resulted in some nice cleanup. --- Cargo.lock | 13 +++ Cargo.toml | 3 + dev-tools/openapi-manager/Cargo.toml | 1 + dev-tools/openapi-manager/src/spec.rs | 10 ++ dns-server-api/Cargo.toml | 15 +++ dns-server-api/src/lib.rs | 160 ++++++++++++++++++++++++++ dns-server/Cargo.toml | 1 + dns-server/src/bin/apigen.rs | 29 ----- dns-server/src/dns_server.rs | 9 +- dns-server/src/dns_types.rs | 50 -------- dns-server/src/http_server.rs | 157 ++++++------------------- dns-server/src/lib.rs | 1 - dns-server/src/storage.rs | 8 +- dns-server/tests/openapi_test.rs | 27 ----- openapi/dns-server.json | 7 +- 15 files changed, 248 insertions(+), 243 deletions(-) create mode 100644 dns-server-api/Cargo.toml create mode 100644 dns-server-api/src/lib.rs delete mode 100644 dns-server/src/bin/apigen.rs delete mode 100644 dns-server/src/dns_types.rs delete mode 100644 dns-server/tests/openapi_test.rs diff --git a/Cargo.lock b/Cargo.lock index 87a9467e45..9a09d1d61a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1929,6 +1929,7 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "dns-server-api", "dns-service-client", "dropshot", "expectorate", @@ -1958,6 +1959,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "dns-server-api" +version = "0.1.0" +dependencies = [ + "chrono", + "dropshot", + "omicron-workspace-hack", + "schemars", + "serde", +] + [[package]] name = "dns-service-client" version = "0.1.0" @@ -6106,6 +6118,7 @@ dependencies = [ "atomicwrites", "camino", "clap", + "dns-server-api", "dropshot", "fs-err", "indent_write", diff --git a/Cargo.toml b/Cargo.toml index fea5a44bce..a44e69e1e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ members = [ "dev-tools/releng", "dev-tools/xtask", "dns-server", + "dns-server-api", "end-to-end-tests", "gateway-cli", "gateway-test-utils", @@ -119,6 +120,7 @@ default-members = [ # hakari to not work as well and build times to be longer. # See omicron#4392. "dns-server", + "dns-server-api", # Do not include end-to-end-tests in the list of default members, as its # tests only work on a deployed control plane. 
"gateway-cli", @@ -279,6 +281,7 @@ derive-where = "1.2.7" diesel = { version = "2.1.6", features = ["postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } diesel-dtrace = { git = "https://github.com/oxidecomputer/diesel-dtrace", branch = "main" } dns-server = { path = "dns-server" } +dns-server-api = { path = "dns-server-api" } dns-service-client = { path = "clients/dns-service-client" } dpd-client = { path = "clients/dpd-client" } dropshot = { git = "https://github.com/oxidecomputer/dropshot", branch = "main", features = [ "usdt-probes" ] } diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml index b50aeec69f..1534181e9c 100644 --- a/dev-tools/openapi-manager/Cargo.toml +++ b/dev-tools/openapi-manager/Cargo.toml @@ -12,6 +12,7 @@ anyhow.workspace = true atomicwrites.workspace = true camino.workspace = true clap.workspace = true +dns-server-api.workspace = true dropshot.workspace = true fs-err.workspace = true indent_write.workspace = true diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index 37330d6922..53f3260ca9 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -14,6 +14,16 @@ use openapiv3::OpenAPI; /// All APIs managed by openapi-manager. pub fn all_apis() -> Vec { vec![ + ApiSpec { + title: "Internal DNS".to_string(), + version: "0.0.1".to_string(), + description: "API for the internal DNS server".to_string(), + boundary: ApiBoundary::Internal, + api_description: + dns_server_api::dns_server_api::stub_api_description, + filename: "dns-server.json".to_string(), + extra_validation: None, + }, ApiSpec { title: "Nexus internal API".to_string(), version: "0.0.1".to_string(), diff --git a/dns-server-api/Cargo.toml b/dns-server-api/Cargo.toml new file mode 100644 index 0000000000..c87af14e0d --- /dev/null +++ b/dns-server-api/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "dns-server-api" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +chrono.workspace = true +dropshot.workspace = true +omicron-workspace-hack.workspace = true +schemars.workspace = true +serde.workspace = true diff --git a/dns-server-api/src/lib.rs b/dns-server-api/src/lib.rs new file mode 100644 index 0000000000..2c59caf0c5 --- /dev/null +++ b/dns-server-api/src/lib.rs @@ -0,0 +1,160 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Dropshot API for configuring DNS namespace. +//! +//! ## Shape of the API +//! +//! The DNS configuration API has just two endpoints: PUT and GET of the entire +//! DNS configuration. This is pretty anti-REST. But it's important to think +//! about how this server fits into the rest of the system. When changes are +//! made to DNS data, they're grouped together and assigned a monotonically +//! increasing generation number. The DNS data is first stored into CockroachDB +//! and then propagated from a distributed fleet of Nexus instances to a +//! distributed fleet of these DNS servers. If we accepted individual updates to +//! DNS names, then propagating a particular change would be non-atomic, and +//! Nexus would have to do a lot more work to ensure (1) that all changes were +//! propagated (even if it crashes) and (2) that they were propagated in the +//! 
correct order (even if two Nexus instances concurrently propagate separate
+//! changes).
+//!
+//! This DNS server supports hosting multiple zones. We could imagine supporting
+//! separate endpoints to update the DNS data for a particular zone. That feels
+//! nicer (although it's not clear what it would buy us). But as with updates to
+//! multiple names, Nexus's job is potentially much easier if the entire state
+//! for all zones is updated at once. (Otherwise, imagine how Nexus would
+//! implement _renaming_ one zone to another without loss of service. With
+//! a combined endpoint and generation number for all zones, all that's necessary
+//! is to configure a new zone with all the same names, and then remove the old
+//! zone later in another update. That can be managed by the same mechanism in
+//! Nexus that manages regular name updates. On the other hand, if there were
+//! separate endpoints with separate generation numbers, then Nexus has more to
+//! keep track of in order to do the rename safely.)
+//!
+//! See RFD 367 for more on DNS propagation.
+//!
+//! ## ETags and Conditional Requests
+//!
+//! It's idiomatic in HTTP to use ETags and conditional requests to provide
+//! synchronization. We could define an ETag to be just the current generation
+//! number of the server and honor standard `if-match` headers to fail requests
+//! where the generation number doesn't match what the client expects. This
+//! would be fine, but it's rather annoying:
+//!
+//! 1. When the client wants to propagate generation X, the client would have to
+//! make an extra request just to fetch the current ETag, just so it can put
+//! it into the conditional request.
+//!
+//! 2. If some other client changes the configuration in the meantime, the
+//! conditional request would fail and the client would have to take another
+//! lap (fetching the current config and potentially making another
+//! conditional PUT).
+//!
+//! 3. This approach would make synchronization opt-in. If a client (or just
+//! one errant code path) neglected to set the if-match header, we could do
+//! the wrong thing and cause the system to come to rest with the wrong DNS
+//! data.
+//!
+//! Since the semantics here are so simple (we only ever want to move the
+//! generation number forward), we don't bother with ETags or conditional
+//! requests. Instead we have the server implement the behavior we want, which
+//! is that when a request comes in to update DNS data to generation X, the
+//! server replies with one of:
+//!
+//! (1) the update has been applied and the server is now running generation X
+//! (client treats this as success)
+//!
+//! (2) the update was not applied because the server is already at generation X
+//! (client treats this as success)
+//!
+//! (3) the update was not applied because the server is already at a newer
+//! generation
+//! (client probably starts the whole propagation process over because its
+//! current view of the world is out of date)
+//!
+//! This way, the DNS data can never move backwards and the client only ever has
+//! to make one request.
+//!
+//! ## Concurrent updates
+//!
+//! Given that we've got just one API to update all the DNS zones, and given
+//! that it might therefore take a minute for a large zone, and also that there may
+//! be multiple Nexus instances trying to do it at the same time, we need to
+//! think a bit about what should happen if two of them do try to do it at the same
+//! time. Spoiler: we immediately fail any request to update the DNS data if
+//! 
there's already an update in progress. +//! +//! What else could we do? We could queue the incoming request behind the +//! in-progress one. How large do we allow that queue to grow? At some point +//! we'll need to stop queueing them. So why bother at all? + +use std::{ + collections::HashMap, + net::{Ipv4Addr, Ipv6Addr}, +}; + +use dropshot::{HttpError, HttpResponseOk, RequestContext}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[dropshot::api_description] +pub trait DnsServerApi { + type Context; + + #[endpoint( + method = GET, + path = "/config", + )] + async fn dns_config_get( + rqctx: RequestContext, + ) -> Result, HttpError>; + + #[endpoint( + method = PUT, + path = "/config", + )] + async fn dns_config_put( + rqctx: RequestContext, + rq: dropshot::TypedBody, + ) -> Result; +} + +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct DnsConfigParams { + pub generation: u64, + pub time_created: chrono::DateTime, + pub zones: Vec, +} + +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct DnsConfig { + pub generation: u64, + pub time_created: chrono::DateTime, + pub time_applied: chrono::DateTime, + pub zones: Vec, +} + +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +pub struct DnsConfigZone { + pub zone_name: String, + pub records: HashMap>, +} + +#[allow(clippy::upper_case_acronyms)] +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] +#[serde(tag = "type", content = "data")] +pub enum DnsRecord { + A(Ipv4Addr), + AAAA(Ipv6Addr), + SRV(SRV), +} + +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] +#[serde(rename = "Srv")] +pub struct SRV { + pub prio: u16, + pub weight: u16, + pub port: u16, + pub target: String, +} diff --git a/dns-server/Cargo.toml b/dns-server/Cargo.toml index 237d2a2fbb..d11dabaf85 100644 --- a/dns-server/Cargo.toml +++ b/dns-server/Cargo.toml @@ -12,6 +12,7 @@ anyhow.workspace = true camino.workspace = true chrono.workspace = true clap.workspace = true +dns-server-api.workspace = true dns-service-client.workspace = true dropshot.workspace = true http.workspace = true diff --git a/dns-server/src/bin/apigen.rs b/dns-server/src/bin/apigen.rs deleted file mode 100644 index e130ee0211..0000000000 --- a/dns-server/src/bin/apigen.rs +++ /dev/null @@ -1,29 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Generate the OpenAPI spec for the DNS server - -use anyhow::{bail, Result}; -use dns_server::http_server::api; -use std::fs::File; -use std::io; - -fn usage(args: &[String]) -> String { - format!("{} [output path]", args[0]) -} - -fn main() -> Result<()> { - let args: Vec = std::env::args().collect(); - - let mut out = match args.len() { - 1 => Box::new(io::stdout()) as Box, - 2 => Box::new(File::create(args[1].clone())?) as Box, - _ => bail!(usage(&args)), - }; - - let api = api(); - let openapi = api.openapi("Internal DNS", "v0.1.0"); - openapi.write(&mut out)?; - Ok(()) -} diff --git a/dns-server/src/dns_server.rs b/dns-server/src/dns_server.rs index 01a8430b62..5c761f2aa3 100644 --- a/dns-server/src/dns_server.rs +++ b/dns-server/src/dns_server.rs @@ -7,12 +7,12 @@ //! The facilities here handle binding a UDP socket, receiving DNS messages on //! that socket, and replying to them. 
-use crate::dns_types::DnsRecord; use crate::storage; use crate::storage::QueryError; use crate::storage::Store; use anyhow::anyhow; use anyhow::Context; +use dns_server_api::DnsRecord; use pretty_hex::*; use serde::Deserialize; use slog::{debug, error, info, o, trace, Logger}; @@ -234,12 +234,7 @@ fn dns_record_to_record( Ok(aaaa) } - DnsRecord::SRV(crate::dns_types::SRV { - prio, - weight, - port, - target, - }) => { + DnsRecord::SRV(dns_server_api::SRV { prio, weight, port, target }) => { let tgt = Name::from_str(&target).map_err(|error| { RequestError::ServFail(anyhow!( "serialization failed due to bad SRV target {:?}: {:#}", diff --git a/dns-server/src/dns_types.rs b/dns-server/src/dns_types.rs deleted file mode 100644 index 941124feb6..0000000000 --- a/dns-server/src/dns_types.rs +++ /dev/null @@ -1,50 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! types describing DNS records and configuration - -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::net::Ipv4Addr; -use std::net::Ipv6Addr; - -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] -pub struct DnsConfigParams { - pub generation: u64, - pub time_created: chrono::DateTime, - pub zones: Vec, -} - -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] -pub struct DnsConfig { - pub generation: u64, - pub time_created: chrono::DateTime, - pub time_applied: chrono::DateTime, - pub zones: Vec, -} - -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] -pub struct DnsConfigZone { - pub zone_name: String, - pub records: HashMap>, -} - -#[allow(clippy::upper_case_acronyms)] -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] -#[serde(tag = "type", content = "data")] -pub enum DnsRecord { - A(Ipv4Addr), - AAAA(Ipv6Addr), - SRV(SRV), -} - -#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema, PartialEq)] -#[serde(rename = "Srv")] -pub struct SRV { - pub prio: u16, - pub weight: u16, - pub port: u16, - pub target: String, -} diff --git a/dns-server/src/http_server.rs b/dns-server/src/http_server.rs index e50346d828..84ffbc90e9 100644 --- a/dns-server/src/http_server.rs +++ b/dns-server/src/http_server.rs @@ -4,102 +4,12 @@ //! Dropshot server for configuring DNS namespace -// Shape of the API -// ------------------------------ -// -// The DNS configuration API has just two endpoints: PUT and GET of the entire -// DNS configuration. This is pretty anti-REST. But it's important to think -// about how this server fits into the rest of the system. When changes are -// made to DNS data, they're grouped together and assigned a monotonically -// increasing generation number. The DNS data is first stored into CockroachDB -// and then propagated from a distributed fleet of Nexus instances to a -// distributed fleet of these DNS servers. If we accepted individual updates to -// DNS names, then propagating a particular change would be non-atomic, and -// Nexus would have to do a lot more work to ensure (1) that all changes were -// propagated (even if it crashes) and (2) that they were propagated in the -// correct order (even if two Nexus instances concurrently propagate separate -// changes). -// -// This DNS server supports hosting multiple zones. We could imagine supporting -// separate endpoints to update the DNS data for a particular zone. 
That feels -// nicer (although it's not clear what it would buy us). But as with updates to -// multiple names, Nexus's job is potentially much easier if the entire state -// for all zones is updated at once. (Otherwise, imagine how Nexus would -// implement _renaming_ one zone to another without loss of service. With -// a combined endpoint and generation number for all zones, all that's necessary -// is to configure a new zone with all the same names, and then remove the old -// zone later in another update. That can be managed by the same mechanism in -// Nexus that manages regular name updates. On the other hand, if there were -// separate endpoints with separate generation numbers, then Nexus has more to -// keep track of in order to do the rename safely.) -// -// See RFD 367 for more on DNS propagation. -// -// -// ETags and Conditional Requests -// ------------------------------ -// -// It's idiomatic in HTTP use ETags and conditional requests to provide -// synchronization. We could define an ETag to be just the current generation -// number of the server and honor standard `if-match` headers to fail requests -// where the generation number doesn't match what the client expects. This -// would be fine, but it's rather annoying: -// -// (1) When the client wants to propagate generation X, the client would have -// make an extra request just to fetch the current ETag, just so it can put -// it into the conditional request. -// -// (2) If some other client changes the configuration in the meantime, the -// conditional request would fail and the client would have to take another -// lap (fetching the current config and potentially making another -// conditional PUT). -// -// (3) This approach would make synchronization opt-in. If a client (or just -// one errant code path) neglected to set the if-match header, we could do -// the wrong thing and cause the system to come to rest with the wrong DNS -// data. -// -// Since the semantics here are so simple (we only ever want to move the -// generation number forward), we don't bother with ETags or conditional -// requests. Instead we have the server implement the behavior we want, which -// is that when a request comes in to update DNS data to generation X, the -// server replies with one of: -// -// (1) the update has been applied and the server is now running generation X -// (client treats this as success) -// -// (2) the update was not applied because the server is already at generation X -// (client treats this as success) -// -// (3) the update was not applied because the server is already at a newer -// generation -// (client probably starts the whole propagation process over because its -// current view of the world is out of date) -// -// This way, the DNS data can never move backwards and the client only ever has -// to make one request. -// -// -// Concurrent updates -// ------------------ -// -// Given that we've got just one API to update the all DNS zones, and given -// that might therefore take a minute for a large zone, and also that there may -// be multiple Nexus instances trying to do it at the same time, we need to -// think a bit about what should happen if two Nexus do try to do it at the same -// time. Spoiler: we immediately fail any request to update the DNS data if -// there's already an update in progress. -// -// What else could we do? We could queue the incoming request behind the -// in-progress one. How large do we allow that queue to grow? At some point -// we'll need to stop queueing them. 
So why bother at all? - -use crate::dns_types::{DnsConfig, DnsConfigParams}; use crate::storage::{self, UpdateError}; +use dns_server_api::{DnsConfig, DnsConfigParams, DnsServerApi}; use dns_service_client::{ ERROR_CODE_BAD_UPDATE_GENERATION, ERROR_CODE_UPDATE_IN_PROGRESS, }; -use dropshot::{endpoint, RequestContext}; +use dropshot::RequestContext; pub struct Context { store: storage::Store, @@ -112,41 +22,40 @@ impl Context { } pub fn api() -> dropshot::ApiDescription { - let mut api = dropshot::ApiDescription::new(); - - api.register(dns_config_get).expect("register dns_config_get"); - api.register(dns_config_put).expect("register dns_config_update"); - api + dns_server_api::dns_server_api::api_description::() + .expect("registered DNS server entrypoints") } -#[endpoint( - method = GET, - path = "/config", -)] -async fn dns_config_get( - rqctx: RequestContext, -) -> Result, dropshot::HttpError> { - let apictx = rqctx.context(); - let config = apictx.store.dns_config().await.map_err(|e| { - dropshot::HttpError::for_internal_error(format!( - "internal error: {:?}", - e - )) - })?; - Ok(dropshot::HttpResponseOk(config)) -} +enum DnsServerApiImpl {} + +impl DnsServerApi for DnsServerApiImpl { + type Context = Context; -#[endpoint( - method = PUT, - path = "/config", -)] -async fn dns_config_put( - rqctx: RequestContext, - rq: dropshot::TypedBody, -) -> Result { - let apictx = rqctx.context(); - apictx.store.dns_config_update(&rq.into_inner(), &rqctx.request_id).await?; - Ok(dropshot::HttpResponseUpdatedNoContent()) + async fn dns_config_get( + rqctx: RequestContext, + ) -> Result, dropshot::HttpError> { + let apictx = rqctx.context(); + let config = apictx.store.dns_config().await.map_err(|e| { + dropshot::HttpError::for_internal_error(format!( + "internal error: {:?}", + e + )) + })?; + Ok(dropshot::HttpResponseOk(config)) + } + + async fn dns_config_put( + rqctx: RequestContext, + rq: dropshot::TypedBody, + ) -> Result + { + let apictx = rqctx.context(); + apictx + .store + .dns_config_update(&rq.into_inner(), &rqctx.request_id) + .await?; + Ok(dropshot::HttpResponseUpdatedNoContent()) + } } impl From for dropshot::HttpError { diff --git a/dns-server/src/lib.rs b/dns-server/src/lib.rs index ea8625a667..a2b1fda0d7 100644 --- a/dns-server/src/lib.rs +++ b/dns-server/src/lib.rs @@ -43,7 +43,6 @@ //! the persistent DNS data pub mod dns_server; -pub mod dns_types; pub mod http_server; pub mod storage; diff --git a/dns-server/src/storage.rs b/dns-server/src/storage.rs index 21fb9ebdc6..85b2e79b8b 100644 --- a/dns-server/src/storage.rs +++ b/dns-server/src/storage.rs @@ -92,9 +92,9 @@ // backwards-compatible way (but obviously one wouldn't get the scaling benefits // while continuing to use the old API). 
-use crate::dns_types::{DnsConfig, DnsConfigParams, DnsConfigZone, DnsRecord}; use anyhow::{anyhow, Context}; use camino::Utf8PathBuf; +use dns_server_api::{DnsConfig, DnsConfigParams, DnsConfigZone, DnsRecord}; use serde::{Deserialize, Serialize}; use sled::transaction::ConflictableTransactionError; use slog::{debug, error, info, o, warn}; @@ -777,13 +777,13 @@ impl<'a, 'b> Drop for UpdateGuard<'a, 'b> { #[cfg(test)] mod test { use super::{Config, Store, UpdateError}; - use crate::dns_types::DnsConfigParams; - use crate::dns_types::DnsConfigZone; - use crate::dns_types::DnsRecord; use crate::storage::QueryError; use anyhow::Context; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; + use dns_server_api::DnsConfigParams; + use dns_server_api::DnsConfigZone; + use dns_server_api::DnsRecord; use omicron_test_utils::dev::test_setup_log; use std::collections::BTreeSet; use std::collections::HashMap; diff --git a/dns-server/tests/openapi_test.rs b/dns-server/tests/openapi_test.rs deleted file mode 100644 index 490680eda4..0000000000 --- a/dns-server/tests/openapi_test.rs +++ /dev/null @@ -1,27 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use expectorate::assert_contents; -use omicron_test_utils::dev::test_cmds::assert_exit_code; -use omicron_test_utils::dev::test_cmds::path_to_executable; -use omicron_test_utils::dev::test_cmds::run_command; -use omicron_test_utils::dev::test_cmds::EXIT_SUCCESS; -use openapiv3::OpenAPI; -use subprocess::Exec; - -const CMD_API_GEN: &str = env!("CARGO_BIN_EXE_apigen"); - -#[test] -fn test_dns_server_openapi() { - let exec = Exec::cmd(path_to_executable(CMD_API_GEN)); - let (exit_status, stdout, stderr) = run_command(exec); - assert_exit_code(exit_status, EXIT_SUCCESS, &stderr); - - let spec: OpenAPI = - serde_json::from_str(&stdout).expect("stdout was not valid OpenAPI"); - let errors = openapi_lint::validate(&spec); - assert!(errors.is_empty(), "{}", errors.join("\n\n")); - - assert_contents("../openapi/dns-server.json", &stdout); -} diff --git a/openapi/dns-server.json b/openapi/dns-server.json index 1b02199b76..0252c1538a 100644 --- a/openapi/dns-server.json +++ b/openapi/dns-server.json @@ -2,7 +2,12 @@ "openapi": "3.0.3", "info": { "title": "Internal DNS", - "version": "v0.1.0" + "description": "API for the internal DNS server", + "contact": { + "url": "https://oxide.computer", + "email": "api@oxide.computer" + }, + "version": "0.0.1" }, "paths": { "/config": { From a610a84f8a70841c60090cac78161428e934b90c Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:30:31 +0000 Subject: [PATCH 13/27] Update taiki-e/install-action digest to 996330b (#6082) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`0256b3e` -> `996330b`](https://togithub.com/taiki-e/install-action/compare/0256b3e...996330b) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. 
â™» **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index a9beb49ed5..6e847ce8c4 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@0256b3ea9ae3d751755a35cbb0608979a842f1d2 # v2 + uses: taiki-e/install-action@996330bfc2ff267dc45a3d59354705b61547df0b # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 88246d9250f991907b74868bce0977fdb04002e2 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 15 Jul 2024 11:18:00 -0700 Subject: [PATCH 14/27] partially fix up how-to-run-simulated instructions (#6075) --- docs/how-to-run-simulated.adoc | 97 +++++++++++++++- nexus-config/src/nexus_config.rs | 6 + nexus/examples/config-second.toml | 180 ++++++++++++++++++++++++++++++ nexus/test-utils/src/lib.rs | 6 +- nexus/tests/config.test.toml | 2 +- 5 files changed, 286 insertions(+), 5 deletions(-) create mode 100644 nexus/examples/config-second.toml diff --git a/docs/how-to-run-simulated.adoc b/docs/how-to-run-simulated.adoc index de19b70f04..86f7a0915b 100644 --- a/docs/how-to-run-simulated.adoc +++ b/docs/how-to-run-simulated.adoc @@ -94,6 +94,10 @@ omicron-dev: external DNS: [::1]:54342 === Running the pieces by hand +There are many reasons it's useful to run the pieces of the stack by hand, especially during development and debugging: to test stopping and starting a component while the rest of the stack remains online; to run one component in a custom environment; to use a custom binary; to use a custom config file; to run under the debugger or with extra tracing enabled; etc. + +CAUTION: This process does not currently work. See https://github.com/oxidecomputer/omicron/issues/4421[omicron#4421] for details. The pieces here may still be useful for reference. + . Start CockroachDB using `omicron-dev db-run`: + [source,text] @@ -181,6 +185,8 @@ omicron-dev: using /tmp/.tmpFH6v8h and /tmp/.tmpkUjDji for ClickHouse data stora $ cargo run --bin=nexus -- nexus/examples/config.toml ---- Nexus can also serve the web console. Instructions for downloading (or building) the console's static assets and pointing Nexus to them are https://github.com/oxidecomputer/console/blob/main/docs/serve-from-nexus.md[here]. Without console assets, Nexus will still start and run normally as an API. A few link:./nexus/src/external_api/console_api.rs[console-specific routes] will 404. ++ +CAUTION: This step does not currently work. See https://github.com/oxidecomputer/omicron/issues/4421[omicron#4421] for details. . `dns-server` is run similar to Nexus, except that the bind addresses are specified on the command line: + @@ -207,9 +213,98 @@ Dec 02 18:00:01.093 DEBG registered endpoint, path: /producers, method: POST, lo ... 
---- +=== Using both `omicron-dev run-all` and running Nexus manually + +While it's often useful to run _some_ part of the stack by hand (see above), if you only want to run your own Nexus, one option is to run `omicron-dev run-all` first to get a whole simulated stack up, then run a second Nexus by hand with a custom config file. + +To do this, first run `omicron-dev run-all`: + +[source,text] +---- +$ cargo run --bin=omicron-dev -- run-all + Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.95s + Running `target/debug/omicron-dev run-all` +omicron-dev: setting up all services ... +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.29765.0.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.29765.0.log" +DB URL: postgresql://root@[::1]:43256/omicron?sslmode=disable +DB address: [::1]:43256 +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.29765.2.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.29765.2.log" +log file: /dangerzone/omicron_tmp/omicron-dev-omicron-dev.29765.3.log +note: configured to log to "/dangerzone/omicron_tmp/omicron-dev-omicron-dev.29765.3.log" +omicron-dev: services are running. +omicron-dev: nexus external API: 127.0.0.1:12220 +omicron-dev: nexus internal API: [::1]:12221 +omicron-dev: cockroachdb pid: 29769 +omicron-dev: cockroachdb URL: postgresql://root@[::1]:43256/omicron?sslmode=disable +omicron-dev: cockroachdb directory: /dangerzone/omicron_tmp/.tmpikyLO8 +omicron-dev: internal DNS HTTP: http://[::1]:39841 +omicron-dev: internal DNS: [::1]:54025 +omicron-dev: external DNS name: oxide-dev.test +omicron-dev: external DNS HTTP: http://[::1]:63482 +omicron-dev: external DNS: [::1]:45276 +omicron-dev: e.g. `dig @::1 -p 45276 test-suite-silo.sys.oxide-dev.test` +omicron-dev: management gateway: http://[::1]:49188 (switch0) +omicron-dev: management gateway: http://[::1]:39352 (switch1) +omicron-dev: silo name: test-suite-silo +omicron-dev: privileged user name: test-privileged +---- + +You'll need to note: + +* the TCP ports for the two management gateways (`49188` and `39352` here for switch0 and switch1, respectively) +* the TCP port for internal DNS (`54025` here) +* the TCP port in the CockroachDB URL (`43256` here) + +Next, you'll need to customize the Nexus configuration file. Start with nexus/examples/config-second.toml (_not_ nexus/examples/config.toml, which uses various values that conflict with what `omicron-dev run-all` uses). You should only need to modify the block at the **bottom** of the file: + +[source,toml] +---- +################################################################################ +# INSTRUCTIONS: To run Nexus against an existing stack started with # +# `omicron-dev run-all`, you should only have to modify values in this # +# section. # +# # +# Modify the port numbers below based on the output of `omicron-dev run-all` # +################################################################################ + +[mgd] +# Look for "management gateway: http://[::1]:49188 (switch0)" +# The "http://" does not go in this string -- just the socket address. +switch0.address = "[::1]:49188" + +# Look for "management gateway: http://[::1]:39352 (switch1)" +# The "http://" does not go in this string -- just the socket address. +switch1.address = "[::1]:39352" + +[deployment.internal_dns] +# Look for "internal DNS: [::1]:54025" +# and adjust the port number below. +address = "[::1]:54025" +# You should not need to change this. 
+type = "from_address" + +[deployment.database] +# Look for "cockroachdb URL: postgresql://root@[::1]:43256/omicron?sslmode=disable" +# and adjust the port number below. +url = "postgresql://root@[::1]:43256/omicron?sslmode=disable" +# You should not need to change this. +type = "from_url" +################################################################################ +---- + +So it's: + +* Copy the example config file: `cp nexus/examples/config-second.toml config-second.toml` +* Edit as described above: `vim config-second.toml` +* Start Nexus like above, but with this config file: `cargo run --bin=nexus -- config-second.toml` + +=== Using the stack + Once everything is up and running, you can use the system in a few ways: -* Use the browser-based console. The Nexus log output will show what IP address and port it's listening on. This is also configured in the config file. If you're using the defaults, you can reach the console at `http://127.0.0.1:12220/projects`. Depending on the environment where you're running this, you may need an ssh tunnel or the like to reach this from your browser. +* Use the browser-based console. The Nexus log output will show what IP address and port it's listening on. This is also configured in the config file. If you're using the defaults with `omicron-dev run-all`, you can reach the console at `http://127.0.0.1:12220/projects`. If you ran a second Nexus using the `config-second.toml` config file, it will be on port `12222` instead (because that config file specifies port 12222). Depending on the environment where you're running this, you may need an ssh tunnel or the like to reach this from your browser. * Use the xref:cli.adoc[`oxide` CLI]. == Running with TLS diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 5ca1d2d6ed..4bdee4ab4e 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -1174,6 +1174,12 @@ mod test { let example_config = NexusConfig::from_file(config_path) .expect("example config file is not valid"); + // The second example config file should be valid. + let config_path = "../nexus/examples/config-second.toml"; + println!("checking {:?}", config_path); + let _ = NexusConfig::from_file(config_path) + .expect("second example config file is not valid"); + // The config file used for the tests should also be valid. The tests // won't clear the runway anyway if this file isn't valid. But it's // helpful to verify this here explicitly as well. diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml new file mode 100644 index 0000000000..5dadb329cd --- /dev/null +++ b/nexus/examples/config-second.toml @@ -0,0 +1,180 @@ +# +# Example configuration file for running a second Nexus instance locally +# alongside the stack started by `omicron-dev run-all`. See the +# how-to-run-simulated instructions for details. +# + +################################################################################ +# INSTRUCTIONS: To run Nexus against an existing stack started with # +# `omicron-dev run-all`, see the very bottom of this file. # +################################################################################ + +[console] +# Directory for static assets. Absolute path or relative to CWD. +static_dir = "out/console-assets" +session_idle_timeout_minutes = 480 # 8 hours +session_absolute_timeout_minutes = 1440 # 24 hours + +# List of authentication schemes to support. 
+[authn] +schemes_external = ["session_cookie", "access_token"] + +[log] +# Show log messages of this level and more severe +level = "info" + +# Example output to a terminal (with colors) +mode = "stderr-terminal" + +# Example output to a file, appending if it already exists. +#mode = "file" +#path = "logs/server.log" +#if_exists = "append" + +# Configuration for interacting with the timeseries database +[timeseries_db] +address = "[::1]:8123" + + + +[deployment] +# Identifier for this instance of Nexus +id = "a4ef738a-1fb0-47b1-9da2-4919c7ec7c7f" +rack_id = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc" +# Since we expect to be the second instance of Nexus running on this system, +# pick any available port. +techport_external_server_port = 0 + +# Nexus may need to resolve external hosts (e.g. to grab IdP metadata). +# These are the DNS servers it should use. +external_dns_servers = ["1.1.1.1", "9.9.9.9"] + +[deployment.dropshot_external] +# IP Address and TCP port on which to listen for the external API +# This config file uses 12222 to avoid colliding with the usual 12220 that's +# used by `omicron-dev run-all` +bind_address = "127.0.0.1:12222" +# Allow large request bodies to support uploading TUF archives. The number here +# is picked based on the typical size for tuf-mupdate.zip as of 2024-01 +# (~1.5GiB) and multiplying it by 2. +# +# This should be brought back down to a more reasonable value once per-endpoint +# request body limits are implemented. +request_body_max_bytes = 3221225472 +# To have Nexus's external HTTP endpoint use TLS, uncomment the line below. You +# will also need to provide an initial TLS certificate during rack +# initialization. If you're using this config file, you're probably running a +# simulated system. In that case, the initial certificate is provided to the +# simulated sled agent (acting as RSS) via command-line arguments. +#tls = true + +[deployment.dropshot_internal] +# IP Address and TCP port on which to listen for the internal API +# This config file uses 12223 to avoid colliding with the usual 12221 that's +# used by `omicron-dev run-all` +bind_address = "[::1]:12223" +request_body_max_bytes = 1048576 + +#[deployment.internal_dns] +## These values are overridden at the bottom of this file. +#type = "from_address" +#address = "[::1]:3535" + +#[deployment.database] +## These values are overridden at the bottom of this file. +#type = "from_url" +#url = "postgresql://root@[::1]:32221/omicron?sslmode=disable" + +# Tunable configuration parameters, for testing or experimentation +[tunables] + +# The maximum allowed prefix (thus smallest size) for a VPC Subnet's +# IPv4 subnetwork. This size allows for ~60 hosts. +max_vpc_ipv4_subnet_prefix = 26 + +# Configuration for interacting with the dataplane daemon +[dendrite.switch0] +address = "[::1]:12224" + +[background_tasks] +dns_internal.period_secs_config = 60 +dns_internal.period_secs_servers = 60 +dns_internal.period_secs_propagation = 60 +dns_internal.max_concurrent_server_updates = 5 +dns_external.period_secs_config = 60 +dns_external.period_secs_servers = 60 +dns_external.period_secs_propagation = 60 +dns_external.max_concurrent_server_updates = 5 +metrics_producer_gc.period_secs = 60 +# How frequently we check the list of stored TLS certificates. This is +# approximately an upper bound on how soon after updating the list of +# certificates it will take _other_ Nexus instances to notice and stop serving +# them (on a sunny day). 
+external_endpoints.period_secs = 60 +nat_cleanup.period_secs = 30 +bfd_manager.period_secs = 30 +# How frequently to collect hardware/software inventory from the whole system +# (even if we don't have reason to believe anything has changed). +inventory.period_secs = 600 +# Maximum number of past collections to keep in the database +inventory.nkeep = 5 +# Disable inventory collection altogether (for emergencies) +inventory.disable = false +phantom_disks.period_secs = 30 +physical_disk_adoption.period_secs = 30 +blueprints.period_secs_load = 10 +blueprints.period_secs_execute = 60 +blueprints.period_secs_collect_crdb_node_ids = 180 +sync_service_zone_nat.period_secs = 30 +switch_port_settings_manager.period_secs = 30 +region_replacement.period_secs = 30 +region_replacement_driver.period_secs = 10 +# How frequently to query the status of active instances. +instance_watcher.period_secs = 30 +service_firewall_propagation.period_secs = 300 +v2p_mapping_propagation.period_secs = 30 +abandoned_vmm_reaper.period_secs = 60 +lookup_region_port.period_secs = 60 + +[default_region_allocation_strategy] +# allocate region on 3 random distinct zpools, on 3 random distinct sleds. +type = "random_with_distinct_sleds" + +# the same as random_with_distinct_sleds, but without requiring distinct sleds +# type = "random" + +# setting `seed` to a fixed value will make dataset selection ordering use the +# same shuffling order for every region allocation. +# seed = 0 + +################################################################################ +# INSTRUCTIONS: To run Nexus against an existing stack started with # +# `omicron-dev run-all`, you should only have to modify values in this # +# section. # +# # +# Modify the port numbers below based on the output of `omicron-dev run-all` # +################################################################################ + +[mgd] +# Look for "management gateway: http://[::1]:49188 (switch0)" +# The "http://" does not go in this string -- just the socket address. +switch0.address = "[::1]:49188" + +# Look for "management gateway: http://[::1]:39352 (switch1)" +# The "http://" does not go in this string -- just the socket address. +switch1.address = "[::1]:39352" + +[deployment.internal_dns] +# Look for "internal DNS: [::1]:54025" +# and adjust the port number below. +address = "[::1]:54025" +# You should not need to change this. +type = "from_address" + +[deployment.database] +# Look for "cockroachdb URL: postgresql://root@[::1]:43256/omicron?sslmode=disable" +# and adjust the port number below. +url = "postgresql://root@[::1]:43256/omicron?sslmode=disable" +# You should not need to change this. 
+type = "from_url" +################################################################################ diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 38cdac5fcb..18efe40e27 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -118,7 +118,7 @@ pub struct ControlPlaneTestContext { pub sled_agent2: sim::Server, pub oximeter: Oximeter, pub producer: ProducerServer, - pub gateway: HashMap, + pub gateway: BTreeMap, pub dendrite: HashMap, pub mgd: HashMap, pub external_dns_zone_name: String, @@ -280,7 +280,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> { pub sled_agent2: Option, pub oximeter: Option, pub producer: Option, - pub gateway: HashMap, + pub gateway: BTreeMap, pub dendrite: HashMap, pub mgd: HashMap, @@ -330,7 +330,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { sled_agent2: None, oximeter: None, producer: None, - gateway: HashMap::new(), + gateway: BTreeMap::new(), dendrite: HashMap::new(), mgd: HashMap::new(), nexus_internal: None, diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 8415a192b1..dfcaec2157 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -37,7 +37,7 @@ max_vpc_ipv4_subnet_prefix = 29 [deployment] # Identifier for this instance of Nexus. # NOTE: The test suite always overrides this. -id = "e6bff1ff-24fb-49dc-a54e-c6a350cd4d6c" +id = "913233fe-92a8-4635-9572-183f495429c4" rack_id = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc" techport_external_server_port = 0 From 9b0a23c6c0e753e0d7f808e2e97064f288200ca2 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:09:10 -0700 Subject: [PATCH 15/27] Update Rust crate syn to v2.0.71 (#6083) --- Cargo.lock | 152 +++++++++++++++++++------------------- workspace-hack/Cargo.toml | 4 +- 2 files changed, 78 insertions(+), 78 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9a09d1d61a..d9727d39d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,7 +166,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -273,7 +273,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -295,7 +295,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -306,7 +306,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -359,7 +359,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -517,7 +517,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.68", + "syn 2.0.71", "which", ] @@ -550,7 +550,7 @@ checksum = "1657dce144574f921af10a92876a96f0ca05dd830900598d21d91c8e4cf78f74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1048,7 +1048,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1539,7 +1539,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1563,7 +1563,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 
2.0.68", + "syn 2.0.71", ] [[package]] @@ -1574,7 +1574,7 @@ checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" dependencies = [ "darling_core", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1608,7 +1608,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1652,7 +1652,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1685,7 +1685,7 @@ checksum = "5fe87ce4529967e0ba1dcf8450bab64d97dfd5010a6256187ffe2e43e6f0e049" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1706,7 +1706,7 @@ checksum = "62d671cc41a825ebabc75757b62d3d168c577f9149b2d49ece1dad1f72119d25" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1727,7 +1727,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1737,7 +1737,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" dependencies = [ "derive_builder_core", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1750,7 +1750,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version 0.4.0", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1814,7 +1814,7 @@ dependencies = [ "diesel_table_macro_syntax", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -1823,7 +1823,7 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" dependencies = [ - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -2093,7 +2093,7 @@ dependencies = [ "quote", "serde", "serde_tokenstream", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -2508,7 +2508,7 @@ checksum = "1a5c6c585bc94aaf2c7b51dd4c2ba22680844aba4c687be581871a6f518c5742" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -2620,7 +2620,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -3918,7 +3918,7 @@ version = "0.1.0" source = "git+https://github.com/oxidecomputer/opte?rev=915975f6d1729db95619f752148974016912412f#915975f6d1729db95619f752148974016912412f" dependencies = [ "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -4411,7 +4411,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -4788,7 +4788,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -4986,7 +4986,7 @@ version = "0.1.0" dependencies = [ "omicron-workspace-hack", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -5157,7 +5157,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -6024,7 +6024,7 @@ dependencies = [ "string_cache", "subtle", "syn 1.0.109", - "syn 2.0.68", + "syn 2.0.71", "time", "time-macros", "tokio", @@ -6166,7 +6166,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -6317,7 +6317,7 @@ dependencies = [ "oximeter-macro-impl", "oximeter-timeseries-macro", "prettyplease", - "syn 2.0.68", + "syn 2.0.71", "toml 0.8.14", 
"uuid", ] @@ -6449,7 +6449,7 @@ dependencies = [ "serde_json", "slog-error-chain", "strum", - "syn 2.0.68", + "syn 2.0.71", "thiserror", "toml 0.8.14", "trybuild", @@ -6488,7 +6488,7 @@ dependencies = [ "omicron-workspace-hack", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -6524,7 +6524,7 @@ dependencies = [ "oximeter-impl", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -6680,7 +6680,7 @@ dependencies = [ "regex", "regex-syntax 0.8.3", "structmeta 0.3.0", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -6848,7 +6848,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -6918,7 +6918,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -7182,7 +7182,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" dependencies = [ "proc-macro2", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -7278,7 +7278,7 @@ dependencies = [ "schemars", "serde", "serde_json", - "syn 2.0.68", + "syn 2.0.71", "thiserror", "typify", "unicode-ident", @@ -7298,7 +7298,7 @@ dependencies = [ "serde_json", "serde_tokenstream", "serde_yaml", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -7789,7 +7789,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -8043,7 +8043,7 @@ dependencies = [ "regex", "relative-path", "rustc_version 0.4.0", - "syn 2.0.68", + "syn 2.0.71", "unicode-ident", ] @@ -8446,7 +8446,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -8472,7 +8472,7 @@ checksum = "7f81c2fde025af7e69b1d1420531c8a8811ca898919db177141a85313b1cb932" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -8601,7 +8601,7 @@ checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -8612,7 +8612,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -8662,7 +8662,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -8683,7 +8683,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -8725,7 +8725,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9073,7 +9073,7 @@ source = "git+https://github.com/oxidecomputer/slog-error-chain?branch=main#15f6 dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9200,7 +9200,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9326,7 +9326,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9336,7 +9336,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2ff9eaf853dec4c8802325d8b6d3dffa86cc707fd7a1a4cdbf416e13b061787a" dependencies = [ "quote", - "syn 2.0.68", + "syn 
2.0.71", ] [[package]] @@ -9422,7 +9422,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive 0.2.0", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9434,7 +9434,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive 0.3.0", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9445,7 +9445,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9456,7 +9456,7 @@ checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9491,7 +9491,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9504,7 +9504,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9551,9 +9551,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.68" +version = "2.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" +checksum = "b146dcf730474b4bcd16c311627b31ede9ab149045db4d6088b3becaea046462" dependencies = [ "proc-macro2", "quote", @@ -9727,7 +9727,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta 0.2.0", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9758,7 +9758,7 @@ checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9895,7 +9895,7 @@ checksum = "8d9ef545650e79f30233c0003bcc2504d7efac6dad25fca40744de773fe2049c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -9966,7 +9966,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -10243,7 +10243,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -10520,7 +10520,7 @@ dependencies = [ "semver 1.0.23", "serde", "serde_json", - "syn 2.0.68", + "syn 2.0.71", "thiserror", "unicode-ident", ] @@ -10537,7 +10537,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.68", + "syn 2.0.71", "typify-impl", ] @@ -10754,7 +10754,7 @@ dependencies = [ "proc-macro2", "quote", "serde_tokenstream", - "syn 2.0.68", + "syn 2.0.71", "usdt-impl", ] @@ -10772,7 +10772,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn 2.0.68", + "syn 2.0.71", "thiserror", "thread-id", "version_check", @@ -10788,7 +10788,7 @@ dependencies = [ "proc-macro2", "quote", "serde_tokenstream", - "syn 2.0.68", + "syn 2.0.71", "usdt-impl", ] @@ -10967,7 +10967,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", "wasm-bindgen-shared", ] @@ -11001,7 +11001,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -11582,7 +11582,7 @@ checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] @@ -11593,7 +11593,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn 
2.0.68", + "syn 2.0.71", ] [[package]] @@ -11613,7 +11613,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.71", ] [[package]] diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index cc12f6d032..9c9f47d735 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -102,7 +102,7 @@ smallvec = { version = "1.13.2", default-features = false, features = ["const_ne spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.68", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.71", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } tokio = { version = "1.38.0", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.10", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } @@ -208,7 +208,7 @@ spin = { version = "0.9.8" } string_cache = { version = "0.8.7" } subtle = { version = "2.5.0" } syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extra-traits", "fold", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.68", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.71", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] } time-macros = { version = "0.2.18", default-features = false, features = ["formatting", "parsing"] } tokio = { version = "1.38.0", features = ["full", "test-util"] } From b6857a11652c290b8731c2b81d8124f0594081d2 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 12:09:42 -0700 Subject: [PATCH 16/27] Update Rust crate tar to v0.4.41 (#6084) --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d9727d39d0..45490a3c62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4044,7 +4044,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19" dependencies = [ "cfg-if", - "windows-targets 0.52.5", + "windows-targets 0.48.5", ] [[package]] @@ -9634,9 +9634,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.40" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb" +checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" dependencies = [ "filetime", "libc", From 413620a3b8137988c5a742a2da41e127da588219 Mon Sep 17 00:00:00 2001 From: Rain Date: Mon, 15 Jul 2024 13:13:39 -0700 Subject: [PATCH 17/27] [installinator] convert artifact/progress API to a trait (#6074) To retain consistency with all of the other APIs, I've changed the name to just `installinator-api`. `artifact` was the case at first but the same API is now also used for progress reporting, so it's no longer quite accurate. We had this indirection going on that's no longer necessary. 
We also don't need to have the installinator-artifactd binary that generated the OpenAPI document, since the manager takes care of all of that. --- Cargo.lock | 35 ++-- Cargo.toml | 12 +- .../Cargo.toml | 2 +- .../src/lib.rs | 4 +- dev-tools/openapi-manager/Cargo.toml | 1 + dev-tools/openapi-manager/src/spec.rs | 35 ++-- .../Cargo.toml | 20 +-- installinator-api/src/lib.rs | 167 ++++++++++++++++++ .../src/bin/installinator-artifactd.rs | 38 ---- installinator-artifactd/src/context.rs | 13 -- .../src/http_entrypoints.rs | 115 ------------ installinator-artifactd/src/lib.rs | 29 --- installinator-artifactd/src/server.rs | 74 -------- installinator-artifactd/src/store.rs | 79 --------- .../tests/integration_tests/mod.rs | 5 - .../tests/integration_tests/openapi.rs | 39 ---- installinator-artifactd/tests/mod.rs | 17 -- .../tests/output/cmd-server-openapi-stderr | 0 installinator/Cargo.toml | 2 +- installinator/src/artifact.rs | 7 +- installinator/src/errors.rs | 2 +- installinator/src/mock_peers.rs | 8 +- installinator/src/peers.rs | 2 +- ...ator-artifactd.json => installinator.json} | 4 +- wicketd/Cargo.toml | 4 +- wicketd/src/artifacts.rs | 3 +- wicketd/src/artifacts/server.rs | 99 +++++++---- wicketd/src/bin/wicketd.rs | 5 +- wicketd/src/installinator_progress.rs | 2 +- wicketd/src/lib.rs | 64 ++++--- wicketd/tests/integration_tests/setup.rs | 7 +- 31 files changed, 351 insertions(+), 543 deletions(-) rename clients/{installinator-artifact-client => installinator-client}/Cargo.toml (92%) rename clients/{installinator-artifact-client => installinator-client}/src/lib.rs (91%) rename {installinator-artifactd => installinator-api}/Cargo.toml (55%) create mode 100644 installinator-api/src/lib.rs delete mode 100644 installinator-artifactd/src/bin/installinator-artifactd.rs delete mode 100644 installinator-artifactd/src/context.rs delete mode 100644 installinator-artifactd/src/http_entrypoints.rs delete mode 100644 installinator-artifactd/src/lib.rs delete mode 100644 installinator-artifactd/src/server.rs delete mode 100644 installinator-artifactd/src/store.rs delete mode 100644 installinator-artifactd/tests/integration_tests/mod.rs delete mode 100644 installinator-artifactd/tests/integration_tests/openapi.rs delete mode 100644 installinator-artifactd/tests/mod.rs delete mode 100644 installinator-artifactd/tests/output/cmd-server-openapi-stderr rename openapi/{installinator-artifactd.json => installinator.json} (99%) diff --git a/Cargo.lock b/Cargo.lock index 45490a3c62..459939ce50 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3618,7 +3618,7 @@ dependencies = [ "hex-literal", "http 0.2.12", "illumos-utils", - "installinator-artifact-client", + "installinator-client", "installinator-common", "ipcc", "itertools 0.12.1", @@ -3650,43 +3650,35 @@ dependencies = [ ] [[package]] -name = "installinator-artifact-client" +name = "installinator-api" version = "0.1.0" dependencies = [ + "anyhow", + "dropshot", + "hyper 0.14.28", "installinator-common", + "omicron-common", "omicron-workspace-hack", - "progenitor", - "regress", - "reqwest", "schemars", "serde", - "serde_json", "slog", - "update-engine", "uuid", ] [[package]] -name = "installinator-artifactd" +name = "installinator-client" version = "0.1.0" dependencies = [ - "anyhow", - "async-trait", - "clap", - "dropshot", - "expectorate", - "hyper 0.14.28", "installinator-common", - "omicron-common", - "omicron-test-utils", "omicron-workspace-hack", - "openapi-lint", - "openapiv3", + "progenitor", + "regress", + "reqwest", "schemars", "serde", "serde_json", 
"slog", - "subprocess", + "update-engine", "uuid", ] @@ -6122,6 +6114,7 @@ dependencies = [ "dropshot", "fs-err", "indent_write", + "installinator-api", "nexus-internal-api", "omicron-workspace-hack", "openapi-lint", @@ -11188,8 +11181,8 @@ dependencies = [ "hyper 0.14.28", "illumos-utils", "installinator", - "installinator-artifact-client", - "installinator-artifactd", + "installinator-api", + "installinator-client", "installinator-common", "internal-dns", "itertools 0.12.1", diff --git a/Cargo.toml b/Cargo.toml index a44e69e1e5..379aa7f549 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ members = [ "clients/dns-service-client", "clients/dpd-client", "clients/gateway-client", - "clients/installinator-artifact-client", + "clients/installinator-client", "clients/nexus-client", "clients/oxide-client", "clients/oximeter-client", @@ -32,7 +32,7 @@ members = [ "gateway-test-utils", "gateway", "illumos-utils", - "installinator-artifactd", + "installinator-api", "installinator-common", "installinator", "internal-dns-cli", @@ -101,7 +101,7 @@ default-members = [ "clients/dns-service-client", "clients/dpd-client", "clients/gateway-client", - "clients/installinator-artifact-client", + "clients/installinator-client", "clients/nexus-client", "clients/oxide-client", "clients/oximeter-client", @@ -127,7 +127,7 @@ default-members = [ "gateway-test-utils", "gateway", "illumos-utils", - "installinator-artifactd", + "installinator-api", "installinator-common", "installinator", "internal-dns-cli", @@ -321,8 +321,8 @@ indent_write = "2.2.0" indexmap = "2.2.6" indicatif = { version = "0.17.8", features = ["rayon"] } installinator = { path = "installinator" } -installinator-artifactd = { path = "installinator-artifactd" } -installinator-artifact-client = { path = "clients/installinator-artifact-client" } +installinator-api = { path = "installinator-api" } +installinator-client = { path = "clients/installinator-client" } installinator-common = { path = "installinator-common" } internal-dns = { path = "internal-dns" } ipcc = { path = "ipcc" } diff --git a/clients/installinator-artifact-client/Cargo.toml b/clients/installinator-client/Cargo.toml similarity index 92% rename from clients/installinator-artifact-client/Cargo.toml rename to clients/installinator-client/Cargo.toml index f1e896864f..ca2de0476a 100644 --- a/clients/installinator-artifact-client/Cargo.toml +++ b/clients/installinator-client/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "installinator-artifact-client" +name = "installinator-client" version = "0.1.0" edition = "2021" license = "MPL-2.0" diff --git a/clients/installinator-artifact-client/src/lib.rs b/clients/installinator-client/src/lib.rs similarity index 91% rename from clients/installinator-artifact-client/src/lib.rs rename to clients/installinator-client/src/lib.rs index 96806c2cab..a39ff3ff80 100644 --- a/clients/installinator-artifact-client/src/lib.rs +++ b/clients/installinator-client/src/lib.rs @@ -2,10 +2,10 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Interface for making API requests to installinator-artifactd. +//! Interface for installinator to make API requests. 
progenitor::generate_api!( - spec = "../../openapi/installinator-artifactd.json", + spec = "../../openapi/installinator.json", inner_type = slog::Logger, pre_hook = (|log: &slog::Logger, request: &reqwest::Request| { slog::debug!(log, "client request"; diff --git a/dev-tools/openapi-manager/Cargo.toml b/dev-tools/openapi-manager/Cargo.toml index 1534181e9c..db3152c604 100644 --- a/dev-tools/openapi-manager/Cargo.toml +++ b/dev-tools/openapi-manager/Cargo.toml @@ -16,6 +16,7 @@ dns-server-api.workspace = true dropshot.workspace = true fs-err.workspace = true indent_write.workspace = true +installinator-api.workspace = true nexus-internal-api.workspace = true omicron-workspace-hack.workspace = true openapiv3.workspace = true diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index 53f3260ca9..5ad991e353 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -15,23 +15,34 @@ use openapiv3::OpenAPI; pub fn all_apis() -> Vec { vec![ ApiSpec { - title: "Internal DNS".to_string(), - version: "0.0.1".to_string(), - description: "API for the internal DNS server".to_string(), + title: "Internal DNS", + version: "0.0.1", + description: "API for the internal DNS server", boundary: ApiBoundary::Internal, api_description: dns_server_api::dns_server_api::stub_api_description, - filename: "dns-server.json".to_string(), + filename: "dns-server.json", extra_validation: None, }, ApiSpec { - title: "Nexus internal API".to_string(), - version: "0.0.1".to_string(), - description: "Nexus internal API".to_string(), + title: "Installinator API", + version: "0.0.1", + description: "API for installinator to fetch artifacts \ + and report progress", + boundary: ApiBoundary::Internal, + api_description: + installinator_api::installinator_api::stub_api_description, + filename: "installinator.json", + extra_validation: None, + }, + ApiSpec { + title: "Nexus internal API", + version: "0.0.1", + description: "Nexus internal API", boundary: ApiBoundary::Internal, api_description: nexus_internal_api::nexus_internal_api_mod::stub_api_description, - filename: "nexus-internal.json".to_string(), + filename: "nexus-internal.json", extra_validation: None, }, // Add your APIs here! Please keep this list sorted by filename. @@ -40,13 +51,13 @@ pub fn all_apis() -> Vec { pub struct ApiSpec { /// The title. - pub title: String, + pub title: &'static str, /// The version. - pub version: String, + pub version: &'static str, /// The description string. - pub description: String, + pub description: &'static str, /// Whether this API is internal or external. pub boundary: ApiBoundary, @@ -57,7 +68,7 @@ pub struct ApiSpec { fn() -> Result, ApiDescriptionBuildErrors>, /// The JSON filename to write the API description to. - pub filename: String, + pub filename: &'static str, /// Extra validation to perform on the OpenAPI spec, if any. 
pub extra_validation: Option anyhow::Result<()>>, diff --git a/installinator-artifactd/Cargo.toml b/installinator-api/Cargo.toml similarity index 55% rename from installinator-artifactd/Cargo.toml rename to installinator-api/Cargo.toml index 236ea7a51c..52db4362c6 100644 --- a/installinator-artifactd/Cargo.toml +++ b/installinator-api/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "installinator-artifactd" +name = "installinator-api" version = "0.1.0" edition = "2021" license = "MPL-2.0" @@ -9,24 +9,12 @@ workspace = true [dependencies] anyhow.workspace = true -async-trait.workspace = true -clap.workspace = true dropshot.workspace = true hyper.workspace = true +installinator-common.workspace = true +omicron-common.workspace = true +omicron-workspace-hack.workspace = true schemars.workspace = true serde.workspace = true -serde_json.workspace = true slog.workspace = true uuid.workspace = true - -installinator-common.workspace = true -omicron-common.workspace = true -omicron-workspace-hack.workspace = true - -[dev-dependencies] -expectorate.workspace = true -omicron-test-utils.workspace = true -openapiv3.workspace = true -openapi-lint.workspace = true -serde_json.workspace = true -subprocess.workspace = true diff --git a/installinator-api/src/lib.rs b/installinator-api/src/lib.rs new file mode 100644 index 0000000000..cd87643a66 --- /dev/null +++ b/installinator-api/src/lib.rs @@ -0,0 +1,167 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The REST API that installinator is a client of. +//! +//! Note that most of our APIs are named by their server. This one is instead +//! named by the client, since it is expected that multiple services will +//! implement it. + +use anyhow::{anyhow, Result}; +use dropshot::{ + ConfigDropshot, FreeformBody, HandlerTaskMode, HttpError, + HttpResponseHeaders, HttpResponseOk, HttpResponseUpdatedNoContent, + HttpServerStarter, Path, RequestContext, TypedBody, +}; +use hyper::{header, Body, StatusCode}; +use installinator_common::EventReport; +use omicron_common::update::ArtifactHashId; +use schemars::JsonSchema; +use serde::Deserialize; +use uuid::Uuid; + +#[derive(Debug, Deserialize, JsonSchema)] +pub struct ReportQuery { + /// A unique identifier for the update. + pub update_id: Uuid, +} + +#[dropshot::api_description] +pub trait InstallinatorApi { + type Context; + + /// Fetch an artifact by hash. + #[endpoint { + method = GET, + path = "/artifacts/by-hash/{kind}/{hash}", + }] + async fn get_artifact_by_hash( + rqctx: RequestContext, + path: Path, + ) -> Result>, HttpError>; + + /// Report progress and completion to the server. + /// + /// This method requires an `update_id` path parameter. This update ID is + /// matched against the server currently performing an update. If the + /// server is unaware of the update ID, it will return an HTTP 422 + /// Unprocessable Entity code. + #[endpoint { + method = POST, + path = "/report-progress/{update_id}", + }] + async fn report_progress( + rqctx: RequestContext, + path: Path, + report: TypedBody, + ) -> Result; +} + +/// Add a content length header to a response. +/// +/// Intended to be called by `get_artifact_by_hash` implementations. 
+pub fn body_to_artifact_response( + size: u64, + body: Body, +) -> HttpResponseHeaders> { + let mut response = + HttpResponseHeaders::new_unnamed(HttpResponseOk(body.into())); + let headers = response.headers_mut(); + headers.append(header::CONTENT_LENGTH, size.into()); + response +} + +/// The result of processing an installinator event report. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] +#[must_use] +pub enum EventReportStatus { + /// This report was processed by the server. + Processed, + + /// The update ID was not recognized by the server. + UnrecognizedUpdateId, + + /// The progress receiver is closed. + ReceiverClosed, +} + +impl EventReportStatus { + /// Convert this status to an HTTP result. + /// + /// Intended to be called by `report_progress` implementations. + pub fn to_http_result( + self, + update_id: Uuid, + ) -> Result { + match self { + EventReportStatus::Processed => Ok(HttpResponseUpdatedNoContent()), + EventReportStatus::UnrecognizedUpdateId => { + Err(HttpError::for_client_error( + None, + StatusCode::UNPROCESSABLE_ENTITY, + format!( + "update ID {update_id} unrecognized by this server" + ), + )) + } + EventReportStatus::ReceiverClosed => { + Err(HttpError::for_client_error( + None, + StatusCode::GONE, + format!("update ID {update_id}: receiver closed"), + )) + } + } + } +} + +/// Creates a default `ConfigDropshot` for the installinator API. +pub fn default_config(bind_address: std::net::SocketAddr) -> ConfigDropshot { + ConfigDropshot { + bind_address, + // Even though the installinator sets an upper bound on the number of + // items in a progress report, they can get pretty large if they + // haven't gone through for a bit. Ensure that hitting the max request + // size won't cause a failure by setting a generous upper bound for the + // request size. + // + // TODO: replace with an endpoint-specific option once + // https://github.com/oxidecomputer/dropshot/pull/618 lands and is + // available in omicron. + request_body_max_bytes: 4 * 1024 * 1024, + default_handler_task_mode: HandlerTaskMode::Detached, + } +} + +/// Make an `HttpServerStarter` for the installinator API with default settings. +pub fn make_server_starter( + context: T::Context, + bind_address: std::net::SocketAddr, + log: &slog::Logger, +) -> Result> { + let dropshot_config = dropshot::ConfigDropshot { + bind_address, + // Even though the installinator sets an upper bound on the number + // of items in a progress report, they can get pretty large if they + // haven't gone through for a bit. Ensure that hitting the max + // request size won't cause a failure by setting a generous upper + // bound for the request size. + // + // TODO: replace with an endpoint-specific option once + // https://github.com/oxidecomputer/dropshot/pull/618 lands and is + // available in omicron. 
+ request_body_max_bytes: 4 * 1024 * 1024, + default_handler_task_mode: HandlerTaskMode::Detached, + }; + + let api = crate::installinator_api::api_description::()?; + let server = + dropshot::HttpServerStarter::new(&dropshot_config, api, context, &log) + .map_err(|error| { + anyhow!(error) + .context("failed to create installinator artifact server") + })?; + + Ok(server) +} diff --git a/installinator-artifactd/src/bin/installinator-artifactd.rs b/installinator-artifactd/src/bin/installinator-artifactd.rs deleted file mode 100644 index abe63bbe31..0000000000 --- a/installinator-artifactd/src/bin/installinator-artifactd.rs +++ /dev/null @@ -1,38 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Executable that generates OpenAPI definitions for the installinator artifact server. - -use anyhow::Result; -use clap::Parser; -use omicron_common::cmd::CmdError; - -#[derive(Debug, Parser)] -#[clap(name = "installinator-artifactd")] -enum Args { - /// Print the external OpenAPI Spec document and exit - Openapi, - // NOTE: this server is not intended to be run as a standalone service. Instead, it should be - // embedded as part of other servers (e.g. wicketd). -} - -fn main() { - if let Err(cmd_error) = do_run() { - omicron_common::cmd::fatal(cmd_error); - } -} - -fn do_run() -> Result<(), CmdError> { - let args = Args::parse(); - - match args { - Args::Openapi => { - installinator_artifactd::run_openapi().map_err(|error| { - CmdError::Failure( - error.context("failed to generate OpenAPI spec"), - ) - }) - } - } -} diff --git a/installinator-artifactd/src/context.rs b/installinator-artifactd/src/context.rs deleted file mode 100644 index beea2593aa..0000000000 --- a/installinator-artifactd/src/context.rs +++ /dev/null @@ -1,13 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -// Copyright 2023 Oxide Computer Company - -//! User provided dropshot server context - -use crate::store::ArtifactStore; - -pub struct ServerContext { - pub(crate) artifact_store: ArtifactStore, -} diff --git a/installinator-artifactd/src/http_entrypoints.rs b/installinator-artifactd/src/http_entrypoints.rs deleted file mode 100644 index 13163e007b..0000000000 --- a/installinator-artifactd/src/http_entrypoints.rs +++ /dev/null @@ -1,115 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -// Copyright 2022 Oxide Computer Company - -use dropshot::{ - endpoint, ApiDescription, ApiDescriptionRegisterError, FreeformBody, - HttpError, HttpResponseHeaders, HttpResponseOk, - HttpResponseUpdatedNoContent, Path, RequestContext, TypedBody, -}; -use hyper::{header, Body, StatusCode}; -use installinator_common::EventReport; -use omicron_common::update::ArtifactHashId; -use schemars::JsonSchema; -use serde::Deserialize; -use uuid::Uuid; - -use crate::{context::ServerContext, EventReportStatus}; - -type ArtifactServerApiDesc = ApiDescription; - -/// Return a description of the artifact server api for use in generating an OpenAPI spec -pub fn api() -> ArtifactServerApiDesc { - fn register_endpoints( - api: &mut ArtifactServerApiDesc, - ) -> Result<(), ApiDescriptionRegisterError> { - api.register(get_artifact_by_hash)?; - api.register(report_progress)?; - Ok(()) - } - - let mut api = ArtifactServerApiDesc::new(); - if let Err(err) = register_endpoints(&mut api) { - panic!("failed to register entrypoints: {}", err); - } - api -} - -/// Fetch an artifact by hash. -#[endpoint { - method = GET, - path = "/artifacts/by-hash/{kind}/{hash}", -}] -async fn get_artifact_by_hash( - rqctx: RequestContext, - path: Path, -) -> Result>, HttpError> { - match rqctx - .context() - .artifact_store - .get_artifact_by_hash(&path.into_inner()) - .await - { - Some((size, body)) => Ok(body_to_artifact_response(size, body)), - None => { - Err(HttpError::for_not_found(None, "Artifact not found".into())) - } - } -} - -#[derive(Debug, Deserialize, JsonSchema)] -pub(crate) struct ReportQuery { - /// A unique identifier for the update. - pub(crate) update_id: Uuid, -} - -/// Report progress and completion to the server. -/// -/// This method requires an `update_id` path parameter. This update ID is -/// matched against the server currently performing an update. If the server -/// is unaware of the update ID, it will return an HTTP 422 Unprocessable Entity -/// code. -#[endpoint { - method = POST, - path = "/report-progress/{update_id}", -}] -async fn report_progress( - rqctx: RequestContext, - path: Path, - report: TypedBody, -) -> Result { - let update_id = path.into_inner().update_id; - match rqctx - .context() - .artifact_store - .report_progress(update_id, report.into_inner()) - .await? - { - EventReportStatus::Processed => Ok(HttpResponseUpdatedNoContent()), - EventReportStatus::UnrecognizedUpdateId => { - Err(HttpError::for_client_error( - None, - StatusCode::UNPROCESSABLE_ENTITY, - format!("update ID {update_id} unrecognized by this server"), - )) - } - EventReportStatus::ReceiverClosed => Err(HttpError::for_client_error( - None, - StatusCode::GONE, - format!("update ID {update_id}: receiver closed"), - )), - } -} - -fn body_to_artifact_response( - size: u64, - body: Body, -) -> HttpResponseHeaders> { - let mut response = - HttpResponseHeaders::new_unnamed(HttpResponseOk(body.into())); - let headers = response.headers_mut(); - headers.append(header::CONTENT_LENGTH, size.into()); - response -} diff --git a/installinator-artifactd/src/lib.rs b/installinator-artifactd/src/lib.rs deleted file mode 100644 index c54ed78a97..0000000000 --- a/installinator-artifactd/src/lib.rs +++ /dev/null @@ -1,29 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -// Copyright 2023 Oxide Computer Company - -mod context; -mod http_entrypoints; -mod server; -mod store; - -pub use context::ServerContext; -pub use server::ArtifactServer; -pub use store::{ArtifactGetter, EventReportStatus}; - -use anyhow::Result; - -/// Run the OpenAPI generator for the API; which emits the OpenAPI spec -/// to stdout. -pub fn run_openapi() -> Result<()> { - http_entrypoints::api() - .openapi("Oxide Installinator Artifact Server", "0.0.1") - .description("API for use by the installinator to retrieve artifacts") - .contact_url("https://oxide.computer") - .contact_email("api@oxide.computer") - .write(&mut std::io::stdout())?; - - Ok(()) -} diff --git a/installinator-artifactd/src/server.rs b/installinator-artifactd/src/server.rs deleted file mode 100644 index 88b622b756..0000000000 --- a/installinator-artifactd/src/server.rs +++ /dev/null @@ -1,74 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -// Copyright 2023 Oxide Computer Company - -//! The installinator artifact server. - -use std::net::SocketAddrV6; - -use anyhow::{anyhow, Result}; -use dropshot::{HandlerTaskMode, HttpServer}; - -use crate::{ - context::ServerContext, - store::{ArtifactGetter, ArtifactStore}, -}; - -/// The installinator artifact server. -#[derive(Debug)] -pub struct ArtifactServer { - address: SocketAddrV6, - log: slog::Logger, - store: ArtifactStore, -} - -impl ArtifactServer { - /// Creates a new artifact server with the given address. - pub fn new( - getter: Getter, - address: SocketAddrV6, - log: &slog::Logger, - ) -> Self { - let log = log.new(slog::o!("component" => "installinator artifactd")); - let store = ArtifactStore::new(getter, &log); - Self { address, log, store } - } - - /// Starts the artifact server. - /// - /// This returns an `HttpServer`, which can be awaited to completion. - pub fn start(self) -> Result> { - let context = ServerContext { artifact_store: self.store }; - - let dropshot_config = dropshot::ConfigDropshot { - bind_address: std::net::SocketAddr::V6(self.address), - // Even though the installinator sets an upper bound on the number - // of items in a progress report, they can get pretty large if they - // haven't gone through for a bit. Ensure that hitting the max - // request size won't cause a failure by setting a generous upper - // bound for the request size. - // - // TODO: replace with an endpoint-specific option once - // https://github.com/oxidecomputer/dropshot/pull/618 lands and is - // available in omicron. - request_body_max_bytes: 4 * 1024 * 1024, - default_handler_task_mode: HandlerTaskMode::Detached, - }; - - let api = crate::http_entrypoints::api(); - let server = dropshot::HttpServerStarter::new( - &dropshot_config, - api, - context, - &self.log, - ) - .map_err(|error| { - anyhow!(error) - .context("failed to create installinator artifact server") - })?; - - Ok(server.start()) - } -} diff --git a/installinator-artifactd/src/store.rs b/installinator-artifactd/src/store.rs deleted file mode 100644 index 12e2880893..0000000000 --- a/installinator-artifactd/src/store.rs +++ /dev/null @@ -1,79 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -// Copyright 2023 Oxide Computer Company - -use std::fmt; - -use async_trait::async_trait; -use dropshot::HttpError; -use hyper::Body; -use installinator_common::EventReport; -use omicron_common::update::ArtifactHashId; -use slog::Logger; -use uuid::Uuid; - -/// Represents a way to fetch artifacts. -#[async_trait] -pub trait ArtifactGetter: fmt::Debug + Send + Sync + 'static { - /// Gets an artifact by hash, returning it as a [`Body`]. - async fn get_by_hash(&self, id: &ArtifactHashId) -> Option<(u64, Body)>; - - /// Reports update progress events from the installinator. - async fn report_progress( - &self, - update_id: Uuid, - report: EventReport, - ) -> Result; -} - -/// The status returned by [`ArtifactGetter::report_progress`]. -#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] -#[must_use] -pub enum EventReportStatus { - /// This report was processed by the server. - Processed, - - /// The update ID was not recognized by the server. - UnrecognizedUpdateId, - - /// The progress receiver is closed. - ReceiverClosed, -} - -/// The artifact store -- a simple wrapper around a dynamic [`ArtifactGetter`] that does some basic -/// logging. -#[derive(Debug)] -pub(crate) struct ArtifactStore { - log: Logger, - getter: Box, - // TODO: implement this -} - -impl ArtifactStore { - pub(crate) fn new( - getter: Getter, - log: &Logger, - ) -> Self { - let log = log.new(slog::o!("component" => "artifact store")); - Self { log, getter: Box::new(getter) } - } - - pub(crate) async fn get_artifact_by_hash( - &self, - id: &ArtifactHashId, - ) -> Option<(u64, Body)> { - slog::debug!(self.log, "Artifact requested by hash: {:?}", id); - self.getter.get_by_hash(id).await - } - - pub(crate) async fn report_progress( - &self, - update_id: Uuid, - report: EventReport, - ) -> Result { - slog::debug!(self.log, "Report for {update_id}: {report:?}"); - self.getter.report_progress(update_id, report).await - } -} diff --git a/installinator-artifactd/tests/integration_tests/mod.rs b/installinator-artifactd/tests/integration_tests/mod.rs deleted file mode 100644 index ebb67c3880..0000000000 --- a/installinator-artifactd/tests/integration_tests/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -mod openapi; diff --git a/installinator-artifactd/tests/integration_tests/openapi.rs b/installinator-artifactd/tests/integration_tests/openapi.rs deleted file mode 100644 index 09441731d0..0000000000 --- a/installinator-artifactd/tests/integration_tests/openapi.rs +++ /dev/null @@ -1,39 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -use std::path::PathBuf; - -use expectorate::assert_contents; -use omicron_test_utils::dev::test_cmds::{ - assert_exit_code, path_to_executable, run_command, EXIT_SUCCESS, -}; -use openapiv3::OpenAPI; -use subprocess::Exec; - -// name of executable -const CMD_SERVER: &str = env!("CARGO_BIN_EXE_installinator-artifactd"); - -fn path_to_server() -> PathBuf { - path_to_executable(CMD_SERVER) -} - -#[test] -fn test_server_openapi() { - let exec = Exec::cmd(path_to_server()).arg("openapi"); - let (exit_status, stdout_text, stderr_text) = run_command(exec); - assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text); - assert_contents("tests/output/cmd-server-openapi-stderr", &stderr_text); - - let spec: OpenAPI = serde_json::from_str(&stdout_text) - .expect("stdout was not valid OpenAPI"); - - // Check for lint errors. - let errors = openapi_lint::validate(&spec); - assert!(errors.is_empty(), "{}", errors.join("\n\n")); - - // Confirm that the output hasn't changed. It's expected that we'll change - // this file as the API evolves, but pay attention to the diffs to ensure - // that the changes match your expectations. - assert_contents("../openapi/installinator-artifactd.json", &stdout_text); -} diff --git a/installinator-artifactd/tests/mod.rs b/installinator-artifactd/tests/mod.rs deleted file mode 100644 index 66fee5d99c..0000000000 --- a/installinator-artifactd/tests/mod.rs +++ /dev/null @@ -1,17 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Integration tests for the installinator artifact server. -//! -//! Why use this weird layer of indirection, you might ask? Cargo chooses to -//! compile *each file* within the "tests/" subdirectory as a separate crate. -//! This means that doing "file-granularity" conditional compilation is -//! difficult, since a file like "test_for_illumos_only.rs" would get compiled -//! and tested regardless of the contents of "mod.rs". -//! -//! However, by lumping all tests into a submodule, all integration tests are -//! joined into a single crate, which itself can filter individual files -//! by (for example) choice of target OS. 
- -mod integration_tests; diff --git a/installinator-artifactd/tests/output/cmd-server-openapi-stderr b/installinator-artifactd/tests/output/cmd-server-openapi-stderr deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/installinator/Cargo.toml b/installinator/Cargo.toml index c21c3f2ee2..00dfb6440b 100644 --- a/installinator/Cargo.toml +++ b/installinator/Cargo.toml @@ -20,7 +20,7 @@ futures.workspace = true hex.workspace = true http.workspace = true illumos-utils.workspace = true -installinator-artifact-client.workspace = true +installinator-client.workspace = true installinator-common.workspace = true ipcc.workspace = true itertools.workspace = true diff --git a/installinator/src/artifact.rs b/installinator/src/artifact.rs index 734759a2c2..12e85e0938 100644 --- a/installinator/src/artifact.rs +++ b/installinator/src/artifact.rs @@ -7,7 +7,7 @@ use std::net::SocketAddr; use anyhow::{Context, Result}; use clap::Args; use futures::StreamExt; -use installinator_artifact_client::ClientError; +use installinator_client::ClientError; use installinator_common::EventReport; use ipcc::{InstallinatorImageId, Ipcc}; use omicron_common::update::{ArtifactHash, ArtifactHashId}; @@ -63,7 +63,7 @@ impl ArtifactIdOpts { #[derive(Debug)] pub(crate) struct ArtifactClient { log: slog::Logger, - client: installinator_artifact_client::Client, + client: installinator_client::Client, } impl ArtifactClient { @@ -81,8 +81,7 @@ impl ArtifactClient { let log = log.new( slog::o!("component" => "ArtifactClient", "peer" => addr.to_string()), ); - let client = - installinator_artifact_client::Client::new(&endpoint, log.clone()); + let client = installinator_client::Client::new(&endpoint, log.clone()); Self { log, client } } diff --git a/installinator/src/errors.rs b/installinator/src/errors.rs index 1349cf7d89..577d0d6f4d 100644 --- a/installinator/src/errors.rs +++ b/installinator/src/errors.rs @@ -4,7 +4,7 @@ use std::{net::SocketAddr, time::Duration}; -use installinator_artifact_client::ClientError; +use installinator_client::ClientError; use thiserror::Error; #[derive(Debug, Error)] diff --git a/installinator/src/mock_peers.rs b/installinator/src/mock_peers.rs index 434276649f..ccb35a2f06 100644 --- a/installinator/src/mock_peers.rs +++ b/installinator/src/mock_peers.rs @@ -16,7 +16,7 @@ use std::{ use anyhow::{bail, Result}; use async_trait::async_trait; use bytes::Bytes; -use installinator_artifact_client::{ClientError, ResponseValue}; +use installinator_client::{ClientError, ResponseValue}; use installinator_common::EventReport; use omicron_common::update::ArtifactHashId; use proptest::prelude::*; @@ -342,7 +342,7 @@ impl MockPeer { tokio::time::sleep(after).await; _ = sender .send(Err(ClientError::ErrorResponse(ResponseValue::new( - installinator_artifact_client::types::Error { + installinator_client::types::Error { error_code: None, message: format!("not-found error after {after:?}"), request_id: "mock-request-id".to_owned(), @@ -356,7 +356,7 @@ impl MockPeer { tokio::time::sleep(after).await; _ = sender .send(Err(ClientError::ErrorResponse(ResponseValue::new( - installinator_artifact_client::types::Error { + installinator_client::types::Error { error_code: None, message: format!("forbidden error after {after:?}"), request_id: "mock-request-id".to_owned(), @@ -526,7 +526,7 @@ impl PeersImpl for MockReportPeers { Ok(()) } else if peer == Self::invalid_peer() { Err(ClientError::ErrorResponse(ResponseValue::new( - installinator_artifact_client::types::Error { + installinator_client::types::Error { 
error_code: None, message: "invalid peer => HTTP 422".to_owned(), request_id: "mock-request-id".to_owned(), diff --git a/installinator/src/peers.rs b/installinator/src/peers.rs index 644507da4b..3d2e05077d 100644 --- a/installinator/src/peers.rs +++ b/installinator/src/peers.rs @@ -16,7 +16,7 @@ use buf_list::BufList; use bytes::Bytes; use display_error_chain::DisplayErrorChain; use futures::{Stream, StreamExt}; -use installinator_artifact_client::ClientError; +use installinator_client::ClientError; use installinator_common::{ EventReport, InstallinatorProgressMetadata, StepContext, StepProgress, }; diff --git a/openapi/installinator-artifactd.json b/openapi/installinator.json similarity index 99% rename from openapi/installinator-artifactd.json rename to openapi/installinator.json index 61f555e10d..0631344b25 100644 --- a/openapi/installinator-artifactd.json +++ b/openapi/installinator.json @@ -1,8 +1,8 @@ { "openapi": "3.0.3", "info": { - "title": "Oxide Installinator Artifact Server", - "description": "API for use by the installinator to retrieve artifacts", + "title": "Installinator API", + "description": "API for installinator to fetch artifacts and report progress", "contact": { "url": "https://oxide.computer", "email": "api@oxide.computer" diff --git a/wicketd/Cargo.toml b/wicketd/Cargo.toml index bfd8a4cf45..792201c6ff 100644 --- a/wicketd/Cargo.toml +++ b/wicketd/Cargo.toml @@ -52,7 +52,7 @@ uuid.workspace = true bootstrap-agent-client.workspace = true omicron-ddm-admin-client.workspace = true gateway-client.workspace = true -installinator-artifactd.workspace = true +installinator-api.workspace = true installinator-common.workspace = true omicron-certificates.workspace = true omicron-common.workspace = true @@ -76,7 +76,7 @@ fs-err.workspace = true gateway-test-utils.workspace = true http.workspace = true installinator.workspace = true -installinator-artifact-client.workspace = true +installinator-client.workspace = true maplit.workspace = true omicron-test-utils.workspace = true openapi-lint.workspace = true diff --git a/wicketd/src/artifacts.rs b/wicketd/src/artifacts.rs index 3e5854d17e..59981b2ac3 100644 --- a/wicketd/src/artifacts.rs +++ b/wicketd/src/artifacts.rs @@ -5,5 +5,6 @@ mod server; mod store; -pub(crate) use self::server::WicketdArtifactServer; +pub(crate) use self::server::WicketdInstallinatorApiImpl; +pub(crate) use self::server::WicketdInstallinatorContext; pub(crate) use self::store::WicketdArtifactStore; diff --git a/wicketd/src/artifacts/server.rs b/wicketd/src/artifacts/server.rs index 3808f01753..6d677c7b4f 100644 --- a/wicketd/src/artifacts/server.rs +++ b/wicketd/src/artifacts/server.rs @@ -2,62 +2,99 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-use super::store::WicketdArtifactStore; use crate::installinator_progress::IprArtifactServer; -use async_trait::async_trait; +use dropshot::FreeformBody; use dropshot::HttpError; +use dropshot::HttpResponseHeaders; +use dropshot::HttpResponseOk; +use dropshot::HttpResponseUpdatedNoContent; +use dropshot::Path; +use dropshot::RequestContext; +use dropshot::TypedBody; use hyper::Body; -use installinator_artifactd::ArtifactGetter; -use installinator_artifactd::EventReportStatus; +use installinator_api::body_to_artifact_response; +use installinator_api::InstallinatorApi; +use installinator_api::ReportQuery; +use installinator_common::EventReport; use omicron_common::update::ArtifactHashId; use slog::error; use slog::Logger; -use uuid::Uuid; + +use super::WicketdArtifactStore; + +pub(crate) enum WicketdInstallinatorApiImpl {} /// The artifact server interface for wicketd. #[derive(Debug)] -pub(crate) struct WicketdArtifactServer { - #[allow(dead_code)] +pub struct WicketdInstallinatorContext { log: Logger, store: WicketdArtifactStore, ipr_artifact: IprArtifactServer, } -impl WicketdArtifactServer { +impl WicketdInstallinatorContext { pub(crate) fn new( log: &Logger, store: WicketdArtifactStore, ipr_artifact: IprArtifactServer, ) -> Self { - let log = log.new(slog::o!("component" => "wicketd artifact server")); - Self { log, store, ipr_artifact } + Self { + log: log + .new(slog::o!("component" => "wicketd installinator server")), + store, + ipr_artifact, + } } } -#[async_trait] -impl ArtifactGetter for WicketdArtifactServer { - async fn get_by_hash(&self, id: &ArtifactHashId) -> Option<(u64, Body)> { - let data_handle = self.store.get_by_hash(id)?; - let size = data_handle.file_size() as u64; - let data_stream = match data_handle.reader_stream().await { - Ok(stream) => stream, - Err(err) => { - error!( - self.log, "failed to open extracted archive on demand"; - "error" => #%err, - ); - return None; - } - }; +impl InstallinatorApi for WicketdInstallinatorApiImpl { + type Context = WicketdInstallinatorContext; + + async fn get_artifact_by_hash( + rqctx: RequestContext, + path: Path, + ) -> Result>, HttpError> + { + let context = rqctx.context(); + match context.store.get_by_hash(&path.into_inner()) { + Some(data_handle) => { + let size = data_handle.file_size() as u64; + let data_stream = match data_handle.reader_stream().await { + Ok(stream) => stream, + Err(err) => { + error!( + context.log, "failed to open extracted archive on demand"; + "error" => #%err, + ); + return Err(HttpError::for_internal_error(format!( + // TODO: print error chain + "Artifact not found: {err}" + ))); + } + }; - Some((size, Body::wrap_stream(data_stream))) + Ok(body_to_artifact_response( + size, + Body::wrap_stream(data_stream), + )) + } + None => { + Err(HttpError::for_not_found(None, "Artifact not found".into())) + } + } } async fn report_progress( - &self, - update_id: Uuid, - report: installinator_common::EventReport, - ) -> Result { - Ok(self.ipr_artifact.report_progress(update_id, report)) + rqctx: RequestContext, + path: Path, + report: TypedBody, + ) -> Result { + let context = rqctx.context(); + let update_id = path.into_inner().update_id; + + context + .ipr_artifact + .report_progress(update_id, report.into_inner()) + .to_http_result(update_id) } } diff --git a/wicketd/src/bin/wicketd.rs b/wicketd/src/bin/wicketd.rs index 4037bc4c23..6ef616d708 100644 --- a/wicketd/src/bin/wicketd.rs +++ b/wicketd/src/bin/wicketd.rs @@ -144,9 +144,8 @@ async fn do_run() -> Result<(), CmdError> { .to_logger("wicketd") 
.context("failed to initialize logger") .map_err(CmdError::Failure)?; - let server = Server::start(log, args) - .await - .map_err(|err| CmdError::Failure(anyhow!(err)))?; + let server = + Server::start(log, args).await.map_err(CmdError::Failure)?; server .wait_for_finish() .await diff --git a/wicketd/src/installinator_progress.rs b/wicketd/src/installinator_progress.rs index 77baec2c94..7d076e7b0e 100644 --- a/wicketd/src/installinator_progress.rs +++ b/wicketd/src/installinator_progress.rs @@ -12,7 +12,7 @@ use std::{ sync::{Arc, Mutex}, }; -use installinator_artifactd::EventReportStatus; +use installinator_api::EventReportStatus; use tokio::sync::{oneshot, watch}; use update_engine::events::StepEventIsTerminal; use uuid::Uuid; diff --git a/wicketd/src/lib.rs b/wicketd/src/lib.rs index 5926fc468d..9fb204b675 100644 --- a/wicketd/src/lib.rs +++ b/wicketd/src/lib.rs @@ -16,8 +16,11 @@ mod preflight_check; mod rss_config; mod update_tracker; -use anyhow::{anyhow, Context, Result}; -use artifacts::{WicketdArtifactServer, WicketdArtifactStore}; +use anyhow::{anyhow, bail, Context, Result}; +use artifacts::{ + WicketdArtifactStore, WicketdInstallinatorApiImpl, + WicketdInstallinatorContext, +}; use bootstrap_addrs::BootstrapPeers; pub use config::Config; pub(crate) use context::ServerContext; @@ -118,7 +121,7 @@ impl SmfConfigValues { pub struct Server { pub wicketd_server: HttpServer, - pub artifact_server: HttpServer, + pub installinator_server: HttpServer, pub artifact_store: WicketdArtifactStore, pub update_tracker: Arc, pub ipr_update_tracker: IprUpdateTracker, @@ -127,14 +130,14 @@ pub struct Server { impl Server { /// Run an instance of the wicketd server - pub async fn start(log: slog::Logger, args: Args) -> Result { + pub async fn start(log: slog::Logger, args: Args) -> anyhow::Result { let (drain, registration) = slog_dtrace::with_drain(log); let log = slog::Logger::root(drain.fuse(), slog::o!(FileKv)); if let slog_dtrace::ProbeRegistration::Failed(e) = registration { let msg = format!("failed to register DTrace probes: {}", e); error!(log, "{}", msg); - return Err(msg); + bail!(msg); } else { debug!(log, "registered DTrace probes"); }; @@ -174,7 +177,8 @@ impl Server { addr, ) .map_err(|err| { - format!("Could not create internal DNS resolver: {err}") + anyhow!(err) + .context("Could not create internal DNS resolver") }) }) .transpose()?; @@ -186,7 +190,9 @@ impl Server { &log, ) .await - .map_err(|err| format!("failed to start Nexus TCP proxy: {err}"))?; + .map_err(|err| { + anyhow!(err).context("failed to start Nexus TCP proxy") + })?; let wicketd_server = { let ds_log = log.new(o!("component" => "dropshot (wicketd)")); @@ -209,25 +215,39 @@ impl Server { }, &ds_log, ) - .map_err(|err| format!("initializing http server: {}", err))? + .map_err(|err| anyhow!(err).context("initializing http server"))? 
.start() }; - let server = - WicketdArtifactServer::new(&log, store.clone(), ipr_artifact); - let artifact_server = installinator_artifactd::ArtifactServer::new( - server, - args.artifact_address, - &log, - ) - .start() - .map_err(|error| { - format!("failed to start artifact server: {error:?}") - })?; + let installinator_server = { + let installinator_config = installinator_api::default_config( + SocketAddr::V6(args.artifact_address), + ); + let api_description = + installinator_api::installinator_api::api_description::< + WicketdInstallinatorApiImpl, + >()?; + + dropshot::HttpServerStarter::new( + &installinator_config, + api_description, + WicketdInstallinatorContext::new( + &log, + store.clone(), + ipr_artifact, + ), + &log, + ) + .map_err(|err| { + anyhow!(err) + .context("failed to create installinator artifact server") + })? + .start() + }; Ok(Self { wicketd_server, - artifact_server, + installinator_server, artifact_store: store, update_tracker, ipr_update_tracker, @@ -240,7 +260,7 @@ impl Server { self.wicketd_server.close().await.map_err(|error| { anyhow!("error closing wicketd server: {error}") })?; - self.artifact_server.close().await.map_err(|error| { + self.installinator_server.close().await.map_err(|error| { anyhow!("error closing artifact server: {error}") })?; self.nexus_tcp_proxy.shutdown(); @@ -257,7 +277,7 @@ impl Server { Err(err) => Err(format!("running wicketd server: {err}")), } } - res = self.artifact_server => { + res = self.installinator_server => { match res { Ok(()) => Err("artifact server exited unexpectedly".to_owned()), // The artifact server returns an anyhow::Error, which has a diff --git a/wicketd/tests/integration_tests/setup.rs b/wicketd/tests/integration_tests/setup.rs index 62682a73ab..01f01e21e1 100644 --- a/wicketd/tests/integration_tests/setup.rs +++ b/wicketd/tests/integration_tests/setup.rs @@ -16,7 +16,7 @@ pub struct WicketdTestContext { // this way. pub wicketd_raw_client: ClientTestContext, pub artifact_addr: SocketAddrV6, - pub artifact_client: installinator_artifact_client::Client, + pub artifact_client: installinator_client::Client, pub server: wicketd::Server, pub gateway: GatewayTestContext, } @@ -62,14 +62,15 @@ impl WicketdTestContext { ) }; - let artifact_addr = assert_ipv6(server.artifact_server.local_addr()); + let artifact_addr = + assert_ipv6(server.installinator_server.local_addr()); let artifact_client = { let endpoint = format!( "http://[{}]:{}", artifact_addr.ip(), artifact_addr.port() ); - installinator_artifact_client::Client::new( + installinator_client::Client::new( &endpoint, log.new(slog::o!("component" => "artifact test client")), ) From 8c6afd157a05e1c42f3ce7a0dfef51264f10a022 Mon Sep 17 00:00:00 2001 From: Adam Leventhal Date: Mon, 15 Jul 2024 13:28:07 -0700 Subject: [PATCH 18/27] bump to v10; API to estimated release date (#6085) Closes #6057 --- dev-tools/releng/src/main.rs | 2 +- nexus/src/lib.rs | 2 +- nexus/tests/integration_tests/commands.rs | 2 +- openapi/nexus.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 1bd3b69ac9..ee649e79b2 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -41,7 +41,7 @@ use crate::job::Jobs; /// to as "v8", "version 8", or "release 8" to customers). The use of semantic /// versioning is mostly to hedge for perhaps wanting something more granular in /// the future. 
-const BASE_VERSION: Version = Version::new(9, 0, 0); +const BASE_VERSION: Version = Version::new(10, 0, 0); const RETRY_ATTEMPTS: usize = 3; diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index a359ead038..5d5e7d6eba 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -57,7 +57,7 @@ extern crate slog; /// to stdout. pub fn run_openapi_external() -> Result<(), String> { external_api() - .openapi("Oxide Region API", "20240710.0") + .openapi("Oxide Region API", "20240821.0") .description("API for interacting with the Oxide control plane") .contact_url("https://oxide.computer") .contact_email("api@oxide.computer") diff --git a/nexus/tests/integration_tests/commands.rs b/nexus/tests/integration_tests/commands.rs index 3e133e8681..c2277ba776 100644 --- a/nexus/tests/integration_tests/commands.rs +++ b/nexus/tests/integration_tests/commands.rs @@ -109,7 +109,7 @@ fn test_nexus_openapi() { .expect("stdout was not valid OpenAPI"); assert_eq!(spec.openapi, "3.0.3"); assert_eq!(spec.info.title, "Oxide Region API"); - assert_eq!(spec.info.version, "20240710.0"); + assert_eq!(spec.info.version, "20240821.0"); // Spot check a couple of items. assert!(!spec.paths.paths.is_empty()); diff --git a/openapi/nexus.json b/openapi/nexus.json index c9d85a8ee3..8763441402 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "20240710.0" + "version": "20240821.0" }, "paths": { "/device/auth": { From 031b5ecb6bebae76100d1f9240d258c0dcbbc91e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Jul 2024 13:34:50 -0700 Subject: [PATCH 19/27] [sled-agent] Stop self-managing physical disks (#5987) Fixes https://github.com/oxidecomputer/omicron/issues/5328 This was an old kludge for backwards compatibility, we should be able to rely on ledgers explicitly for this purpose. --- sled-storage/src/manager.rs | 234 +----------------------------------- 1 file changed, 1 insertion(+), 233 deletions(-) diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index 9e31568e00..d374ab8e23 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -8,9 +8,7 @@ use std::collections::HashSet; use crate::config::MountConfig; use crate::dataset::{DatasetName, CONFIG_DATASET}; -use crate::disk::{ - OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, RawDisk, -}; +use crate::disk::{OmicronPhysicalDisksConfig, RawDisk}; use crate::error::Error; use crate::resources::{AllDisks, DisksManagementResult, StorageResources}; use camino::Utf8PathBuf; @@ -589,91 +587,11 @@ impl StorageManager { self.resources.set_config(&ledger.data().disks); } else { info!(self.log, "KeyManager ready, but no ledger detected"); - let mut synthetic_config = - self.resources.get_config().values().cloned().collect(); - // TODO(https://github.com/oxidecomputer/omicron/issues/5328): Once - // we are confident that we have migrated to a world where this - // ledger is universally used, we should remove the following - // kludge. The sled agent should not need to "self-manage" anything! 
- let changed = self - .self_manage_disks_with_zpools(&mut synthetic_config) - .await?; - if !changed { - info!(self.log, "No disks to be automatically managed"); - return Ok(()); - } - info!(self.log, "auto-managed disks"; "count" => synthetic_config.len()); - self.resources.set_config(&synthetic_config); } Ok(()) } - // NOTE: What follows is an exceptional case: one where we have - // no record of "Control Plane Physical Disks", but we have zpools - // on our U.2s, and we want to use them regardless. - // - // THIS WOULD NORMALLY BE INCORRECT BEHAVIOR. In the future, these - // zpools will not be "automatically imported", and instead, we'll - // let Nexus decide whether or not to reformat the disks. - // - // However, because we are transitioning from "the set of disks / - // zpools is implicit" to a world where that set is explicit, this - // is a necessary transitional tool. - // - // Returns "true" if the synthetic_config has changed. - async fn self_manage_disks_with_zpools( - &mut self, - synthetic_config: &mut Vec, - ) -> Result { - let mut changed = false; - for (identity, disk) in self.resources.disks().values.iter() { - match disk { - crate::resources::ManagedDisk::Unmanaged(raw) => { - let zpool_path = match raw.u2_zpool_path() { - Ok(zpool_path) => zpool_path, - Err(err) => { - info!(self.log, "Cannot find zpool path"; "identity" => ?identity, "err" => ?err); - continue; - } - }; - - let zpool_name = - match sled_hardware::disk::check_if_zpool_exists( - &zpool_path, - ) { - Ok(zpool_name) => zpool_name, - Err(err) => { - info!(self.log, "Zpool does not exist"; "identity" => ?identity, "err" => ?err); - continue; - } - }; - - info!(self.log, "Found existing zpool on device without ledger"; - "identity" => ?identity, - "zpool" => ?zpool_name); - - // We found an unmanaged disk with a zpool, even though - // we have no prior record of a ledger of control-plane - // disks. - synthetic_config.push( - // These disks don't have a control-plane UUID -- - // report "nil" until they're overwritten with real - // values. - OmicronPhysicalDiskConfig { - identity: identity.clone(), - id: Uuid::nil(), - pool_id: zpool_name.id(), - }, - ); - changed = true; - } - _ => continue, - } - } - Ok(changed) - } - // Makes an U.2 disk managed by the control plane within [`StorageResources`]. async fn omicron_physical_disks_ensure( &mut self, @@ -911,10 +829,8 @@ mod tests { use super::*; use camino_tempfile::tempdir_in; - use omicron_common::api::external::Generation; use omicron_common::ledger; use omicron_test_utils::dev::test_setup_log; - use omicron_uuid_kinds::ZpoolUuid; use sled_hardware::DiskFirmware; use std::sync::atomic::Ordering; use uuid::Uuid; @@ -1390,154 +1306,6 @@ mod tests { harness.cleanup().await; logctx.cleanup_successful(); } - - #[tokio::test] - async fn ledgerless_to_ledgered_migration() { - illumos_utils::USE_MOCKS.store(false, Ordering::SeqCst); - let logctx = test_setup_log("ledgerless_to_ledgered_migration"); - let mut harness = StorageManagerTestHarness::new(&logctx.log).await; - - // Test setup: Create two U.2s and an M.2 - let raw_disks = harness - .add_vdevs(&[ - "u2_under_test.vdev", - "u2_that_shows_up_late.vdev", - "m2_helping.vdev", - ]) - .await; - - // First, we format the U.2s to have a zpool. This should work, even - // without looping in the StorageManager. 
- let first_u2 = &raw_disks[0]; - let first_pool_id = ZpoolUuid::new_v4(); - let _disk = crate::disk::Disk::new( - &logctx.log, - &harness.mount_config(), - first_u2.clone(), - Some(first_pool_id), - Some(harness.key_requester()), - ) - .await - .expect("Failed to format U.2"); - - let second_u2 = &raw_disks[1]; - let second_pool_id = ZpoolUuid::new_v4(); - let _disk = crate::disk::Disk::new( - &logctx.log, - &harness.mount_config(), - second_u2.clone(), - Some(second_pool_id), - Some(harness.key_requester()), - ) - .await - .expect("Failed to format U.2"); - - // Because we did that formatting "behind the back" of the - // StorageManager, we should see no evidence of the U.2 being managed. - // - // This currently matches the format of "existing systems, which were - // initialized before the storage ledger was created". - - // We should still see no ledger. - let result = harness.handle().omicron_physical_disks_list().await; - assert!(matches!(result, Err(Error::LedgerNotFound)), "{:?}", result); - - // We should also not see any managed U.2s. - let disks = harness.handle().get_latest_disks().await; - assert!(disks.all_u2_zpools().is_empty()); - - // Leave one of the U.2s attached, but "remove" the other one. - harness.remove_vdev(second_u2).await; - - // When the system activates, we should see a single Zpool, and - // "auto-manage" it. - harness.handle().key_manager_ready().await; - - // It might take a moment for synchronization to be handled by the - // background task, but we'll eventually see the U.2 zpool. - // - // This is the equivalent of us "loading a zpool, even though - // it was not backed by a ledger". - let tt = TimeTravel::new(); - tt.enough_to_start_synchronization().await; - while harness - .handle_mut() - .wait_for_changes() - .await - .all_u2_zpools() - .is_empty() - { - info!(&logctx.log, "Waiting for U.2 to automatically show up"); - } - let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); - assert_eq!(u2s.len(), 1, "{:?}", u2s); - - // If we attach the second U.2 -- the equivalent of it appearing after - // the key manager is ready -- it'll also be included in the set of - // auto-maanged U.2s. - harness.add_vdev_as(second_u2.clone()).await; - tt.enough_to_start_synchronization().await; - while harness - .handle_mut() - .wait_for_changes() - .await - .all_u2_zpools() - .len() - == 1 - { - info!(&logctx.log, "Waiting for U.2 to automatically show up"); - } - let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); - assert_eq!(u2s.len(), 2, "{:?}", u2s); - - // This is the equivalent of the "/omicron-physical-disks GET" API, - // which Nexus might use to contact this sled. - // - // This means that we'll bootstrap the sled successfully, but report a - // 404 if nexus asks us for the latest configuration. - let result = harness.handle().omicron_physical_disks_list().await; - assert!(matches!(result, Err(Error::LedgerNotFound),), "{:?}", result); - - // At this point, Nexus may want to explicitly tell sled agent which - // disks it should use. This is the equivalent of invoking - // "/omicron-physical-disks PUT". - let mut disks = vec![ - OmicronPhysicalDiskConfig { - identity: first_u2.identity().clone(), - id: Uuid::new_v4(), - pool_id: first_pool_id, - }, - OmicronPhysicalDiskConfig { - identity: second_u2.identity().clone(), - id: Uuid::new_v4(), - pool_id: second_pool_id, - }, - ]; - // Sort the disks to ensure the "output" matches the "input" when we - // query later. 
- disks.sort_by(|a, b| a.identity.partial_cmp(&b.identity).unwrap()); - let config = - OmicronPhysicalDisksConfig { generation: Generation::new(), disks }; - let result = harness - .handle() - .omicron_physical_disks_ensure(config.clone()) - .await - .expect("Failed to ensure disks with 'new' Config"); - assert!(!result.has_error(), "{:?}", result); - - let observed_config = harness - .handle() - .omicron_physical_disks_list() - .await - .expect("Failed to retreive config after ensuring it"); - assert_eq!(observed_config, config); - - let u2s = harness.handle().get_latest_disks().await.all_u2_zpools(); - assert_eq!(u2s.len(), 2, "{:?}", u2s); - - harness.cleanup().await; - logctx.cleanup_successful(); - } } #[cfg(test)] From e6a3c3f52d79e8a74bd763ba0c045f093f8d1bd2 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 14:57:18 -0700 Subject: [PATCH 20/27] Update Rust crate strum to v0.26.3 (#6062) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 459939ce50..686b0a0b71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9454,9 +9454,9 @@ dependencies = [ [[package]] name = "strum" -version = "0.26.2" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ "strum_macros 0.26.4", ] From ad6c92ede4835f6ab875927494f95e726d753ddd Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Jul 2024 15:29:12 -0700 Subject: [PATCH 21/27] [Sled Agent] Expunged disks are not in use after omicron_physical_disks_ensure (#5965) `omicron_physical_disks_ensure` is an API exposed by Sled Agent, which allows Nexus to control the set of "active" control plane disks. Although this API was exposed, it previously did not stop the Sled Agent from using expunged disks under all circumstances. This PR now adjusts the endpoint to "flush out" all old usage of disks before returning. 
This PR: - Ensures dump device management lets go of expunged U.2s - Ensures Zone bundles let go of expunged U.2s - Removes any network probes allocated with a transient filesystem on an expunged U.2 - Removes any VMMs allocates with a transient filesystem on an expunged U.2 Fixes https://github.com/oxidecomputer/omicron/issues/5929 --- illumos-utils/src/running_zone.rs | 171 +++------------------------ illumos-utils/src/zone.rs | 6 +- illumos-utils/src/zpool.rs | 15 ++- sled-agent/src/common/instance.rs | 13 +- sled-agent/src/dump_setup.rs | 31 +++++ sled-agent/src/instance.rs | 68 ++++++++--- sled-agent/src/instance_manager.rs | 84 ++++++++++++- sled-agent/src/long_running_tasks.rs | 18 ++- sled-agent/src/probe_manager.rs | 94 ++++++++++++--- sled-agent/src/services.rs | 33 ++++-- sled-agent/src/sim/instance.rs | 6 +- sled-agent/src/sled_agent.rs | 66 ++++++++++- sled-agent/src/storage_monitor.rs | 60 +++++++++- sled-agent/src/zone_bundle.rs | 16 +++ sled-storage/src/manager.rs | 43 +++---- sled-storage/src/resources.rs | 91 ++++++++++---- 16 files changed, 556 insertions(+), 259 deletions(-) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index c529a1b6d4..a66fa44e9c 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -10,6 +10,7 @@ use crate::link::{Link, VnicAllocator}; use crate::opte::{Port, PortTicket}; use crate::svc::wait_for_service; use crate::zone::{AddressRequest, IPADM, ZONE_PREFIX}; +use crate::zpool::{PathInPool, ZpoolName}; use camino::{Utf8Path, Utf8PathBuf}; use camino_tempfile::Utf8TempDir; use ipnetwork::IpNetwork; @@ -101,60 +102,6 @@ pub enum EnsureAddressError { OpteGatewayConfig(#[from] RunCommandError), } -/// Errors returned from [`RunningZone::get`]. -#[derive(thiserror::Error, Debug)] -pub enum GetZoneError { - #[error("While looking up zones with prefix '{prefix}', could not get zones: {err}")] - GetZones { - prefix: String, - #[source] - err: crate::zone::AdmError, - }, - - #[error("Invalid Utf8 path: {0}")] - FromPathBuf(#[from] camino::FromPathBufError), - - #[error("Zone with prefix '{prefix}' not found")] - NotFound { prefix: String }, - - #[error("Cannot get zone '{name}': it is in the {state:?} state instead of running")] - NotRunning { name: String, state: zone::State }, - - #[error( - "Cannot get zone '{name}': Failed to acquire control interface {err}" - )] - ControlInterface { - name: String, - #[source] - err: crate::zone::GetControlInterfaceError, - }, - - #[error("Cannot get zone '{name}': Failed to create addrobj: {err}")] - AddrObject { - name: String, - #[source] - err: crate::addrobj::ParseError, - }, - - #[error( - "Cannot get zone '{name}': Failed to ensure address exists: {err}" - )] - EnsureAddress { - name: String, - #[source] - err: crate::zone::EnsureAddressError, - }, - - #[error( - "Cannot get zone '{name}': Incorrect bootstrap interface access {err}" - )] - BootstrapInterface { - name: String, - #[source] - err: crate::zone::GetBootstrapInterfaceError, - }, -} - #[cfg(target_os = "illumos")] static REAPER_THREAD: OnceLock> = OnceLock::new(); @@ -407,6 +354,11 @@ impl RunningZone { self.inner.root() } + /// Returns the zpool on which the filesystem path has been placed. 
+ pub fn root_zpool(&self) -> Option<&ZpoolName> { + self.inner.zonepath.pool.as_ref() + } + pub fn control_interface(&self) -> AddrObject { AddrObject::new(self.inner.get_control_vnic_name(), "omicron6").unwrap() } @@ -797,95 +749,6 @@ impl RunningZone { Ok(()) } - /// Looks up a running zone based on the `zone_prefix`, if one already exists. - /// - /// - If the zone was found, is running, and has a network interface, it is - /// returned. - /// - If the zone was not found `Error::NotFound` is returned. - /// - If the zone was found, but not running, `Error::NotRunning` is - /// returned. - /// - Other errors may be returned attempting to look up and accessing an - /// address on the zone. - pub async fn get( - log: &Logger, - vnic_allocator: &VnicAllocator, - zone_prefix: &str, - addrtype: AddressRequest, - ) -> Result { - let zone_info = Zones::get() - .await - .map_err(|err| GetZoneError::GetZones { - prefix: zone_prefix.to_string(), - err, - })? - .into_iter() - .find(|zone_info| zone_info.name().starts_with(&zone_prefix)) - .ok_or_else(|| GetZoneError::NotFound { - prefix: zone_prefix.to_string(), - })?; - - if zone_info.state() != zone::State::Running { - return Err(GetZoneError::NotRunning { - name: zone_info.name().to_string(), - state: zone_info.state(), - }); - } - - let zone_name = zone_info.name(); - let vnic_name = - Zones::get_control_interface(zone_name).map_err(|err| { - GetZoneError::ControlInterface { - name: zone_name.to_string(), - err, - } - })?; - let addrobj = AddrObject::new_control(&vnic_name).map_err(|err| { - GetZoneError::AddrObject { name: zone_name.to_string(), err } - })?; - Zones::ensure_address(Some(zone_name), &addrobj, addrtype).map_err( - |err| GetZoneError::EnsureAddress { - name: zone_name.to_string(), - err, - }, - )?; - - let control_vnic = vnic_allocator - .wrap_existing(vnic_name) - .expect("Failed to wrap valid control VNIC"); - - // The bootstrap address for a running zone never changes, - // so there's no need to call `Zones::ensure_address`. - // Currently, only the switch zone has a bootstrap interface. - let bootstrap_vnic = Zones::get_bootstrap_interface(zone_name) - .map_err(|err| GetZoneError::BootstrapInterface { - name: zone_name.to_string(), - err, - })? - .map(|name| { - vnic_allocator - .wrap_existing(name) - .expect("Failed to wrap valid bootstrap VNIC") - }); - - Ok(Self { - id: zone_info.id().map(|x| { - x.try_into().expect("zoneid_t is expected to be an i32") - }), - inner: InstalledZone { - log: log.new(o!("zone" => zone_name.to_string())), - zonepath: zone_info.path().to_path_buf().try_into()?, - name: zone_name.to_string(), - control_vnic, - // TODO(https://github.com/oxidecomputer/omicron/issues/725) - // - // Re-initialize guest_vnic state by inspecting the zone. - opte_ports: vec![], - links: vec![], - bootstrap_vnic, - }, - }) - } - /// Return references to the OPTE ports for this zone. pub fn opte_ports(&self) -> impl Iterator { self.inner.opte_ports() @@ -1081,7 +944,7 @@ pub struct InstalledZone { log: Logger, // Filesystem path of the zone - zonepath: Utf8PathBuf, + zonepath: PathInPool, // Name of the Zone. name: String, @@ -1131,7 +994,7 @@ impl InstalledZone { /// Returns the filesystem path to the zonepath pub fn zonepath(&self) -> &Utf8Path { - &self.zonepath + &self.zonepath.path } pub fn site_profile_xml_path(&self) -> Utf8PathBuf { @@ -1147,7 +1010,7 @@ impl InstalledZone { /// Returns the filesystem path to the zone's root in the GZ. 
pub fn root(&self) -> Utf8PathBuf { - self.zonepath.join(Self::ROOT_FS_PATH) + self.zonepath.path.join(Self::ROOT_FS_PATH) } } @@ -1198,7 +1061,7 @@ pub struct ZoneBuilder<'a> { /// Allocates the NIC used for control plane communication. underlay_vnic_allocator: Option<&'a VnicAllocator>, /// Filesystem path at which the installed zone will reside. - zone_root_path: Option<&'a Utf8Path>, + zone_root_path: Option, /// The directories that will be searched for the image tarball for the /// provided zone type ([`Self::with_zone_type`]). zone_image_paths: Option<&'a [Utf8PathBuf]>, @@ -1251,7 +1114,7 @@ impl<'a> ZoneBuilder<'a> { } /// Filesystem path at which the installed zone will reside. - pub fn with_zone_root_path(mut self, root_path: &'a Utf8Path) -> Self { + pub fn with_zone_root_path(mut self, root_path: PathInPool) -> Self { self.zone_root_path = Some(root_path); self } @@ -1345,8 +1208,11 @@ impl<'a> ZoneBuilder<'a> { self.zone_type?, self.unique_name, ); - let zonepath = temp_dir - .join(self.zone_root_path?.strip_prefix("/").unwrap()) + let mut zonepath = self.zone_root_path?; + zonepath.path = temp_dir + .join( + zonepath.path.strip_prefix("/").unwrap() + ) .join(&full_zone_name); let iz = InstalledZone { log: self.log?, @@ -1376,7 +1242,7 @@ impl<'a> ZoneBuilder<'a> { let Self { log: Some(log), underlay_vnic_allocator: Some(underlay_vnic_allocator), - zone_root_path: Some(zone_root_path), + zone_root_path: Some(mut zone_root_path), zone_image_paths: Some(zone_image_paths), zone_type: Some(zone_type), unique_name, @@ -1440,6 +1306,7 @@ impl<'a> ZoneBuilder<'a> { net_device_names.sort(); net_device_names.dedup(); + zone_root_path.path = zone_root_path.path.join(&full_zone_name); Zones::install_omicron_zone( &log, &zone_root_path, @@ -1460,7 +1327,7 @@ impl<'a> ZoneBuilder<'a> { Ok(InstalledZone { log: log.new(o!("zone" => full_zone_name.clone())), - zonepath: zone_root_path.join(&full_zone_name), + zonepath: zone_root_path, name: full_zone_name, control_vnic, bootstrap_vnic, diff --git a/illumos-utils/src/zone.rs b/illumos-utils/src/zone.rs index 3f749fc352..7ba40af043 100644 --- a/illumos-utils/src/zone.rs +++ b/illumos-utils/src/zone.rs @@ -14,6 +14,7 @@ use std::net::{IpAddr, Ipv6Addr}; use crate::addrobj::AddrObject; use crate::dladm::{EtherstubVnic, VNIC_PREFIX_BOOTSTRAP, VNIC_PREFIX_CONTROL}; +use crate::zpool::PathInPool; use crate::{execute, PFEXEC}; use omicron_common::address::SLED_PREFIX; @@ -282,7 +283,7 @@ impl Zones { #[allow(clippy::too_many_arguments)] pub async fn install_omicron_zone( log: &Logger, - zone_root_path: &Utf8Path, + zone_root_path: &PathInPool, zone_name: &str, zone_image: &Utf8Path, datasets: &[zone::Dataset], @@ -319,10 +320,9 @@ impl Zones { true, zone::CreationOptions::Blank, ); - let path = zone_root_path.join(zone_name); cfg.get_global() .set_brand("omicron1") - .set_path(&path) + .set_path(&zone_root_path.path) .set_autoboot(false) .set_ip_type(zone::IpType::Exclusive); if !limit_priv.is_empty() { diff --git a/illumos-utils/src/zpool.rs b/illumos-utils/src/zpool.rs index fa93760f99..5dabbdecc7 100644 --- a/illumos-utils/src/zpool.rs +++ b/illumos-utils/src/zpool.rs @@ -5,7 +5,7 @@ //! Utilities for managing Zpools. use crate::{execute, ExecutionError, PFEXEC}; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; use std::str::FromStr; pub use omicron_common::zpool_name::ZpoolName; @@ -181,6 +181,19 @@ impl FromStr for ZpoolInfo { /// Wraps commands for interacting with ZFS pools. 
pub struct Zpool {} +/// A path which exists within a pool. +/// +/// By storing these types together, it's possible to answer +/// whether or not a path exists on a particular device. +// Technically we could re-derive the pool name from the path, +// but that involves some string parsing, and honestly I'd just +// Rather Not. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct PathInPool { + pub pool: Option, + pub path: Utf8PathBuf, +} + #[cfg_attr(any(test, feature = "testing"), mockall::automock, allow(dead_code))] impl Zpool { pub fn create( diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index ed0aceff82..0fe2e27698 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -486,9 +486,15 @@ impl InstanceStates { /// instance's state in Nexus may become inconsistent. This routine should /// therefore only be invoked by callers who know that an instance is not /// migrating. - pub(crate) fn terminate_rudely(&mut self) { + pub(crate) fn terminate_rudely(&mut self, mark_failed: bool) { + let vmm_state = if mark_failed { + PropolisInstanceState(PropolisApiState::Failed) + } else { + PropolisInstanceState(PropolisApiState::Destroyed) + }; + let fake_observed = ObservedPropolisState { - vmm_state: PropolisInstanceState(PropolisApiState::Destroyed), + vmm_state, migration_status: if self.instance.migration_id.is_some() { ObservedMigrationStatus::Failed } else { @@ -893,7 +899,8 @@ mod test { assert_eq!(state.propolis_role(), PropolisRole::MigrationTarget); let prev = state.clone(); - state.terminate_rudely(); + let mark_failed = false; + state.terminate_rudely(mark_failed); assert_state_change_has_gen_change(&prev, &state); assert_eq!(state.instance.gen, prev.instance.gen); diff --git a/sled-agent/src/dump_setup.rs b/sled-agent/src/dump_setup.rs index 02d3d41dd7..02d40195cf 100644 --- a/sled-agent/src/dump_setup.rs +++ b/sled-agent/src/dump_setup.rs @@ -100,6 +100,7 @@ use std::ffi::OsString; use std::path::{Path, PathBuf}; use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; use tokio::sync::mpsc::Receiver; +use tokio::sync::oneshot; use zone::{Zone, ZoneError}; const ZFS_PROP_USED: &str = "used"; @@ -175,6 +176,7 @@ enum DumpSetupCmd { dump_slices: Vec, debug_datasets: Vec, core_datasets: Vec, + update_complete_tx: oneshot::Sender<()>, }, } @@ -222,6 +224,12 @@ impl DumpSetup { Self { tx, mount_config, _poller, log } } + /// Given the set of all managed disks, updates the dump device location + /// for logs and dumps. + /// + /// This function returns only once this request has been handled, which + /// can be used as a signal by callers that any "old disks" are no longer + /// being used by [DumpSetup]. pub(crate) async fn update_dumpdev_setup( &self, disks: impl Iterator, @@ -279,16 +287,22 @@ impl DumpSetup { } } + let (tx, rx) = oneshot::channel(); if let Err(err) = self .tx .send(DumpSetupCmd::UpdateDumpdevSetup { dump_slices: m2_dump_slices, debug_datasets: u2_debug_datasets, core_datasets: m2_core_datasets, + update_complete_tx: tx, }) .await { error!(log, "DumpSetup channel closed: {:?}", err.0); + }; + + if let Err(err) = rx.await { + error!(log, "DumpSetup failed to await update"; "err" => ?err); } } } @@ -504,6 +518,14 @@ impl DumpSetupWorker { async fn poll_file_archival(mut self) { info!(self.log, "DumpSetup poll loop started."); + + // A oneshot which helps callers track when updates have propagated. 
+ // + // This is particularly useful for disk expungement, when a caller + // wants to ensure that the dump device is no longer accessing an + // old device. + let mut evaluation_and_archiving_complete_tx = None; + loop { match tokio::time::timeout(ARCHIVAL_INTERVAL, self.rx.recv()).await { @@ -511,7 +533,10 @@ impl DumpSetupWorker { dump_slices, debug_datasets, core_datasets, + update_complete_tx, })) => { + evaluation_and_archiving_complete_tx = + Some(update_complete_tx); self.update_disk_loadout( dump_slices, debug_datasets, @@ -537,6 +562,12 @@ impl DumpSetupWorker { if let Err(err) = self.archive_files().await { error!(self.log, "Failed to archive debug/dump files: {err:?}"); } + + if let Some(tx) = evaluation_and_archiving_complete_tx.take() { + if let Err(err) = tx.send(()) { + error!(self.log, "DumpDevice failed to notify caller"; "err" => ?err); + } + } } } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index ec4d503e7b..38b97173fc 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -39,9 +39,10 @@ use omicron_common::api::internal::shared::{ NetworkInterface, SourceNatConfig, }; use omicron_common::backoff; +use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; use propolis_client::Client as PropolisClient; -use rand::prelude::SliceRandom; +use rand::prelude::IteratorRandom; use rand::SeedableRng; use sled_storage::dataset::ZONE_DATASET; use sled_storage::manager::StorageHandle; @@ -214,6 +215,9 @@ enum InstanceRequest { RequestZoneBundle { tx: oneshot::Sender>, }, + GetFilesystemPool { + tx: oneshot::Sender>, + }, CurrentState { tx: oneshot::Sender, }, @@ -227,6 +231,7 @@ enum InstanceRequest { tx: oneshot::Sender>, }, Terminate { + mark_failed: bool, tx: oneshot::Sender>, }, IssueSnapshotRequest { @@ -391,7 +396,8 @@ impl InstanceRunner { // of the sender alive in "self.tx_monitor". 
None => { warn!(self.log, "Instance 'VMM monitor' channel closed; shutting down"); - self.terminate().await; + let mark_failed = true; + self.terminate(mark_failed).await; }, } @@ -405,6 +411,10 @@ impl InstanceRunner { tx.send(self.request_zone_bundle().await) .map_err(|_| Error::FailedSendClientClosed) }, + Some(GetFilesystemPool { tx } ) => { + tx.send(self.get_filesystem_zpool()) + .map_err(|_| Error::FailedSendClientClosed) + }, Some(CurrentState{ tx }) => { tx.send(self.current_state()) .map_err(|_| Error::FailedSendClientClosed) @@ -424,9 +434,9 @@ impl InstanceRunner { ) .map_err(|_| Error::FailedSendClientClosed) }, - Some(Terminate { tx }) => { + Some(Terminate { mark_failed, tx }) => { tx.send(Ok(InstanceUnregisterResponse { - updated_runtime: Some(self.terminate().await) + updated_runtime: Some(self.terminate(mark_failed).await) })) .map_err(|_| Error::FailedSendClientClosed) }, @@ -449,7 +459,8 @@ impl InstanceRunner { }, None => { warn!(self.log, "Instance request channel closed; shutting down"); - self.terminate().await; + let mark_failed = false; + self.terminate(mark_failed).await; break; }, }; @@ -609,8 +620,8 @@ impl InstanceRunner { Some(InstanceAction::Destroy) => { info!(self.log, "terminating VMM that has exited"; "instance_id" => %self.id()); - - self.terminate().await; + let mark_failed = false; + self.terminate(mark_failed).await; Reaction::Terminate } None => Reaction::Continue, @@ -1059,6 +1070,17 @@ impl Instance { Ok(()) } + pub async fn get_filesystem_zpool( + &self, + ) -> Result, Error> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(InstanceRequest::GetFilesystemPool { tx }) + .await + .map_err(|_| Error::FailedSendChannelClosed)?; + Ok(rx.await?) + } + pub async fn current_state(&self) -> Result { let (tx, rx) = oneshot::channel(); self.tx @@ -1113,9 +1135,10 @@ impl Instance { pub async fn terminate( &self, tx: oneshot::Sender>, + mark_failed: bool, ) -> Result<(), Error> { self.tx - .send(InstanceRequest::Terminate { tx }) + .send(InstanceRequest::Terminate { mark_failed, tx }) .await .map_err(|_| Error::FailedSendChannelClosed)?; Ok(()) @@ -1180,6 +1203,13 @@ impl InstanceRunner { } } + fn get_filesystem_zpool(&self) -> Option { + let Some(run_state) = &self.running_state else { + return None; + }; + run_state.running_zone.root_zpool().map(|p| p.clone()) + } + fn current_state(&self) -> SledInstanceState { self.state.sled_instance_state() } @@ -1228,7 +1258,8 @@ impl InstanceRunner { // This case is morally equivalent to starting Propolis and then // rudely terminating it before asking it to do anything. Update // the VMM and instance states accordingly. - self.state.terminate_rudely(); + let mark_failed = false; + self.state.terminate_rudely(mark_failed); } setup_result?; } @@ -1255,7 +1286,8 @@ impl InstanceRunner { // this happens, generate an instance record bearing the // "Destroyed" state and return it to the caller. if self.running_state.is_none() { - self.terminate().await; + let mark_failed = false; + self.terminate(mark_failed).await; (None, None) } else { ( @@ -1343,20 +1375,22 @@ impl InstanceRunner { // configured VNICs. let zname = propolis_zone_name(self.propolis_id()); let mut rng = rand::rngs::StdRng::from_entropy(); - let root = self + let latest_disks = self .storage .get_latest_disks() .await - .all_u2_mountpoints(ZONE_DATASET) + .all_u2_mountpoints(ZONE_DATASET); + + let root = latest_disks + .into_iter() .choose(&mut rng) - .ok_or_else(|| Error::U2NotFound)? 
- .clone(); + .ok_or_else(|| Error::U2NotFound)?; let installed_zone = self .zone_builder_factory .builder() .with_log(self.log.clone()) .with_underlay_vnic_allocator(&self.vnic_allocator) - .with_zone_root_path(&root) + .with_zone_root_path(root) .with_zone_image_paths(&["/opt/oxide".into()]) .with_zone_type("propolis-server") .with_unique_name(self.propolis_id().into_untyped_uuid()) @@ -1453,9 +1487,9 @@ impl InstanceRunner { Ok(PropolisSetup { client, running_zone }) } - async fn terminate(&mut self) -> SledInstanceState { + async fn terminate(&mut self, mark_failed: bool) -> SledInstanceState { self.terminate_inner().await; - self.state.terminate_rudely(); + self.state.terminate_rudely(mark_failed); // This causes the "run" task to exit on the next iteration. self.should_terminate = true; diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index beeb8377d2..cfb96fb8c9 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -24,14 +24,16 @@ use illumos_utils::dladm::Etherstub; use illumos_utils::link::VnicAllocator; use illumos_utils::opte::PortManager; use illumos_utils::running_zone::ZoneBuilderFactory; +use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::InstanceRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; use sled_storage::manager::StorageHandle; +use sled_storage::resources::AllDisks; use slog::Logger; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashSet}; use std::net::SocketAddr; use std::sync::Arc; use tokio::sync::{mpsc, oneshot}; @@ -119,6 +121,7 @@ impl InstanceManager { instances: BTreeMap::new(), vnic_allocator: VnicAllocator::new("Instance", etherstub), port_manager, + storage_generation: None, storage, zone_bundler, zone_builder_factory, @@ -325,6 +328,23 @@ impl InstanceManager { .map_err(|_| Error::FailedSendInstanceManagerClosed)?; rx.await? } + + /// Marks instances failed unless they're using storage from `disks`. + /// + /// This function looks for transient zone filesystem usage on expunged + /// zpools. + pub async fn use_only_these_disks( + &self, + disks: AllDisks, + ) -> Result<(), Error> { + let (tx, rx) = oneshot::channel(); + self.inner + .tx + .send(InstanceManagerRequest::OnlyUseDisks { disks, tx }) + .await + .map_err(|_| Error::FailedSendInstanceManagerClosed)?; + rx.await? + } } // Most requests that can be sent to the "InstanceManagerRunner" task. @@ -384,6 +404,10 @@ enum InstanceManagerRequest { instance_id: InstanceUuid, tx: oneshot::Sender>, }, + OnlyUseDisks { + disks: AllDisks, + tx: oneshot::Sender>, + }, } // Requests that the instance manager stop processing information about a @@ -420,6 +444,7 @@ struct InstanceManagerRunner { vnic_allocator: VnicAllocator, port_manager: PortManager, + storage_generation: Option, storage: StorageHandle, zone_bundler: ZoneBundler, zone_builder_factory: ZoneBuilderFactory, @@ -494,6 +519,10 @@ impl InstanceManagerRunner { // the state... 
self.get_instance_state(tx, instance_id).await }, + Some(OnlyUseDisks { disks, tx } ) => { + self.use_only_these_disks(disks).await; + tx.send(Ok(())).map_err(|_| Error::FailedSendClientClosed) + }, None => { warn!(self.log, "InstanceManager's request channel closed; shutting down"); break; @@ -638,7 +667,8 @@ impl InstanceManagerRunner { // Otherwise, we pipeline the request, and send it to the instance, // where it can receive an appropriate response. - instance.terminate(tx).await?; + let mark_failed = false; + instance.terminate(tx, mark_failed).await?; Ok(()) } @@ -775,6 +805,56 @@ impl InstanceManagerRunner { tx.send(Ok(state)).map_err(|_| Error::FailedSendClientClosed)?; Ok(()) } + + async fn use_only_these_disks(&mut self, disks: AllDisks) { + // Consider the generation number on the incoming request to avoid + // applying old requests. + let requested_generation = *disks.generation(); + if let Some(last_gen) = self.storage_generation { + if last_gen >= requested_generation { + // This request looks old, ignore it. + info!(self.log, "use_only_these_disks: Ignoring request"; + "last_gen" => ?last_gen, "requested_gen" => ?requested_generation); + return; + } + } + self.storage_generation = Some(requested_generation); + info!(self.log, "use_only_these_disks: Processing new request"; + "gen" => ?requested_generation); + + let u2_set: HashSet<_> = disks.all_u2_zpools().into_iter().collect(); + + let mut to_remove = vec![]; + for (id, (_, instance)) in self.instances.iter() { + // If we can read the filesystem pool, consider it. Otherwise, move + // on, to prevent blocking the cleanup of other instances. + let Ok(Some(filesystem_pool)) = + instance.get_filesystem_zpool().await + else { + info!(self.log, "use_only_these_disks: Cannot read filesystem pool"; "instance_id" => ?id); + continue; + }; + if !u2_set.contains(&filesystem_pool) { + to_remove.push(*id); + } + } + + for id in to_remove { + info!(self.log, "use_only_these_disks: Removing instance"; "instance_id" => ?id); + if let Some((_, instance)) = self.instances.remove(&id) { + let (tx, rx) = oneshot::channel(); + let mark_failed = true; + if let Err(e) = instance.terminate(tx, mark_failed).await { + warn!(self.log, "use_only_these_disks: Failed to request instance removal"; "err" => ?e); + continue; + } + + if let Err(e) = rx.await { + warn!(self.log, "use_only_these_disks: Failed while removing instance"; "err" => ?e); + } + } + } + } } /// Represents membership of an instance in the [`InstanceManager`]. diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index faea94f552..e920ffc3fc 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -20,7 +20,7 @@ use crate::config::Config; use crate::hardware_monitor::HardwareMonitor; use crate::services::ServiceManager; use crate::sled_agent::SledAgent; -use crate::storage_monitor::StorageMonitor; +use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; use crate::zone_bundle::{CleanupContext, ZoneBundler}; use bootstore::schemes::v0 as bootstore; use key_manager::{KeyManager, StorageKeyRequester}; @@ -46,6 +46,10 @@ pub struct LongRunningTaskHandles { /// for establishing zpools on disks and managing their datasets. pub storage_manager: StorageHandle, + /// A mechanism for talking to the [`StorageMonitor`], which reacts to disk + /// changes and updates the dump devices. 
+ pub storage_monitor_handle: StorageMonitorHandle, + /// A mechanism for interacting with the hardware device tree pub hardware_manager: HardwareManager, @@ -71,7 +75,8 @@ pub async fn spawn_all_longrunning_tasks( let mut storage_manager = spawn_storage_manager(log, storage_key_requester.clone()); - spawn_storage_monitor(log, storage_manager.clone()); + let storage_monitor_handle = + spawn_storage_monitor(log, storage_manager.clone()); let nongimlet_observed_disks = config.nongimlet_observed_disks.clone().unwrap_or(vec![]); @@ -106,6 +111,7 @@ pub async fn spawn_all_longrunning_tasks( LongRunningTaskHandles { storage_key_requester, storage_manager, + storage_monitor_handle, hardware_manager, bootstore, zone_bundler, @@ -137,13 +143,17 @@ fn spawn_storage_manager( handle } -fn spawn_storage_monitor(log: &Logger, storage_handle: StorageHandle) { +fn spawn_storage_monitor( + log: &Logger, + storage_handle: StorageHandle, +) -> StorageMonitorHandle { info!(log, "Starting StorageMonitor"); - let storage_monitor = + let (storage_monitor, handle) = StorageMonitor::new(log, MountConfig::default(), storage_handle); tokio::spawn(async move { storage_monitor.run().await; }); + handle } async fn spawn_hardware_manager( diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index 40af604645..9451484f21 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -10,20 +10,21 @@ use nexus_client::types::{ BackgroundTasksActivateRequest, ProbeExternalIp, ProbeInfo, }; use omicron_common::api::external::{ - VpcFirewallRuleAction, VpcFirewallRuleDirection, VpcFirewallRulePriority, - VpcFirewallRuleStatus, + Generation, VpcFirewallRuleAction, VpcFirewallRuleDirection, + VpcFirewallRulePriority, VpcFirewallRuleStatus, }; use omicron_common::api::internal::shared::NetworkInterface; -use rand::prelude::SliceRandom; +use rand::prelude::IteratorRandom; use rand::SeedableRng; use sled_storage::dataset::ZONE_DATASET; use sled_storage::manager::StorageHandle; +use sled_storage::resources::AllDisks; use slog::{error, warn, Logger}; use std::collections::{HashMap, HashSet}; use std::hash::{Hash, Hasher}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::Mutex; +use tokio::sync::{Mutex, MutexGuard}; use tokio::task::JoinHandle; use tokio::time::sleep; use uuid::Uuid; @@ -45,6 +46,11 @@ pub(crate) struct ProbeManager { inner: Arc, } +struct RunningProbes { + storage_generation: Option, + zones: HashMap, +} + pub(crate) struct ProbeManagerInner { join_handle: Mutex>>, nexus_client: NexusClientWithResolver, @@ -53,7 +59,7 @@ pub(crate) struct ProbeManagerInner { vnic_allocator: VnicAllocator, storage: StorageHandle, port_manager: PortManager, - running_probes: Mutex>, + running_probes: Mutex, } impl ProbeManager { @@ -72,7 +78,10 @@ impl ProbeManager { VNIC_ALLOCATOR_SCOPE, etherstub, ), - running_probes: Mutex::new(HashMap::new()), + running_probes: Mutex::new(RunningProbes { + storage_generation: None, + zones: HashMap::new(), + }), nexus_client, log, sled_id, @@ -85,6 +94,51 @@ impl ProbeManager { pub(crate) async fn run(&self) { self.inner.run().await; } + + /// Removes any probes using filesystem roots on zpools that are not + /// contained in the set of "disks". 
+ pub(crate) async fn use_only_these_disks(&self, disks: &AllDisks) { + let u2_set: HashSet<_> = disks.all_u2_zpools().into_iter().collect(); + let mut probes = self.inner.running_probes.lock().await; + + // Consider the generation number on the incoming request to avoid + // applying old requests. + let requested_generation = *disks.generation(); + if let Some(last_gen) = probes.storage_generation { + if last_gen >= requested_generation { + // This request looks old, ignore it. + info!(self.inner.log, "use_only_these_disks: Ignoring request"; + "last_gen" => ?last_gen, "requested_gen" => ?requested_generation); + return; + } + } + probes.storage_generation = Some(requested_generation); + info!(self.inner.log, "use_only_these_disks: Processing new request"; + "gen" => ?requested_generation); + + let to_remove = probes + .zones + .iter() + .filter_map(|(id, probe)| { + let Some(probe_pool) = probe.root_zpool() else { + // No known pool for this probe + info!(self.inner.log, "use_only_these_disks: Cannot read filesystem pool"; "id" => ?id); + return None; + }; + + if !u2_set.contains(probe_pool) { + Some(*id) + } else { + None + } + }) + .collect::>(); + + for probe_id in to_remove { + info!(self.inner.log, "use_only_these_disks: Removing probe"; "probe_id" => ?probe_id); + self.inner.remove_probe_locked(&mut probes, probe_id).await; + } + } } /// State information about a probe. This is a common representation that @@ -226,14 +280,15 @@ impl ProbeManagerInner { /// boots the probe zone. async fn add_probe(self: &Arc, probe: &ProbeState) -> Result<()> { let mut rng = rand::rngs::StdRng::from_entropy(); - let root = self + let current_disks = self .storage .get_latest_disks() .await - .all_u2_mountpoints(ZONE_DATASET) + .all_u2_mountpoints(ZONE_DATASET); + let zone_root_path = current_disks + .into_iter() .choose(&mut rng) - .ok_or_else(|| anyhow!("u2 not found"))? - .clone(); + .ok_or_else(|| anyhow!("u2 not found"))?; let nic = probe .interface @@ -268,7 +323,7 @@ impl ProbeManagerInner { .builder() .with_log(self.log.clone()) .with_underlay_vnic_allocator(&self.vnic_allocator) - .with_zone_root_path(&root) + .with_zone_root_path(zone_root_path) .with_zone_image_paths(&["/opt/oxide".into()]) .with_zone_type("probe") .with_unique_name(probe.id) @@ -290,13 +345,13 @@ impl ProbeManagerInner { rz.ensure_address_for_port("overlay", 0).await?; info!(self.log, "started probe {}", probe.id); - self.running_probes.lock().await.insert(probe.id, rz); + self.running_probes.lock().await.zones.insert(probe.id, rz); Ok(()) } /// Remove a set of probes from this sled. - async fn remove<'a, I>(self: &Arc, probes: I) + async fn remove<'a, I>(&self, probes: I) where I: Iterator, { @@ -308,8 +363,17 @@ impl ProbeManagerInner { /// Remove a probe from this sled. This tears down the zone and it's /// network resources. 
- async fn remove_probe(self: &Arc, id: Uuid) { - match self.running_probes.lock().await.remove(&id) { + async fn remove_probe(&self, id: Uuid) { + let mut probes = self.running_probes.lock().await; + self.remove_probe_locked(&mut probes, id).await + } + + async fn remove_probe_locked( + &self, + probes: &mut MutexGuard<'_, RunningProbes>, + id: Uuid, + ) { + match probes.zones.remove(&id) { Some(mut running_zone) => { for l in running_zone.links_mut() { if let Err(e) = l.delete() { diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index f4e9f8da0a..6bf8a4fbe5 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -57,7 +57,7 @@ use illumos_utils::running_zone::{ }; use illumos_utils::zfs::ZONE_ZFS_RAMDISK_DATASET_MOUNTPOINT; use illumos_utils::zone::AddressRequest; -use illumos_utils::zpool::ZpoolName; +use illumos_utils::zpool::{PathInPool, ZpoolName}; use illumos_utils::{execute, PFEXEC}; use internal_dns::resolver::Resolver; use itertools::Itertools; @@ -445,6 +445,11 @@ impl OmicronZonesConfigLocal { /// Combines the Nexus-provided `OmicronZoneConfig` (which describes what Nexus /// wants for this zone) with any locally-determined configuration (like the /// path to the root filesystem) +// +// NOTE: Although the path to the root filesystem is not exactly equal to the +// ZpoolName, it is derivable from it, and the ZpoolName for the root filesystem +// is now being supplied as a part of OmicronZoneConfig. Therefore, this struct +// is less necessary than it has been historically. #[derive( Clone, Debug, @@ -551,10 +556,15 @@ impl<'a> ZoneArgs<'a> { } /// Return the root filesystem path for this zone - pub fn root(&self) -> &Utf8Path { + pub fn root(&self) -> PathInPool { match self { - ZoneArgs::Omicron(zone_config) => &zone_config.root, - ZoneArgs::Switch(zone_request) => &zone_request.root, + ZoneArgs::Omicron(zone_config) => PathInPool { + pool: zone_config.zone.filesystem_pool.clone(), + path: zone_config.root.clone(), + }, + ZoneArgs::Switch(zone_request) => { + PathInPool { pool: None, path: zone_request.root.clone() } + } } } } @@ -1436,7 +1446,7 @@ impl ServiceManager { let all_disks = self.inner.storage.get_latest_disks().await; if let Some((_, boot_zpool)) = all_disks.boot_disk() { zone_image_paths.push(boot_zpool.dataset_mountpoint( - &all_disks.mount_config.root, + &all_disks.mount_config().root, INSTALL_DATASET, )); } @@ -1462,7 +1472,7 @@ impl ServiceManager { let installed_zone = zone_builder .with_log(self.inner.log.clone()) .with_underlay_vnic_allocator(&self.inner.underlay_vnic_allocator) - .with_zone_root_path(&request.root()) + .with_zone_root_path(request.root()) .with_zone_image_paths(zone_image_paths.as_slice()) .with_zone_type(&zone_type_str) .with_datasets(datasets.as_slice()) @@ -2904,7 +2914,8 @@ impl ServiceManager { ) .await?; - let config = OmicronZoneConfigLocal { zone: zone.clone(), root }; + let config = + OmicronZoneConfigLocal { zone: zone.clone(), root: root.path }; let runtime = self .initialize_zone( @@ -3172,7 +3183,7 @@ impl ServiceManager { // Collect information that's necessary to start new zones let storage = self.inner.storage.get_latest_disks().await; - let mount_config = &storage.mount_config; + let mount_config = storage.mount_config(); let all_u2_pools = storage.all_u2_zpools(); let time_is_synchronized = match self.timesync_get_locked(&existing_zones).await { @@ -3289,7 +3300,7 @@ impl ServiceManager { mount_config: &MountConfig, zone: &OmicronZoneConfig, all_u2_pools: &Vec, - ) -> Result { + ) 
-> Result { let name = zone.zone_name(); // If the caller has requested a specific durable dataset, @@ -3368,7 +3379,9 @@ impl ServiceManager { device: format!("zpool: {filesystem_pool}"), }); } - Ok(filesystem_pool.dataset_mountpoint(&mount_config.root, ZONE_DATASET)) + let path = filesystem_pool + .dataset_mountpoint(&mount_config.root, ZONE_DATASET); + Ok(PathInPool { pool: Some(filesystem_pool), path }) } pub async fn cockroachdb_initialize(&self) -> Result<(), Error> { diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index be6c63f53a..e94b3b4984 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -211,7 +211,8 @@ impl SimInstanceInner { InstanceStateRequested::Stopped => { match self.next_resting_state() { VmmState::Starting => { - self.state.terminate_rudely(); + let mark_failed = false; + self.state.terminate_rudely(mark_failed); } VmmState::Running => self.queue_graceful_stop(), // Idempotently allow requests to stop an instance that is @@ -363,7 +364,8 @@ impl SimInstanceInner { /// Simulates rude termination by moving the instance to the Destroyed state /// immediately and clearing the queue of pending state transitions. fn terminate(&mut self) -> SledInstanceState { - self.state.terminate_rudely(); + let mark_failed = false; + self.state.terminate_rudely(mark_failed); self.queue.clear(); self.destroyed = true; self.state.sled_instance_state() diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 82c16b0b8d..9832144791 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -27,6 +27,7 @@ use crate::params::{ }; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; +use crate::storage_monitor::StorageMonitorHandle; use crate::updates::{ConfigUpdates, UpdateManager}; use crate::vmm_reservoir::{ReservoirMode, VmmReservoirManager}; use crate::zone_bundle; @@ -123,6 +124,9 @@ pub enum Error { #[error("Error managing storage: {0}")] Storage(#[from] sled_storage::error::Error), + #[error("Error monitoring storage: {0}")] + StorageMonitor(#[from] crate::storage_monitor::Error), + #[error("Error updating: {0}")] Download(#[from] crate::updates::Error), @@ -277,6 +281,10 @@ struct SledAgentInner { // Component of Sled Agent responsible for storage and dataset management. storage: StorageHandle, + // Component of Sled Agent responsible for monitoring storage and updating + // dump devices. + storage_monitor: StorageMonitorHandle, + // Component of Sled Agent responsible for managing Propolis instances. instances: InstanceManager, @@ -562,6 +570,9 @@ impl SledAgent { subnet: request.body.subnet, start_request: request, storage: long_running_task_handles.storage_manager.clone(), + storage_monitor: long_running_task_handles + .storage_monitor_handle + .clone(), instances, probes, hardware: long_running_task_handles.hardware_manager.clone(), @@ -808,7 +819,60 @@ impl SledAgent { &self, config: OmicronPhysicalDisksConfig, ) -> Result { - Ok(self.storage().omicron_physical_disks_ensure(config).await?) + info!(self.log, "physical disks ensure"); + // Tell the storage subsystem which disks should be managed. + let disk_result = + self.storage().omicron_physical_disks_ensure(config).await?; + info!(self.log, "physical disks ensure: Updated storage"); + + // Grab a view of the latest set of disks, alongside a generation + // number. + // + // This generation is at LEAST as high as our last call through + // omicron_physical_disks_ensure. 
It may actually be higher, if a + // concurrent operation occurred. + // + // "latest_disks" has a generation number, which is important for other + // subcomponents of Sled Agent to consider. If multiple requests to + // ensure disks arrive concurrently, it's important to "only advance + // forward" as requested by Nexus. + // + // For example: if we receive the following requests concurrently: + // - Use Disks {A, B, C}, generation = 1 + // - Use Disks {A, B, C, D}, generation = 2 + // + // If we ignore generation numbers, it's possible that we start using + // "disk D" -- e.g., for instance filesystems -- and then immediately + // delete it when we process the request with "generation 1". + // + // By keeping these requests ordered, we prevent this thrashing, and + // ensure that we always progress towards the last-requested state. + let latest_disks = self.storage().get_latest_disks().await; + let our_gen = latest_disks.generation(); + info!(self.log, "physical disks ensure: Propagating new generation of disks"; "generation" => ?our_gen); + + // Ensure that the StorageMonitor, and the dump devices, have committed + // to start using new disks and stop using old ones. + self.inner.storage_monitor.await_generation(*our_gen).await?; + info!(self.log, "physical disks ensure: Updated storage monitor"); + + // Ensure that the ZoneBundler, if it was creating a bundle referencing + // the old U.2s, has stopped using them. + self.inner.zone_bundler.await_completion_of_prior_bundles().await; + info!(self.log, "physical disks ensure: Updated zone bundler"); + + // Ensure that all probes, at least after our call to + // "omicron_physical_disks_ensure", stop using any disks that + // may have been in-service from before that request. + self.inner.probes.use_only_these_disks(&latest_disks).await; + info!(self.log, "physical disks ensure: Updated probes"); + + // Do the same for instances - mark them failed if they were using + // expunged disks. + self.inner.instances.use_only_these_disks(latest_disks).await?; + info!(self.log, "physical disks ensure: Updated instances"); + + Ok(disk_result) } /// List the Omicron zone configuration that's currently running diff --git a/sled-agent/src/storage_monitor.rs b/sled-agent/src/storage_monitor.rs index 8cb63e31f8..11883adcd2 100644 --- a/sled-agent/src/storage_monitor.rs +++ b/sled-agent/src/storage_monitor.rs @@ -7,10 +7,18 @@ //! code. use crate::dump_setup::DumpSetup; +use omicron_common::api::external::Generation; use sled_storage::config::MountConfig; use sled_storage::manager::StorageHandle; use sled_storage::resources::AllDisks; use slog::Logger; +use tokio::sync::watch; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Storage Monitor no longer running")] + NotRunning, +} pub struct StorageMonitor { log: Logger, @@ -18,6 +26,46 @@ pub struct StorageMonitor { // Invokes dumpadm(8) and savecore(8) when new disks are encountered dump_setup: DumpSetup, + + tx: watch::Sender, +} + +/// Emits status about storage monitoring. +#[derive(Debug, Clone)] +pub struct StorageMonitorStatus { + /// The latest generation of physical disks to be processed + /// by the storage monitor. 
+ pub latest_gen: Option, +} + +impl StorageMonitorStatus { + fn new() -> Self { + Self { latest_gen: None } + } +} + +#[derive(Clone)] +pub struct StorageMonitorHandle { + rx: watch::Receiver, +} + +impl StorageMonitorHandle { + pub async fn await_generation( + &self, + wanted: Generation, + ) -> Result<(), Error> { + self.rx + .clone() + .wait_for(|status| { + let Some(observed) = status.latest_gen else { + return false; + }; + return observed >= wanted; + }) + .await + .map_err(|_| Error::NotRunning)?; + Ok(()) + } } impl StorageMonitor { @@ -25,10 +73,14 @@ impl StorageMonitor { log: &Logger, mount_config: MountConfig, storage_manager: StorageHandle, - ) -> StorageMonitor { + ) -> (StorageMonitor, StorageMonitorHandle) { let dump_setup = DumpSetup::new(&log, mount_config); let log = log.new(o!("component" => "StorageMonitor")); - StorageMonitor { log, storage_manager, dump_setup } + let (tx, rx) = watch::channel(StorageMonitorStatus::new()); + ( + StorageMonitor { log, storage_manager, dump_setup, tx }, + StorageMonitorHandle { rx }, + ) } /// Run the main receive loop of the `StorageMonitor` @@ -50,10 +102,14 @@ impl StorageMonitor { } async fn handle_resource_update(&mut self, updated_disks: AllDisks) { + let generation = updated_disks.generation(); self.dump_setup .update_dumpdev_setup( updated_disks.iter_managed().map(|(_id, disk)| disk), ) .await; + self.tx.send_replace(StorageMonitorStatus { + latest_gen: Some(*generation), + }); } } diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 16147e5957..088e7b356f 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -256,6 +256,9 @@ impl Inner { // exist; and returns those. async fn bundle_directories(&self) -> Vec { let resources = self.storage_handle.get_latest_disks().await; + // NOTE: These bundle directories are always stored on M.2s, so we don't + // need to worry about synchronizing with U.2 disk expungement at the + // callsite. let expected = resources.all_zone_bundle_directories(); let mut out = Vec::with_capacity(expected.len()); for each in expected.into_iter() { @@ -426,12 +429,17 @@ impl ZoneBundler { zone: &RunningZone, cause: ZoneBundleCause, ) -> Result { + // NOTE: [Self::await_completion_of_prior_bundles] relies on this lock + // being held across this whole function. If we want more concurrency, + // we'll need to add a barrier-like mechanism to let callers know when + // prior bundles have completed. let inner = self.inner.lock().await; let storage_dirs = inner.bundle_directories().await; let resources = inner.storage_handle.get_latest_disks().await; let extra_log_dirs = resources .all_u2_mountpoints(U2_DEBUG_DATASET) .into_iter() + .map(|pool_path| pool_path.path) .collect(); let context = ZoneBundleContext { cause, storage_dirs, extra_log_dirs }; info!( @@ -443,6 +451,14 @@ impl ZoneBundler { create(&self.log, zone, &context).await } + /// Awaits the completion of all prior calls to [ZoneBundler::create]. + /// + /// This is critical for disk expungement, which wants to ensure that the + /// Sled Agent is no longer using devices after they have been expunged. + pub async fn await_completion_of_prior_bundles(&self) { + let _ = self.inner.lock().await; + } + /// Return the paths for all bundles of the provided zone and ID. 
pub async fn bundle_paths( &self, diff --git a/sled-storage/src/manager.rs b/sled-storage/src/manager.rs index d374ab8e23..e081bc5034 100644 --- a/sled-storage/src/manager.rs +++ b/sled-storage/src/manager.rs @@ -584,7 +584,7 @@ impl StorageManager { // Identify which disks should be managed by the control // plane, and adopt all requested disks into the control plane // in a background task (see: [Self::manage_disks]). - self.resources.set_config(&ledger.data().disks); + self.resources.set_config(&ledger.data()); } else { info!(self.log, "KeyManager ready, but no ledger detected"); } @@ -681,7 +681,7 @@ impl StorageManager { // Identify which disks should be managed by the control // plane, and adopt all requested disks into the control plane. - self.resources.set_config(&config.disks); + self.resources.set_config(&config); // Actually try to "manage" those disks, which may involve formatting // zpools and conforming partitions to those expected by the control @@ -825,7 +825,7 @@ mod tests { use crate::dataset::DatasetKind; use crate::disk::RawSyntheticDisk; use crate::manager_test_harness::StorageManagerTestHarness; - use crate::resources::{DiskManagementError, ManagedDisk}; + use crate::resources::DiskManagementError; use super::*; use camino_tempfile::tempdir_in; @@ -999,21 +999,17 @@ mod tests { // Now let's verify we saw the correct firmware update. for rd in &raw_disks { - let managed = - all_disks_gen2.values.get(rd.identity()).expect("disk exists"); - match managed { - ManagedDisk::ExplicitlyManaged(disk) - | ManagedDisk::ImplicitlyManaged(disk) => { - assert_eq!( - disk.firmware(), - rd.firmware(), - "didn't see firmware update" - ); - } - ManagedDisk::Unmanaged(disk) => { - assert_eq!(disk, rd, "didn't see firmware update"); - } - } + let firmware = all_disks_gen2 + .iter_all() + .find_map(|(identity, _, _, fw)| { + if identity == rd.identity() { + Some(fw) + } else { + None + } + }) + .expect("disk exists"); + assert_eq!(firmware, rd.firmware(), "didn't see firmware update"); } harness.cleanup().await; @@ -1236,7 +1232,8 @@ mod tests { let expected: HashSet<_> = disks.iter().skip(1).take(3).map(|d| d.identity()).collect(); - let actual: HashSet<_> = all_disks.values.keys().collect(); + let actual: HashSet<_> = + all_disks.iter_all().map(|(identity, _, _, _)| identity).collect(); assert_eq!(expected, actual); // Ensure the same set of disks and make sure no change occurs @@ -1251,7 +1248,10 @@ mod tests { .await .unwrap(); let all_disks2 = harness.handle().get_latest_disks().await; - assert_eq!(all_disks.values, all_disks2.values); + assert_eq!( + all_disks.iter_all().collect::>(), + all_disks2.iter_all().collect::>() + ); // Add a disjoint set of disks and see that only they come through harness @@ -1266,7 +1266,8 @@ mod tests { let all_disks = harness.handle().get_latest_disks().await; let expected: HashSet<_> = disks.iter().skip(4).take(5).map(|d| d.identity()).collect(); - let actual: HashSet<_> = all_disks.values.keys().collect(); + let actual: HashSet<_> = + all_disks.iter_all().map(|(identity, _, _, _)| identity).collect(); assert_eq!(expected, actual); harness.cleanup().await; diff --git a/sled-storage/src/resources.rs b/sled-storage/src/resources.rs index 5cc4672e1e..f02f62e0a6 100644 --- a/sled-storage/src/resources.rs +++ b/sled-storage/src/resources.rs @@ -6,12 +6,16 @@ use crate::config::MountConfig; use crate::dataset::{DatasetError, M2_DEBUG_DATASET}; -use crate::disk::{Disk, DiskError, OmicronPhysicalDiskConfig, RawDisk}; +use crate::disk::{ + Disk, DiskError, 
OmicronPhysicalDiskConfig, OmicronPhysicalDisksConfig, + RawDisk, +}; use crate::error::Error; use camino::Utf8PathBuf; use cfg_if::cfg_if; -use illumos_utils::zpool::ZpoolName; +use illumos_utils::zpool::{PathInPool, ZpoolName}; use key_manager::StorageKeyRequester; +use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; use omicron_uuid_kinds::ZpoolUuid; use schemars::JsonSchema; @@ -102,7 +106,7 @@ impl DisksManagementResult { // the request of the broader control plane. This enum encompasses that duality, // by representing all disks that can exist, managed or not. #[derive(Debug, Clone, PartialEq, Eq)] -pub enum ManagedDisk { +pub(crate) enum ManagedDisk { // A disk explicitly managed by the control plane. // // This includes U.2s which Nexus has told us to format and use. @@ -121,6 +125,11 @@ pub enum ManagedDisk { Unmanaged(RawDisk), } +#[derive(Debug, Clone, Eq, PartialEq)] +struct AllDisksInner { + values: BTreeMap, +} + /// The disks, keyed by their identity, managed by the sled agent. /// /// This state is owned by [`crate::manager::StorageManager`], through @@ -139,16 +148,28 @@ pub enum ManagedDisk { /// gets cloned or dropped. #[derive(Debug, Clone, Eq, PartialEq)] pub struct AllDisks { - pub values: Arc>, - pub mount_config: MountConfig, + // This generation corresponds to the generation supplied in + // [OmicronPhysicalDisksConfig]. + generation: Generation, + inner: Arc, + mount_config: MountConfig, } impl AllDisks { + /// Returns the latest generation number of this set of disks. + pub fn generation(&self) -> &Generation { + &self.generation + } + + pub fn mount_config(&self) -> &MountConfig { + &self.mount_config + } + /// Returns the identity of the boot disk. /// /// If this returns `None`, we have not processed the boot disk yet. pub fn boot_disk(&self) -> Option<(DiskIdentity, ZpoolName)> { - for (id, disk) in self.values.iter() { + for (id, disk) in self.inner.values.iter() { if let ManagedDisk::ImplicitlyManaged(disk) = disk { if disk.is_boot_disk() { return Some((id.clone(), disk.zpool_name().clone())); @@ -179,18 +200,21 @@ impl AllDisks { } /// Returns all mountpoints within all U.2s for a particular dataset. - pub fn all_u2_mountpoints(&self, dataset: &str) -> Vec { + pub fn all_u2_mountpoints(&self, dataset: &str) -> Vec { self.all_u2_zpools() - .iter() - .map(|zpool| { - zpool.dataset_mountpoint(&self.mount_config.root, dataset) + .into_iter() + .map(|pool| { + let path = + pool.dataset_mountpoint(&self.mount_config.root, dataset); + PathInPool { pool: Some(pool), path } }) .collect() } /// Returns all zpools managed by the control plane pub fn get_all_zpools(&self) -> Vec<(ZpoolName, DiskVariant)> { - self.values + self.inner + .values .values() .filter_map(|disk| match disk { ManagedDisk::ExplicitlyManaged(disk) @@ -206,7 +230,8 @@ impl AllDisks { // // Only returns zpools from disks actively being managed. fn all_zpools(&self, variant: DiskVariant) -> Vec { - self.values + self.inner + .values .values() .filter_map(|disk| match disk { ManagedDisk::ExplicitlyManaged(disk) @@ -231,7 +256,7 @@ impl AllDisks { /// Returns an iterator over all managed disks. 
pub fn iter_managed(&self) -> impl Iterator { - self.values.iter().filter_map(|(identity, disk)| match disk { + self.inner.values.iter().filter_map(|(identity, disk)| match disk { ManagedDisk::ExplicitlyManaged(disk) => Some((identity, disk)), ManagedDisk::ImplicitlyManaged(disk) => Some((identity, disk)), _ => None, @@ -243,7 +268,7 @@ impl AllDisks { &self, ) -> impl Iterator { - self.values.iter().map(|(identity, disk)| match disk { + self.inner.values.iter().map(|(identity, disk)| match disk { ManagedDisk::ExplicitlyManaged(disk) => { (identity, disk.variant(), disk.slot(), disk.firmware()) } @@ -284,8 +309,11 @@ impl StorageResources { mount_config: MountConfig, key_requester: StorageKeyRequester, ) -> Self { - let disks = - AllDisks { values: Arc::new(BTreeMap::new()), mount_config }; + let disks = AllDisks { + generation: Generation::new(), + inner: Arc::new(AllDisksInner { values: BTreeMap::new() }), + mount_config, + }; Self { log: log.new(o!("component" => "StorageResources")), key_requester, @@ -310,8 +338,14 @@ impl StorageResources { /// Does not attempt to manage any of the physical disks previously /// observed. To synchronize the "set of requested disks" with the "set of /// observed disks", call [Self::synchronize_disk_management]. - pub fn set_config(&mut self, config: &Vec) { + pub fn set_config(&mut self, config: &OmicronPhysicalDisksConfig) { + let our_gen = &mut self.disks.generation; + if *our_gen > config.generation { + return; + } + *our_gen = config.generation; self.control_plane_disks = config + .disks .iter() .map(|disk| (disk.identity.clone(), disk.clone())) .collect(); @@ -336,14 +370,14 @@ impl StorageResources { &mut self, ) -> DisksManagementResult { let mut updated = false; - let disks = Arc::make_mut(&mut self.disks.values); + let disks = Arc::make_mut(&mut self.disks.inner); info!(self.log, "Synchronizing disk managment"); // "Unmanage" all disks no longer requested by the control plane. // // This updates the reported sets of "managed" disks, and performs no // other modifications to the underlying storage. - for (identity, managed_disk) in &mut *disks { + for (identity, managed_disk) in &mut disks.values { match managed_disk { // This leaves the presence of the disk still in "Self", but // downgrades the disk to an unmanaged status. @@ -365,7 +399,7 @@ impl StorageResources { // configuration. let mut result = DisksManagementResult::default(); for (identity, config) in &self.control_plane_disks { - let Some(managed_disk) = disks.get_mut(identity) else { + let Some(managed_disk) = disks.values.get_mut(identity) else { warn!( self.log, "Control plane disk requested, but not detected within sled"; @@ -496,11 +530,11 @@ impl StorageResources { // This is a trade-off for simplicity even though we may be potentially // cloning data before we know if there is a write action to perform. - let disks = Arc::make_mut(&mut self.disks.values); + let disks = Arc::make_mut(&mut self.disks.inner); // First check if there are any updates we need to apply to existing // managed disks. - if let Some(managed) = disks.get_mut(&disk_identity) { + if let Some(managed) = disks.values.get_mut(&disk_identity) { let mut updated = false; match managed { ManagedDisk::ExplicitlyManaged(mdisk) @@ -532,7 +566,9 @@ impl StorageResources { // If there's no update then we are inserting a new disk. 
match disk.variant() { DiskVariant::U2 => { - disks.insert(disk_identity, ManagedDisk::Unmanaged(disk)); + disks + .values + .insert(disk_identity, ManagedDisk::Unmanaged(disk)); } DiskVariant::M2 => { let managed_disk = Disk::new( @@ -543,12 +579,13 @@ impl StorageResources { Some(&self.key_requester), ) .await?; - disks.insert( + disks.values.insert( disk_identity, ManagedDisk::ImplicitlyManaged(managed_disk), ); } } + self.disk_updates.send_replace(self.disks.clone()); Ok(()) @@ -562,7 +599,7 @@ impl StorageResources { /// are only added once. pub(crate) fn remove_disk(&mut self, id: &DiskIdentity) { info!(self.log, "Removing disk"; "identity" => ?id); - let Some(entry) = self.disks.values.get(id) else { + let Some(entry) = self.disks.inner.values.get(id) else { info!(self.log, "Disk not found by id, exiting"; "identity" => ?id); return; }; @@ -589,7 +626,9 @@ impl StorageResources { } // Safe to unwrap as we just checked the key existed above - Arc::make_mut(&mut self.disks.values).remove(id).unwrap(); + let disks = Arc::make_mut(&mut self.disks.inner); + disks.values.remove(id).unwrap(); + self.disk_updates.send_replace(self.disks.clone()); } } From 748a1d7b46279de5b7ccc5f3c1a98bbb091df887 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 15 Jul 2024 16:55:48 -0700 Subject: [PATCH 22/27] [nexus] Expunge disk internal API, omdb commands (#5994) Provides an internal API to remove disks, and wires it into omdb. Additionally, expands omdb commands for visibility. - `omdb db physical-disks` can be used to view all "control plane physical disks". This is similar to, but distinct from, the `omdb db inventory physical-disks` command, as it reports control plane disks that have been adopted in the control plane. This command is necessary for identifying the UUID of the associated control plane object, which is not observable via inventory. - `omdb nexus sleds expunge-disk` can be used to expunge a physical disk from a sled. This relies on many prior patches to operate correctly, but with the combination of: #5987, #5965, #5931, #5952, #5601, #5599, we can observe the following behavior: expunging a disk leads to all "users" of that disk (zone filesystems, datasets, zone bundles, etc) being removed. I tested this PR on a4x2 using the following steps: ```bash # Boot a4x2, confirm the Nexus zone is running # From g0, zlogin oxz_switch $ omdb db sleds SERIAL IP ROLE POLICY STATE ID g2 [fd00:1122:3344:103::1]:12345 - in service active 29fede5f-37e4-4528-bcf2-f3ee94924894 g0 [fd00:1122:3344:101::1]:12345 scrimlet in service active 6a2c7019-d055-4256-8bad-042b97aa0e5e g1 [fd00:1122:3344:102::1]:12345 - in service active a611b43e-3995-4cd4-9603-89ca6aca3dc5 g3 [fd00:1122:3344:104::1]:12345 scrimlet in service active f62f2cfe-d17b-4bd6-ae64-57e8224d3672 # We'll plan on expunging a disk on g1, and observing the effects. 
$ export SLED_ID=a611b43e-3995-4cd4-9603-89ca6aca3dc5 $ export OMDB_SLED_AGENT_URL=http://[fd00:1122:3344:102::1]:12345 $ omdb sled-agent zones list "oxz_cockroachdb_b3fecda8-2eb8-4ff3-9cf6-90c94fba7c50" "oxz_crucible_19831c98-3137-4af4-a93d-fc1a17c138f2" "oxz_crucible_6adcb8ec-6c9e-4e8a-a8d4-bbf9ad44e2c4" "oxz_crucible_74b2f587-10ce-4131-97fd-9832c52c8a41" "oxz_crucible_9e422508-f4d5-4c24-8dde-0080c0916419" "oxz_crucible_a47e9625-d189-4001-877a-cc3aa5b1f3eb" "oxz_crucible_pantry_c3b4e3cb-3e23-4f5e-921b-04e4801924fd" "oxz_external_dns_7e669b6f-a3fe-47a9-addd-20e42c58b8bb" "oxz_internal_dns_1a45a6e8-5b03-4ab4-a3db-e83fb7767767" "oxz_ntp_209ad0d0-a5e7-4ab8-ac8f-e99902697b32" "oxz_oximeter_864efebb-790f-4b7a-8377-b2c82c87f5b8" $ omdb db physical-disks | grep $SLED_ID ID SERIAL VENDOR MODEL SLED_ID POLICY STATE 23524716-a331-4d57-aa71-8bd4dbc916f8 synthetic-serial-g1_0 synthetic-vendor synthetic-model-U2 a611b43e-3995-4cd4-9603-89ca6aca3dc5 in service active 3ca1812b-55e3-47ed-861f-f667f626c8a0 synthetic-serial-g1_3 synthetic-vendor synthetic-model-U2 a611b43e-3995-4cd4-9603-89ca6aca3dc5 in service active 40139afb-7076-45d9-84cf-b96eefe7acf8 synthetic-serial-g1_1 synthetic-vendor synthetic-model-U2 a611b43e-3995-4cd4-9603-89ca6aca3dc5 in service active 5c8e33dd-1230-4214-af78-9be892d9f421 synthetic-serial-g1_4 synthetic-vendor synthetic-model-U2 a611b43e-3995-4cd4-9603-89ca6aca3dc5 in service active 85780bbf-8e2d-481e-9013-34611572f191 synthetic-serial-g1_2 synthetic-vendor synthetic-model-U2 a611b43e-3995-4cd4-9603-89ca6aca3dc5 in service active # Let's expunge the "0th" disk here. $ omdb nexus sleds expunge-disk 23524716-a331-4d57-aa71-8bd4dbc916f8 -w $ omdb nexus blueprints regenerate -w $ omdb nexus blueprints show $NEW_BLUEPRINT_ID # Observe that the new blueprint for the sled expunges some zones -- minimally, # the Crucible zone -- and no longer lists the "g1_0" disk. This should also be # summarized in the blueprint metadata comment. $ omdb nexus blueprints target set $NEW_BLUEPRINT_ID enabled -w $ omdb sled-agent zones list zones: "oxz_crucible_19831c98-3137-4af4-a93d-fc1a17c138f2" "oxz_crucible_74b2f587-10ce-4131-97fd-9832c52c8a41" "oxz_crucible_9e422508-f4d5-4c24-8dde-0080c0916419" "oxz_crucible_a47e9625-d189-4001-877a-cc3aa5b1f3eb" "oxz_crucible_pantry_c3b4e3cb-3e23-4f5e-921b-04e4801924fd" "oxz_ntp_209ad0d0-a5e7-4ab8-ac8f-e99902697b32" "oxz_oximeter_864efebb-790f-4b7a-8377-b2c82c87f5b8" # As we can see, the expunged zones have been removed. # We can also access the sled agent logs from g1 to observe that the expected requests have been sent # to adjust the set of control plane disks and expunge the expected zones. 
``` This is a major part of https://github.com/oxidecomputer/omicron/issues/4719 Fixes https://github.com/oxidecomputer/omicron/issues/5370 --- dev-tools/omdb/src/bin/omdb/db.rs | 94 ++++++++- dev-tools/omdb/src/bin/omdb/nexus.rs | 197 +++++++++++++++--- dev-tools/omdb/tests/usage_errors.out | 9 +- nexus/db-model/src/physical_disk.rs | 72 +++++++ .../src/db/datastore/physical_disk.rs | 5 +- nexus/internal-api/src/lib.rs | 17 +- nexus/src/app/sled.rs | 29 ++- nexus/src/external_api/http_entrypoints.rs | 2 +- nexus/src/internal_api/http_entrypoints.rs | 19 ++ nexus/types/src/deployment/planning_input.rs | 56 ++++- openapi/nexus-internal.json | 41 ++++ 11 files changed, 497 insertions(+), 44 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index f0f7be0b83..44b34b0220 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -62,6 +62,7 @@ use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; use nexus_db_model::NetworkInterface; use nexus_db_model::NetworkInterfaceKind; +use nexus_db_model::PhysicalDisk; use nexus_db_model::Probe; use nexus_db_model::Project; use nexus_db_model::Region; @@ -96,7 +97,10 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; +use nexus_types::deployment::DiskFilter; use nexus_types::deployment::SledFilter; +use nexus_types::external_api::views::PhysicalDiskPolicy; +use nexus_types::external_api::views::PhysicalDiskState; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledState; use nexus_types::identity::Resource; @@ -281,12 +285,14 @@ pub struct DbFetchOptions { enum DbCommands { /// Print information about the rack Rack(RackArgs), - /// Print information about disks + /// Print information about virtual disks Disks(DiskArgs), /// Print information about internal and external DNS Dns(DnsArgs), /// Print information about collected hardware/software inventory Inventory(InventoryArgs), + /// Print information about physical disks + PhysicalDisks(PhysicalDisksArgs), /// Save the current Reconfigurator inputs to a file ReconfiguratorSave(ReconfiguratorSaveArgs), /// Print information about regions @@ -407,8 +413,8 @@ enum InventoryCommands { Cabooses, /// list and show details from particular collections Collections(CollectionsArgs), - /// show all physical disks every found - PhysicalDisks(PhysicalDisksArgs), + /// show all physical disks ever found + PhysicalDisks(InvPhysicalDisksArgs), /// list all root of trust pages ever found RotPages, } @@ -437,7 +443,7 @@ struct CollectionsShowArgs { } #[derive(Debug, Args, Clone, Copy)] -struct PhysicalDisksArgs { +struct InvPhysicalDisksArgs { #[clap(long)] collection_id: Option, @@ -445,6 +451,13 @@ struct PhysicalDisksArgs { sled_id: Option, } +#[derive(Debug, Args)] +struct PhysicalDisksArgs { + /// Show disks that match the given filter + #[clap(short = 'F', long, value_enum)] + filter: Option, +} + #[derive(Debug, Args)] struct ReconfiguratorSaveArgs { /// where to save the output @@ -611,6 +624,15 @@ impl DbArgs { ) .await } + DbCommands::PhysicalDisks(args) => { + cmd_db_physical_disks( + &opctx, + &datastore, + &self.fetch_opts, + args, + ) + .await + } DbCommands::ReconfiguratorSave(reconfig_save_args) => { cmd_db_reconfigurator_save( &opctx, @@ -1385,6 +1407,68 @@ async fn cmd_db_disk_physical( Ok(()) } +#[derive(Tabled)] +#[tabled(rename_all = 
"SCREAMING_SNAKE_CASE")] +struct PhysicalDiskRow { + id: Uuid, + serial: String, + vendor: String, + model: String, + sled_id: Uuid, + policy: PhysicalDiskPolicy, + state: PhysicalDiskState, +} + +impl From for PhysicalDiskRow { + fn from(d: PhysicalDisk) -> Self { + PhysicalDiskRow { + id: d.id(), + serial: d.serial.clone(), + vendor: d.vendor.clone(), + model: d.model.clone(), + sled_id: d.sled_id, + policy: d.disk_policy.into(), + state: d.disk_state.into(), + } + } +} + +/// Run `omdb db physical-disks`. +async fn cmd_db_physical_disks( + opctx: &OpContext, + datastore: &DataStore, + fetch_opts: &DbFetchOptions, + args: &PhysicalDisksArgs, +) -> Result<(), anyhow::Error> { + let limit = fetch_opts.fetch_limit; + let filter = match args.filter { + Some(filter) => filter, + None => { + eprintln!( + "note: listing all in-service disks \ + (use -F to filter, e.g. -F in-service)" + ); + DiskFilter::InService + } + }; + + let sleds = datastore + .physical_disk_list(&opctx, &first_page(limit), filter) + .await + .context("listing physical disks")?; + check_limit(&sleds, limit, || String::from("listing physical disks")); + + let rows = sleds.into_iter().map(|s| PhysicalDiskRow::from(s)); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(1, 1, 0, 0)) + .to_string(); + + println!("{}", table); + + Ok(()) +} + // SERVICES // Snapshots @@ -3187,7 +3271,7 @@ async fn cmd_db_inventory_cabooses( async fn cmd_db_inventory_physical_disks( conn: &DataStoreConnection<'_>, limit: NonZeroU32, - args: PhysicalDisksArgs, + args: InvPhysicalDisksArgs, ) -> Result<(), anyhow::Error> { #[derive(Tabled)] #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index fb74ddd89b..f699466505 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -24,6 +24,7 @@ use nexus_client::types::BackgroundTask; use nexus_client::types::BackgroundTasksActivateRequest; use nexus_client::types::CurrentStatus; use nexus_client::types::LastResult; +use nexus_client::types::PhysicalDiskPath; use nexus_client::types::SledSelector; use nexus_client::types::UninitializedSledId; use nexus_db_queries::db::lookup::LookupPath; @@ -33,6 +34,7 @@ use nexus_types::internal_api::background::RegionReplacementDriverStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use reedline::DefaultPrompt; use reedline::DefaultPromptSegment; @@ -256,6 +258,8 @@ enum SledsCommands { Add(SledAddArgs), /// Expunge a sled (DANGEROUS) Expunge(SledExpungeArgs), + /// Expunge a disk (DANGEROUS) + ExpungeDisk(DiskExpungeArgs), } #[derive(Debug, Args)] @@ -277,6 +281,17 @@ struct SledExpungeArgs { sled_id: SledUuid, } +#[derive(Debug, Args)] +struct DiskExpungeArgs { + // expunge is _extremely_ dangerous, so we also require a database + // connection to perform some safety checks + #[clap(flatten)] + db_url_opts: DbUrlOptions, + + /// Physical disk ID + physical_disk_id: PhysicalDiskUuid, +} + impl NexusArgs { /// Run a `omdb nexus` subcommand. 
pub(crate) async fn run_cmd( @@ -401,6 +416,13 @@ impl NexusArgs { let token = omdb.check_allow_destructive()?; cmd_nexus_sled_expunge(&client, args, omdb, log, token).await } + NexusCommands::Sleds(SledsArgs { + command: SledsCommands::ExpungeDisk(args), + }) => { + let token = omdb.check_allow_destructive()?; + cmd_nexus_sled_expunge_disk(&client, args, omdb, log, token) + .await + } } } } @@ -1458,6 +1480,39 @@ async fn cmd_nexus_sled_add( Ok(()) } +struct ConfirmationPrompt(Reedline); + +impl ConfirmationPrompt { + fn new() -> Self { + Self(Reedline::create()) + } + + fn read(&mut self, message: &str) -> Result { + let prompt = DefaultPrompt::new( + DefaultPromptSegment::Basic(message.to_string()), + DefaultPromptSegment::Empty, + ); + if let Ok(reedline::Signal::Success(input)) = self.0.read_line(&prompt) + { + Ok(input) + } else { + bail!("expungement aborted") + } + } + + fn read_and_validate( + &mut self, + message: &str, + expected: &str, + ) -> Result<(), anyhow::Error> { + let input = self.read(message)?; + if input != expected { + bail!("Aborting, input did not match expected value"); + } + Ok(()) + } +} + /// Runs `omdb nexus sleds expunge` async fn cmd_nexus_sled_expunge( client: &nexus_client::Client, @@ -1487,20 +1542,7 @@ async fn cmd_nexus_sled_expunge( .with_context(|| format!("failed to find sled {}", args.sled_id))?; // Helper to get confirmation messages from the user. - let mut line_editor = Reedline::create(); - let mut read_with_prompt = move |message: &str| { - let prompt = DefaultPrompt::new( - DefaultPromptSegment::Basic(message.to_string()), - DefaultPromptSegment::Empty, - ); - if let Ok(reedline::Signal::Success(input)) = - line_editor.read_line(&prompt) - { - Ok(input) - } else { - bail!("expungement aborted") - } - }; + let mut prompt = ConfirmationPrompt::new(); // Now check whether its sled-agent or SP were found in the most recent // inventory collection. @@ -1530,11 +1572,7 @@ async fn cmd_nexus_sled_expunge( proceed anyway?", args.sled_id, collection.time_done, ); - let confirm = read_with_prompt("y/N")?; - if confirm != "y" { - eprintln!("expungement not confirmed: aborting"); - return Ok(()); - } + prompt.read_and_validate("y/N", "y")?; } } None => { @@ -1552,11 +1590,7 @@ async fn cmd_nexus_sled_expunge( args.sled_id, sled.serial_number(), ); - let confirm = read_with_prompt("sled serial number")?; - if confirm != sled.serial_number() { - eprintln!("sled serial number not confirmed: aborting"); - return Ok(()); - } + prompt.read_and_validate("sled serial number", sled.serial_number())?; let old_policy = client .sled_expunge(&SledSelector { sled: args.sled_id.into_untyped_uuid() }) @@ -1569,3 +1603,118 @@ async fn cmd_nexus_sled_expunge( ); Ok(()) } + +/// Runs `omdb nexus sleds expunge-disk` +async fn cmd_nexus_sled_expunge_disk( + client: &nexus_client::Client, + args: &DiskExpungeArgs, + omdb: &Omdb, + log: &slog::Logger, + _destruction_token: DestructiveOperationToken, +) -> Result<(), anyhow::Error> { + use nexus_db_queries::context::OpContext; + + let datastore = args.db_url_opts.connect(omdb, log).await?; + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + let opctx = &opctx; + + // First, we need to look up the disk so we can lookup identity information. 
+ let (_authz_physical_disk, physical_disk) =
+ LookupPath::new(opctx, &datastore)
+ .physical_disk(args.physical_disk_id.into_untyped_uuid())
+ .fetch()
+ .await
+ .with_context(|| {
+ format!(
+ "failed to find physical disk {}",
+ args.physical_disk_id
+ )
+ })?;
+
+ // Helper to get confirmation messages from the user.
+ let mut prompt = ConfirmationPrompt::new();
+
+ // Now check whether its sled-agent was found in the most recent
+ // inventory collection.
+ match datastore
+ .inventory_get_latest_collection(opctx)
+ .await
+ .context("loading latest collection")?
+ {
+ Some(collection) => {
+ let disk_identity = omicron_common::disk::DiskIdentity {
+ vendor: physical_disk.vendor.clone(),
+ serial: physical_disk.serial.clone(),
+ model: physical_disk.model.clone(),
+ };
+
+ let mut sleds_containing_disk = vec![];
+
+ for (sled_id, sled_agent) in collection.sled_agents {
+ for sled_disk in sled_agent.disks {
+ if sled_disk.identity == disk_identity {
+ sleds_containing_disk.push(sled_id);
+ }
+ }
+ }
+
+ match sleds_containing_disk.len() {
+ 0 => {}
+ 1 => {
+ eprintln!(
+ "WARNING: physical disk {} is PRESENT in the most \
+ recent inventory collection (spotted at {}). Although \
+ expunging a running disk is supported, it is safer \
+ to expunge a disk from a system where it has been \
+ removed. Are you sure you want to proceed anyway?",
+ args.physical_disk_id, collection.time_done,
+ );
+ prompt.read_and_validate("y/N", "y")?;
+ }
+ _ => {
+ // This should be impossible due to a unique database index,
+ // "vendor_serial_model_unique".
+ //
+ // Even if someone tried moving a disk, it would need to be
+ // decommissioned before being re-commissioned elsewhere.
+ //
+ // However, we still print out an error message here in the
+ // (unlikely) event that it happens anyway.
+ eprintln!(
+ "ERROR: physical disk {} is PRESENT MULTIPLE TIMES in \
+ the most recent inventory collection (spotted at {}).
This should not be possible, and is an indication of a \
+ database issue.",
+ args.physical_disk_id, collection.time_done,
+ );
+ bail!("Physical Disk appeared on multiple sleds");
+ }
+ }
+ }
+ None => {
+ eprintln!(
+ "ERROR: cannot verify the physical disk inventory status \
+ because there are no inventory collections present. Please \
+ ensure that inventory may be collected."
+ );
+ bail!("No inventory");
+ }
+ }
+
+ eprintln!(
+ "WARNING: This operation will PERMANENTLY and IRRECOVERABLY mark physical disk \
+ {} ({}) expunged.
To proceed, type the physical disk's serial number.", + args.physical_disk_id, + physical_disk.serial, + ); + prompt.read_and_validate("disk serial number", &physical_disk.serial)?; + + client + .physical_disk_expunge(&PhysicalDiskPath { + disk_id: args.physical_disk_id.into_untyped_uuid(), + }) + .await + .context("expunging disk")?; + eprintln!("expunged disk {}", args.physical_disk_id); + Ok(()) +} diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index 8762907e81..3d6f2af112 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -105,9 +105,10 @@ Usage: omdb db [OPTIONS] Commands: rack Print information about the rack - disks Print information about disks + disks Print information about virtual disks dns Print information about internal and external DNS inventory Print information about collected hardware/software inventory + physical-disks Print information about physical disks reconfigurator-save Save the current Reconfigurator inputs to a file region Print information about regions region-replacement Query for information about region replacements, optionally manually @@ -146,9 +147,10 @@ Usage: omdb db [OPTIONS] Commands: rack Print information about the rack - disks Print information about disks + disks Print information about virtual disks dns Print information about internal and external DNS inventory Print information about collected hardware/software inventory + physical-disks Print information about physical disks reconfigurator-save Save the current Reconfigurator inputs to a file region Print information about regions region-replacement Query for information about region replacements, optionally manually @@ -185,7 +187,7 @@ termination: Exited(2) stdout: --------------------------------------------- stderr: -Print information about disks +Print information about virtual disks Usage: omdb db disks [OPTIONS] @@ -526,6 +528,7 @@ Commands: list-uninitialized List all uninitialized sleds add Add an uninitialized sled expunge Expunge a sled (DANGEROUS) + expunge-disk Expunge a disk (DANGEROUS) help Print this message or the help of the given subcommand(s) Options: diff --git a/nexus/db-model/src/physical_disk.rs b/nexus/db-model/src/physical_disk.rs index c6ef97ee1f..d4a1dcd33c 100644 --- a/nexus/db-model/src/physical_disk.rs +++ b/nexus/db-model/src/physical_disk.rs @@ -85,3 +85,75 @@ impl DatastoreCollectionConfig for PhysicalDisk { type CollectionTimeDeletedColumn = physical_disk::dsl::time_deleted; type CollectionIdColumn = zpool::dsl::sled_id; } + +mod diesel_util { + use diesel::{ + helper_types::{And, EqAny}, + prelude::*, + query_dsl::methods::FilterDsl, + }; + use nexus_types::{ + deployment::DiskFilter, + external_api::views::{PhysicalDiskPolicy, PhysicalDiskState}, + }; + + /// An extension trait to apply a [`DiskFilter`] to a Diesel expression. + /// + /// This is applicable to any Diesel expression which includes the `physical_disk` + /// table. + /// + /// This needs to live here, rather than in `nexus-db-queries`, because it + /// names the `DbPhysicalDiskPolicy` type which is private to this crate. + pub trait ApplyPhysicalDiskFilterExt { + type Output; + + /// Applies a [`DiskFilter`] to a Diesel expression. 
+ fn physical_disk_filter(self, filter: DiskFilter) -> Self::Output; + } + + impl ApplyPhysicalDiskFilterExt for E + where + E: FilterDsl, + { + type Output = E::Output; + + fn physical_disk_filter(self, filter: DiskFilter) -> Self::Output { + use crate::schema::physical_disk::dsl as physical_disk_dsl; + + // These are only boxed for ease of reference above. + let all_matching_policies: BoxedIterator< + crate::PhysicalDiskPolicy, + > = Box::new( + PhysicalDiskPolicy::all_matching(filter).map(Into::into), + ); + let all_matching_states: BoxedIterator = + Box::new( + PhysicalDiskState::all_matching(filter).map(Into::into), + ); + + FilterDsl::filter( + self, + physical_disk_dsl::disk_policy + .eq_any(all_matching_policies) + .and( + physical_disk_dsl::disk_state + .eq_any(all_matching_states), + ), + ) + } + } + + type BoxedIterator = Box>; + type PhysicalDiskFilterQuery = And< + EqAny< + crate::schema::physical_disk::disk_policy, + BoxedIterator, + >, + EqAny< + crate::schema::physical_disk::disk_state, + BoxedIterator, + >, + >; +} + +pub use diesel_util::ApplyPhysicalDiskFilterExt; diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index e51d59075e..11e056d19b 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -26,7 +26,8 @@ use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; -use nexus_types::deployment::SledFilter; +use nexus_db_model::ApplyPhysicalDiskFilterExt; +use nexus_types::deployment::{DiskFilter, SledFilter}; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; @@ -247,11 +248,13 @@ impl DataStore { &self, opctx: &OpContext, pagparams: &DataPageParams<'_, Uuid>, + disk_filter: DiskFilter, ) -> ListResultVec { opctx.authorize(authz::Action::Read, &authz::FLEET).await?; use db::schema::physical_disk::dsl; paginated(dsl::physical_disk, dsl::id, pagparams) .filter(dsl::time_deleted.is_null()) + .physical_disk_filter(disk_filter) .select(PhysicalDisk::as_select()) .load_async(&*self.pool_connection_authorized(opctx).await?) .await diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index b2d68036bb..b6de85486a 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -14,7 +14,7 @@ use nexus_types::{ Blueprint, BlueprintMetadata, BlueprintTarget, BlueprintTargetSet, }, external_api::{ - params::{SledSelector, UninitializedSledId}, + params::{PhysicalDiskPath, SledSelector, UninitializedSledId}, shared::{ProbeInfo, UninitializedSled}, views::SledPolicy, }, @@ -472,6 +472,21 @@ pub trait NexusInternalApi { sled: TypedBody, ) -> Result, HttpError>; + /// Mark a physical disk as expunged + /// + /// This is an irreversible process! It should only be called after + /// sufficient warning to the operator. + /// + /// This is idempotent. + #[endpoint { + method = POST, + path = "/physical-disk/expunge", + }] + async fn physical_disk_expunge( + rqctx: RequestContext, + disk: TypedBody, + ) -> Result; + /// Get all the probes associated with a given sled. 
#[endpoint { method = GET, diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index fd5341ae80..0165b2d261 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -13,7 +13,9 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::lookup; use nexus_db_queries::db::model::DatasetKind; +use nexus_types::deployment::DiskFilter; use nexus_types::deployment::SledFilter; +use nexus_types::external_api::views::PhysicalDiskPolicy; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledProvisionPolicy; use omicron_common::api::external::DataPageParams; @@ -186,7 +188,7 @@ impl super::Nexus { // Physical disks - pub async fn physical_disk_lookup<'a>( + pub fn physical_disk_lookup<'a>( &'a self, opctx: &'a OpContext, disk_selector: ¶ms::PhysicalDiskPath, @@ -211,7 +213,9 @@ impl super::Nexus { opctx: &OpContext, pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec { - self.db_datastore.physical_disk_list(&opctx, pagparams).await + self.db_datastore + .physical_disk_list(&opctx, pagparams, DiskFilter::InService) + .await } /// Upserts a physical disk into the database, updating it if it already exists. @@ -240,6 +244,27 @@ impl super::Nexus { Ok(()) } + /// Mark a physical disk as expunged + /// + /// This is an irreversible process! It should only be called after + /// sufficient warning to the operator. + pub(crate) async fn physical_disk_expunge( + &self, + opctx: &OpContext, + disk: params::PhysicalDiskPath, + ) -> Result<(), Error> { + let physical_disk_lookup = self.physical_disk_lookup(opctx, &disk)?; + let (authz_disk,) = + physical_disk_lookup.lookup_for(authz::Action::Modify).await?; + self.db_datastore + .physical_disk_update_policy( + opctx, + authz_disk.id(), + PhysicalDiskPolicy::Expunged.into(), + ) + .await + } + // Zpools (contained within sleds) /// Upserts a Zpool into the database, updating it if it already exists. 
diff --git a/nexus/src/external_api/http_entrypoints.rs b/nexus/src/external_api/http_entrypoints.rs index d23f0d035a..9d616c7e9c 100644 --- a/nexus/src/external_api/http_entrypoints.rs +++ b/nexus/src/external_api/http_entrypoints.rs @@ -6142,7 +6142,7 @@ async fn physical_disk_view( let opctx = crate::context::op_context_for_external_api(&rqctx).await?; let (.., physical_disk) = - nexus.physical_disk_lookup(&opctx, &path).await?.fetch().await?; + nexus.physical_disk_lookup(&opctx, &path)?.fetch().await?; Ok(HttpResponseOk(physical_disk.into())) }; apictx diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index f324ea787d..28ff712c24 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -24,6 +24,7 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; +use nexus_types::external_api::params::PhysicalDiskPath; use nexus_types::external_api::params::SledSelector; use nexus_types::external_api::params::UninitializedSledId; use nexus_types::external_api::shared::ProbeInfo; @@ -827,6 +828,24 @@ impl NexusInternalApi for NexusInternalApiImpl { .await } + async fn physical_disk_expunge( + rqctx: RequestContext, + disk: TypedBody, + ) -> Result { + let apictx = &rqctx.context().context; + let nexus = &apictx.nexus; + let handler = async { + let opctx = + crate::context::op_context_for_internal_api(&rqctx).await; + nexus.physical_disk_expunge(&opctx, disk.into_inner()).await?; + Ok(HttpResponseUpdatedNoContent()) + }; + apictx + .internal_latencies + .instrument_dropshot_handler(&rqctx, handler) + .await + } + async fn probes_get( rqctx: RequestContext, path_params: Path, diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index 8a230469d5..a8f3989da4 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -340,7 +340,7 @@ impl SledDisk { } /// Filters that apply to disks. -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, ValueEnum)] pub enum DiskFilter { /// All disks All, @@ -355,16 +355,58 @@ impl DiskFilter { policy: PhysicalDiskPolicy, state: PhysicalDiskState, ) -> bool { + policy.matches(self) && state.matches(self) + } +} + +impl PhysicalDiskPolicy { + /// Returns true if self matches the filter + pub fn matches(self, filter: DiskFilter) -> bool { match self { - DiskFilter::All => true, - DiskFilter::InService => match (policy, state) { - (PhysicalDiskPolicy::InService, PhysicalDiskState::Active) => { - true - } - _ => false, + PhysicalDiskPolicy::InService => match filter { + DiskFilter::All => true, + DiskFilter::InService => true, + }, + PhysicalDiskPolicy::Expunged => match filter { + DiskFilter::All => true, + DiskFilter::InService => false, }, } } + + /// Returns all policies matching the given filter. + /// + /// This is meant for database access, and is generally paired with + /// [`PhysicalDiskState::all_matching`]. See `ApplyPhysicalDiskFilterExt` in + /// nexus-db-model. 
+ pub fn all_matching(filter: DiskFilter) -> impl Iterator { + Self::iter().filter(move |state| state.matches(filter)) + } +} + +impl PhysicalDiskState { + /// Returns true if self matches the filter + pub fn matches(self, filter: DiskFilter) -> bool { + match self { + PhysicalDiskState::Active => match filter { + DiskFilter::All => true, + DiskFilter::InService => true, + }, + PhysicalDiskState::Decommissioned => match filter { + DiskFilter::All => true, + DiskFilter::InService => false, + }, + } + } + + /// Returns all state matching the given filter. + /// + /// This is meant for database access, and is generally paired with + /// [`PhysicalDiskPolicy::all_matching`]. See `ApplyPhysicalDiskFilterExt` in + /// nexus-db-model. + pub fn all_matching(filter: DiskFilter) -> impl Iterator { + Self::iter().filter(move |state| state.matches(filter)) + } } /// Filters that apply to zpools. diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 27430c7599..c5fc2c3b56 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -909,6 +909,34 @@ } } }, + "/physical-disk/expunge": { + "post": { + "summary": "Mark a physical disk as expunged", + "description": "This is an irreversible process! It should only be called after sufficient warning to the operator.\nThis is idempotent.", + "operationId": "physical_disk_expunge", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/PhysicalDiskPath" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/probes/{sled}": { "get": { "summary": "Get all the probes associated with a given sled.", @@ -3794,6 +3822,19 @@ "u2" ] }, + "PhysicalDiskPath": { + "type": "object", + "properties": { + "disk_id": { + "description": "ID of the physical disk", + "type": "string", + "format": "uuid" + } + }, + "required": [ + "disk_id" + ] + }, "PhysicalDiskPutRequest": { "type": "object", "properties": { From 86b119537b657c7c93323ca219be8da5116c7a66 Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Mon, 15 Jul 2024 20:14:39 -0400 Subject: [PATCH 23/27] Add a gh-action and buildomat jobs to cargo check on no-default-features and feature-powerset (#6018) Includes: * An xtask which - [X] runs with specific excludes, i.e. 
`image-*` - [X] downloads *pre-built* `cargo-hack` subcommand binary for known platform/os & arch from https://github.com/taiki-e/cargo-hack/releases/ - [X] allows for version-based installation (otherwise) * New CI jobs for checking-features in rust.yml & buildomat * Extends download `xtask` to install `cargo-hack` --- .cargo/config.toml | 1 + .github/buildomat/jobs/check-features.sh | 34 ++++ .github/buildomat/jobs/clippy.sh | 2 +- .github/workflows/rust.yml | 32 +++- README.adoc | 15 ++ dev-tools/xtask/src/check_features.rs | 212 +++++++++++++++++++++++ dev-tools/xtask/src/download.rs | 77 ++++++++ dev-tools/xtask/src/main.rs | 4 + tools/cargo_hack_checksum | 3 + tools/cargo_hack_version | 1 + 10 files changed, 379 insertions(+), 2 deletions(-) create mode 100644 .github/buildomat/jobs/check-features.sh create mode 100644 dev-tools/xtask/src/check_features.rs create mode 100644 tools/cargo_hack_checksum create mode 100644 tools/cargo_hack_version diff --git a/.cargo/config.toml b/.cargo/config.toml index c5b6fcd9d4..209d15c760 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -8,6 +8,7 @@ # CI scripts: # - .github/buildomat/build-and-test.sh # - .github/buildomat/jobs/clippy.sh +# - .github/buildomat/jobs/check-features.sh # - .github/workflows/rust.yml # [build] diff --git a/.github/buildomat/jobs/check-features.sh b/.github/buildomat/jobs/check-features.sh new file mode 100644 index 0000000000..4ba97ec02f --- /dev/null +++ b/.github/buildomat/jobs/check-features.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#: +#: name = "check-features (helios)" +#: variety = "basic" +#: target = "helios-2.0" +#: rust_toolchain = true +#: output_rules = [ +#: "/out/*", +#: ] + +# Run the check-features `xtask` on illumos, testing compilation of feature combinations. + +set -o errexit +set -o pipefail +set -o xtrace + +cargo --version +rustc --version + +# +# Set up our PATH for use with this workspace. +# +source ./env.sh +export PATH="$PATH:$PWD/out/cargo-hack" + +banner prerequisites +ptime -m bash ./tools/install_builder_prerequisites.sh -y + +# +# Check feature combinations with the `cargo xtask check-features` command. +# +banner hack-check +export CARGO_INCREMENTAL=0 +ptime -m timeout 2h cargo xtask check-features --ci diff --git a/.github/buildomat/jobs/clippy.sh b/.github/buildomat/jobs/clippy.sh index 71aa04c907..4040691b72 100755 --- a/.github/buildomat/jobs/clippy.sh +++ b/.github/buildomat/jobs/clippy.sh @@ -10,7 +10,7 @@ # (that we want to check) is conditionally-compiled on illumos only. # # Note that `cargo clippy` includes `cargo check, so this ends up checking all -# of our code. +# of our (default) code. set -o errexit set -o pipefail diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 2ef2783108..94d25e7dfa 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -53,7 +53,7 @@ jobs: run: cargo run --bin omicron-package -- -t default check # Note that `cargo clippy` includes `cargo check, so this ends up checking all - # of our code. + # of our (default) code. 
clippy-lint:
 runs-on: ubuntu-22.04
 env:
 CARGO_INCREMENTAL: 0
 steps:
@@ -82,6 +82,36 @@ jobs:
 - name: Run Clippy Lints
 run: cargo xtask clippy
+ check-features:
+ runs-on: ubuntu-22.04
+ env:
+ CARGO_INCREMENTAL: 0
+ steps:
+ # This repo is unstable and unnecessary: https://github.com/microsoft/linux-package-repositories/issues/34
+ - name: Disable packages.microsoft.com repo
+ run: sudo rm -f /etc/apt/sources.list.d/microsoft-prod.list
+ - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ ref: ${{ github.event.pull_request.head.sha }} # see omicron#4461
+ - uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # v2.7.3
+ if: ${{ github.ref != 'refs/heads/main' }}
+ - name: Report cargo version
+ run: cargo --version
+ - name: Update PATH
+ run: |
+ set -x
+ export PATH="./out/cargo-hack:$PATH"
+ source "./env.sh"; echo "PATH=$PATH" >> "$GITHUB_ENV"
+ - name: Print PATH
+ run: echo $PATH
+ - name: Print GITHUB_ENV
+ run: cat "$GITHUB_ENV"
+ - name: Install Pre-Requisites
+ run: ./tools/install_builder_prerequisites.sh -y
+ - name: Run Check on Feature Combinations (Feature-Powerset, No-Dev-Deps)
+ timeout-minutes: 120 # 2 hours
+ run: cargo xtask check-features --ci
+
 # This is just a test build of docs. Publicly available docs are built via
 # the separate "rustdocs" repo.
 build-docs:
diff --git a/README.adoc b/README.adoc
index 1ef4bd8601..4979411d73 100644
--- a/README.adoc
+++ b/README.adoc
@@ -112,6 +112,21 @@ cargo nextest run
 We check that certain system library dependencies are not leaked outside of their intended binaries via `cargo xtask verify-libraries` in CI. If you are adding a new dependency on a illumos/helios library it is recommended that you update xref:.cargo/xtask.toml[] with an allow list of where you expect the dependency to show up. For example some libraries such as `libnvme.so.1` are only available in the global zone and therefore will not be present in any other zone. This check is here to help us catch any leakage before we go to deploy on a rack. You can inspect a compiled binary in the target directory for what it requires by using `elfedit` - for example `elfedit -r -e 'dyn:tag NEEDED' /path/to/omicron/target/debug/sled-agent`.
+=== Checking feature flag combinations
+
+To ensure that varying combinations of features compile, run `cargo xtask check-features`, which executes the https://github.com/taiki-e/cargo-hack[`cargo hack`] subcommand under the hood.
+
+This `xtask` is run in CI using the `--ci` parameter, which automatically excludes certain `image-*` features that purposefully cause compiler errors if set and uses a pre-built binary.
+
+If `cargo hack` is not already installed in omicron's `out/` directory, a pre-built binary will be installed automatically depending on your operating system and architecture.
+
+To limit the max number of simultaneous feature flags combined for checking, run the `xtask` with the `--depth ` flag:
+
+[source,text]
+----
+$ cargo xtask check-features --depth 2
+----
+
 === Rust packages in Omicron
 NOTE: The term "package" is overloaded: most programming languages and operating systems have their own definitions of a package. On top of that, Omicron bundles up components into our own kind of "package" that gets delivered via the install and update systems. These are described in the `package-manifest.toml` file in the root of the repo. In this section, we're just concerned with Rust packages.
diff --git a/dev-tools/xtask/src/check_features.rs b/dev-tools/xtask/src/check_features.rs
new file mode 100644
index 0000000000..a9dbc2bff7
--- /dev/null
+++ b/dev-tools/xtask/src/check_features.rs
@@ -0,0 +1,212 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Subcommand: cargo xtask check-features
+
+use anyhow::{bail, Result};
+use camino::Utf8PathBuf;
+use clap::Parser;
+use std::{collections::HashSet, process::Command};
+
+const SUPPORTED_ARCHITECTURES: [&str; 1] = ["x86_64"];
+const CI_EXCLUDED_FEATURES: [&str; 2] = ["image-trampoline", "image-standard"];
+
+#[derive(Parser)]
+pub struct Args {
+    /// Run in CI mode, with a default set of features excluded.
+    #[clap(long, default_value_t = false)]
+    ci: bool,
+    /// Features to exclude from the check.
+    #[clap(long, value_name = "FEATURES")]
+    exclude_features: Option<Vec<String>>,
+    /// Depth of the feature powerset to check.
+    #[clap(long, value_name = "NUM")]
+    depth: Option<usize>,
+    /// Error format passed to `cargo hack check`.
+    #[clap(long, value_name = "FMT")]
+    message_format: Option<String>,
+    /// Version of `cargo-hack` to install. By default, we download a pre-built
+    /// version.
+    #[clap(long, value_name = "VERSION")]
+    install_version: Option<String>,
+}
+
+/// Run `cargo hack check`.
+pub fn run_cmd(args: Args) -> Result<()> {
+    // We cannot specify both `--ci` and `--install-version`, as the former
+    // implies we are using a pre-built version.
+    if args.ci && args.install_version.is_some() {
+        bail!("cannot specify --ci and --install-version together");
+    }
+
+    let cargo =
+        std::env::var("CARGO").unwrap_or_else(|_| String::from("cargo"));
+
+    let mut command = Command::new(&cargo);
+
+    // Add the `hack check` subcommand.
+    command.args(&["hack", "check"]);
+
+    if args.ci {
+        install_prebuilt_cargo_hack(&cargo)?;
+
+        let ex = if let Some(mut features) = args.exclude_features {
+            // Extend the list of features to exclude with the CI defaults.
+            features.extend(
+                CI_EXCLUDED_FEATURES.into_iter().map(|s| s.to_string()),
+            );
+
+            // Remove duplicates.
+            let excludes = features.into_iter().collect::<HashSet<_>>();
+
+            excludes.into_iter().collect::<Vec<_>>().join(",")
+        } else {
+            CI_EXCLUDED_FEATURES.join(",")
+        };
+
+        // Add the `--exclude-features` flag if we are running in CI mode.
+        command.args(["--exclude-features", &ex]);
+    } else {
+        install_cargo_hack(&cargo, args.install_version)?;
+        // Add "only" the `--exclude-features` flag if it was provided.
+        if let Some(features) = args.exclude_features {
+            command.args(["--exclude-features", &features.join(",")]);
+        }
+    }
+
+    if let Some(depth) = args.depth {
+        command.args(&["--depth", &depth.to_string()]);
+    }
+
+    // Pass along the `--message-format` flag if it was provided.
+    if let Some(fmt) = args.message_format {
+        command.args(["--message-format", &fmt]);
+    }
+
+    command
+        // Make sure we check everything.
+        .arg("--workspace")
+        // We want to check the binaries.
+        .arg("--bins")
+        // We want to check the feature powerset.
+        .arg("--feature-powerset")
+        // We will not check the dev-dependencies, which should be covered by tests.
+        .arg("--no-dev-deps");
+
+    exec(command)
+}
+
+/// The supported operating systems.
+enum Os {
+    Illumos,
+    Linux,
+    Mac,
+}
+
+/// Get the current OS.
+fn os_name() -> Result<Os> {
+    let os = match std::env::consts::OS {
+        "linux" => Os::Linux,
+        "macos" => Os::Mac,
+        "solaris" | "illumos" => Os::Illumos,
+        other => bail!("OS not supported: {other}"),
+    };
+    Ok(os)
+}
+
+/// This is a workaround for the lack of a CARGO_WORKSPACE_DIR environment
+/// variable, as suggested in .
+/// A better workaround might be to set this in the `[env]` section of
+/// `.cargo/config.toml`.
+fn project_root() -> Utf8PathBuf {
+    Utf8PathBuf::from(&concat!(env!("CARGO_MANIFEST_DIR"), "/.."))
+}
+
+/// Get the path to the `out` directory from the project root/workspace
+/// directory.
+fn out_dir() -> Utf8PathBuf {
+    project_root().join("out/cargo-hack")
+}
+
+/// Install `cargo-hack` if the `install-version` was specified; otherwise,
+/// download a pre-built version if it's not already in our `out` directory.
+fn install_cargo_hack(cargo: &str, version: Option<String>) -> Result<()> {
+    if let Some(version) = version {
+        let mut command = Command::new(cargo);
+
+        eprintln!(
+            "installing cargo-hack at version {} to {}",
+            version,
+            env!("CARGO_HOME")
+        );
+        command.args(&["install", "cargo-hack", "--version", &version]);
+        exec(command)
+    } else if !out_dir().exists() {
+        install_prebuilt_cargo_hack(cargo)
+    } else {
+        let out_dir = out_dir();
+        eprintln!("cargo-hack found in {}", out_dir);
+        Ok(())
+    }
+}
+
+/// Download a pre-built version of `cargo-hack` to the `out` directory via the
+/// download `xtask`.
+fn install_prebuilt_cargo_hack(cargo: &str) -> Result<()> {
+    let mut command = Command::new(cargo);
+
+    let out_dir = out_dir();
+    eprintln!(
+        "cargo-hack not found in {}, downloading a pre-built version",
+        out_dir
+    );
+
+    let os = os_name()?;
+    match os {
+        Os::Illumos | Os::Linux | Os::Mac
+            if SUPPORTED_ARCHITECTURES.contains(&std::env::consts::ARCH) =>
+        {
+            // Download the pre-built version of `cargo-hack` via our
+            // download `xtask`.
+            command.args(&["xtask", "download", "cargo-hack"]);
+        }
+        _ => {
+            bail!(
+                "cargo-hack is not pre-built for this os {} / arch {}",
+                std::env::consts::OS,
+                std::env::consts::ARCH
+            );
+        }
+    }
+
+    exec(command)
+}
+
+/// Execute the command and check the exit status.
+fn exec(mut command: Command) -> Result<()> {
+    let cargo =
+        std::env::var("CARGO").unwrap_or_else(|_| String::from("cargo"));
+
+    eprintln!(
+        "running: {:?} {}",
+        &cargo,
+        command
+            .get_args()
+            .map(|arg| format!("{:?}", arg.to_str().unwrap()))
+            .collect::<Vec<_>>()
+            .join(" ")
+    );
+
+    let exit_status = command
+        .spawn()
+        .expect("failed to spawn child process")
+        .wait()
+        .expect("failed to wait for child process");
+
+    if !exit_status.success() {
+        bail!("cargo-hack install failed: {}", exit_status);
+    }
+
+    Ok(())
+}
diff --git a/dev-tools/xtask/src/download.rs b/dev-tools/xtask/src/download.rs
index 2790a638a7..37c9b7be8a 100644
--- a/dev-tools/xtask/src/download.rs
+++ b/dev-tools/xtask/src/download.rs
@@ -17,6 +17,7 @@ use std::io::Write;
 use std::os::unix::fs::PermissionsExt;
 use std::sync::OnceLock;
 use std::time::Duration;
+use strum::Display;
 use strum::EnumIter;
 use strum::IntoEnumIterator;
 use tar::Archive;
@@ -25,6 +26,9 @@ use tokio::process::Command;
 const BUILDOMAT_URL: &'static str =
     "https://buildomat.eng.oxide.computer/public/file";
 
+const CARGO_HACK_URL: &'static str =
+    "https://github.com/taiki-e/cargo-hack/releases/download";
+
 const RETRY_ATTEMPTS: usize = 3;
 
 /// What is being downloaded?
@@ -44,6 +48,9 @@ enum Target { /// Download all targets All, + /// `cargo hack` binary + CargoHack, + /// Clickhouse binary Clickhouse, @@ -124,6 +131,7 @@ pub async fn run_cmd(args: DownloadArgs) -> Result<()> { Target::All => { bail!("We should have already filtered this 'All' target out?"); } + Target::CargoHack => downloader.download_cargo_hack().await, Target::Clickhouse => downloader.download_clickhouse().await, Target::Cockroach => downloader.download_cockroach().await, Target::Console => downloader.download_console().await, @@ -151,12 +159,19 @@ pub async fn run_cmd(args: DownloadArgs) -> Result<()> { Ok(()) } +#[derive(Display)] enum Os { Illumos, Linux, Mac, } +#[derive(Display)] +enum Arch { + X86_64, + Aarch64, +} + impl Os { fn env_name(&self) -> &'static str { match self { @@ -177,6 +192,15 @@ fn os_name() -> Result { Ok(os) } +fn arch() -> Result { + let arch = match std::env::consts::ARCH { + "x86_64" => Arch::X86_64, + "aarch64" => Arch::Aarch64, + other => bail!("Architecture not supported: {other}"), + }; + Ok(arch) +} + struct Downloader<'a> { log: Logger, @@ -432,6 +456,59 @@ async fn download_file_and_verify( } impl<'a> Downloader<'a> { + async fn download_cargo_hack(&self) -> Result<()> { + let os = os_name()?; + let arch = arch()?; + + let download_dir = self.output_dir.join("downloads"); + let destination_dir = self.output_dir.join("cargo-hack"); + + let checksums_path = self.versions_dir.join("cargo_hack_checksum"); + let [checksum] = get_values_from_file( + [&format!("CIDL_SHA256_{}", os.env_name())], + &checksums_path, + ) + .await?; + + let versions_path = self.versions_dir.join("cargo_hack_version"); + let version = tokio::fs::read_to_string(&versions_path) + .await + .context("Failed to read version from {versions_path}")?; + let version = version.trim(); + + let (platform, supported_arch) = match (os, arch) { + (Os::Illumos, Arch::X86_64) => ("unknown-illumos", "x86_64"), + (Os::Linux, Arch::X86_64) => ("unknown-linux-gnu", "x86_64"), + (Os::Linux, Arch::Aarch64) => ("unknown-linux-gnu", "aarch64"), + (Os::Mac, Arch::X86_64) => ("apple-darwin", "x86_64"), + (Os::Mac, Arch::Aarch64) => ("apple-darwin", "aarch64"), + (os, arch) => bail!("Unsupported OS/arch: {os}/{arch}"), + }; + + let tarball_filename = + format!("cargo-hack-{supported_arch}-{platform}.tar.gz"); + let tarball_url = + format!("{CARGO_HACK_URL}/v{version}/{tarball_filename}"); + + let tarball_path = download_dir.join(&tarball_filename); + + tokio::fs::create_dir_all(&download_dir).await?; + tokio::fs::create_dir_all(&destination_dir).await?; + + download_file_and_verify( + &self.log, + &tarball_path, + &tarball_url, + ChecksumAlgorithm::Sha2, + &checksum, + ) + .await?; + + unpack_tarball(&self.log, &tarball_path, &destination_dir).await?; + + Ok(()) + } + async fn download_clickhouse(&self) -> Result<()> { let os = os_name()?; diff --git a/dev-tools/xtask/src/main.rs b/dev-tools/xtask/src/main.rs index d0a61272a9..0ea2332c31 100644 --- a/dev-tools/xtask/src/main.rs +++ b/dev-tools/xtask/src/main.rs @@ -10,6 +10,7 @@ use anyhow::{Context, Result}; use cargo_metadata::Metadata; use clap::{Parser, Subcommand}; +mod check_features; mod check_workspace_deps; mod clippy; mod download; @@ -38,6 +39,8 @@ enum Cmds { /// Run Argon2 hash with specific parameters (quick performance check) Argon2(external::External), + /// Check that all features are flagged correctly + CheckFeatures(check_features::Args), /// Check that dependencies are not duplicated in any packages in the /// workspace 
CheckWorkspaceDeps, @@ -91,6 +94,7 @@ async fn main() -> Result<()> { external.cargo_args(["--release"]).exec_example("argon2") } Cmds::Clippy(args) => clippy::run_cmd(args), + Cmds::CheckFeatures(args) => check_features::run_cmd(args), Cmds::CheckWorkspaceDeps => check_workspace_deps::run_cmd(), Cmds::Download(args) => download::run_cmd(args).await, Cmds::Openapi(external) => external.exec_bin("openapi-manager"), diff --git a/tools/cargo_hack_checksum b/tools/cargo_hack_checksum new file mode 100644 index 0000000000..12ed33c12e --- /dev/null +++ b/tools/cargo_hack_checksum @@ -0,0 +1,3 @@ +CIDL_SHA256_DARWIN="ee00750378126c7e14402a45c34f95ed1ba4be2ae505b0c0020bb39b5b3467a4" +CIDL_SHA256_ILLUMOS="f80d281343368bf7a027e2a7e94ae98a19e085c0666bff8d15264f39b42997bc" +CIDL_SHA256_LINUX="ffecd932fc7569975eb77d70f2e299f07b57220868bedeb5867062a4a95a0376" diff --git a/tools/cargo_hack_version b/tools/cargo_hack_version new file mode 100644 index 0000000000..cb180fda59 --- /dev/null +++ b/tools/cargo_hack_version @@ -0,0 +1 @@ +0.6.29 From b214e28e0074328d328ae46c30e9db1fd600025c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 15 Jul 2024 18:36:25 -0700 Subject: [PATCH 24/27] move saga recovery to a background task (#6063) --- Cargo.lock | 30 + Cargo.toml | 3 + dev-tools/omdb/Cargo.toml | 1 + dev-tools/omdb/src/bin/omdb/nexus.rs | 131 +++ dev-tools/omdb/tests/env.out | 12 + dev-tools/omdb/tests/successes.out | 25 + nexus-config/src/nexus_config.rs | 15 + nexus/Cargo.toml | 1 + nexus/auth/src/context.rs | 19 + nexus/db-model/src/lib.rs | 1 + nexus/db-queries/src/db/datastore/saga.rs | 267 +++++ nexus/db-queries/src/db/mod.rs | 5 +- nexus/db-queries/src/db/saga_recovery.rs | 805 --------------- nexus/examples/config-second.toml | 1 + nexus/examples/config.toml | 1 + nexus/saga-recovery/Cargo.toml | 41 + nexus/saga-recovery/build.rs | 10 + nexus/saga-recovery/src/lib.rs | 682 +++++++++++++ nexus/saga-recovery/src/recovery.rs | 705 +++++++++++++ nexus/saga-recovery/src/status.rs | 175 ++++ nexus/src/app/background/driver.rs | 4 + nexus/src/app/background/init.rs | 66 +- nexus/src/app/background/mod.rs | 2 + nexus/src/app/background/tasks/mod.rs | 1 + .../src/app/background/tasks/saga_recovery.rs | 927 ++++++++++++++++++ nexus/src/app/mod.rs | 77 +- nexus/src/app/saga.rs | 18 +- nexus/src/app/sagas/mod.rs | 2 +- nexus/src/saga_interface.rs | 2 +- nexus/tests/config.test.toml | 1 + smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 32 files changed, 3180 insertions(+), 852 deletions(-) delete mode 100644 nexus/db-queries/src/db/saga_recovery.rs create mode 100644 nexus/saga-recovery/Cargo.toml create mode 100644 nexus/saga-recovery/build.rs create mode 100644 nexus/saga-recovery/src/lib.rs create mode 100644 nexus/saga-recovery/src/recovery.rs create mode 100644 nexus/saga-recovery/src/status.rs create mode 100644 nexus/src/app/background/tasks/saga_recovery.rs diff --git a/Cargo.lock b/Cargo.lock index 686b0a0b71..22e647c69c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4913,6 +4913,34 @@ dependencies = [ "slog-error-chain", ] +[[package]] +name = "nexus-saga-recovery" +version = "0.1.0" +dependencies = [ + "chrono", + "futures", + "nexus-auth", + "nexus-db-model", + "nexus-db-queries", + "nexus-test-utils", + "nexus-test-utils-macros", + "nexus-types", + "omicron-common", + "omicron-rpaths", + "omicron-test-utils", + "omicron-workspace-hack", + "once_cell", + "pq-sys", + "pretty_assertions", + "serde", + "serde_json", + "slog", + "slog-error-chain", + 
"steno", + "tokio", + "uuid", +] + [[package]] name = "nexus-test-interface" version = "0.1.0" @@ -5553,6 +5581,7 @@ dependencies = [ "nexus-reconfigurator-execution", "nexus-reconfigurator-planning", "nexus-reconfigurator-preparation", + "nexus-saga-recovery", "nexus-test-interface", "nexus-test-utils", "nexus-test-utils-macros", @@ -5656,6 +5685,7 @@ dependencies = [ "nexus-db-model", "nexus-db-queries", "nexus-reconfigurator-preparation", + "nexus-saga-recovery", "nexus-test-utils", "nexus-test-utils-macros", "nexus-types", diff --git a/Cargo.toml b/Cargo.toml index 379aa7f549..96f962708a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,6 +56,7 @@ members = [ "nexus/reconfigurator/execution", "nexus/reconfigurator/planning", "nexus/reconfigurator/preparation", + "nexus/saga-recovery", "nexus/test-interface", "nexus/test-utils-macros", "nexus/test-utils", @@ -151,6 +152,7 @@ default-members = [ "nexus/reconfigurator/execution", "nexus/reconfigurator/planning", "nexus/reconfigurator/preparation", + "nexus/saga-recovery", "nexus/test-interface", "nexus/test-utils-macros", "nexus/test-utils", @@ -359,6 +361,7 @@ nexus-networking = { path = "nexus/networking" } nexus-reconfigurator-execution = { path = "nexus/reconfigurator/execution" } nexus-reconfigurator-planning = { path = "nexus/reconfigurator/planning" } nexus-reconfigurator-preparation = { path = "nexus/reconfigurator/preparation" } +nexus-saga-recovery = { path = "nexus/saga-recovery" } omicron-certificates = { path = "certificates" } omicron-passwords = { path = "passwords" } omicron-workspace-hack = "0.1.0" diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index e5d898509c..0990fdb11c 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -33,6 +33,7 @@ nexus-config.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true nexus-reconfigurator-preparation.workspace = true +nexus-saga-recovery.workspace = true nexus-types.workspace = true omicron-common.workspace = true omicron-uuid-kinds.workspace = true diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index f699466505..593e6c1645 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -28,6 +28,7 @@ use nexus_client::types::PhysicalDiskPath; use nexus_client::types::SledSelector; use nexus_client::types::UninitializedSledId; use nexus_db_queries::db::lookup::LookupPath; +use nexus_saga_recovery::LastPass; use nexus_types::deployment::Blueprint; use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; @@ -1105,6 +1106,136 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } } }; + } else if name == "saga_recovery" { + match serde_json::from_value::( + details.clone(), + ) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + + Ok(report) => { + println!(" since Nexus started:"); + println!( + " sagas recovered: {:3}", + report.ntotal_recovered + ); + println!( + " sagas recovery errors: {:3}", + report.ntotal_failures, + ); + println!( + " sagas observed started: {:3}", + report.ntotal_started + ); + println!( + " sagas inferred finished: {:3}", + report.ntotal_finished + ); + println!( + " missing from SEC: {:3}", + report.ntotal_sec_errors_missing, + ); + println!( + " bad state in SEC: {:3}", + report.ntotal_sec_errors_bad_state, + ); + match report.last_pass { + 
LastPass::NeverStarted => { + println!(" never run"); + } + LastPass::Failed { message } => { + println!(" last pass FAILED: {}", message); + } + LastPass::Success(success) => { + println!(" last pass:"); + println!( + " found sagas: {:3} \ + (in-progress, assigned to this Nexus)", + success.nfound + ); + println!( + " recovered: {:3} (successfully)", + success.nrecovered + ); + println!(" failed: {:3}", success.nfailed); + println!( + " skipped: {:3} (already running)", + success.nskipped + ); + println!( + " removed: {:3} (newly finished)", + success.nskipped + ); + } + }; + + if report.recent_recoveries.is_empty() { + println!(" no recovered sagas"); + } else { + println!( + " recently recovered sagas ({}):", + report.recent_recoveries.len() + ); + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SagaRow { + time: String, + saga_id: String, + } + let table_rows = + report.recent_recoveries.iter().map(|r| SagaRow { + time: r + .time + .to_rfc3339_opts(SecondsFormat::Secs, true), + saga_id: r.saga_id.to_string(), + }); + let table = tabled::Table::new(table_rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!( + "{}", + textwrap::indent(&table.to_string(), " ") + ); + } + + if report.recent_failures.is_empty() { + println!(" no saga recovery failures"); + } else { + println!( + " recent sagas recovery failures ({}):", + report.recent_failures.len() + ); + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SagaRow<'a> { + time: String, + saga_id: String, + message: &'a str, + } + let table_rows = + report.recent_failures.iter().map(|r| SagaRow { + time: r + .time + .to_rfc3339_opts(SecondsFormat::Secs, true), + saga_id: r.saga_id.to_string(), + message: &r.message, + }); + let table = tabled::Table::new(table_rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(0, 1, 0, 0)) + .to_string(); + println!( + "{}", + textwrap::indent(&table.to_string(), " ") + ); + } + } + } } else if name == "lookup_region_port" { match serde_json::from_value::(details.clone()) { diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 66a48ab394..75acc5c584 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -118,6 +118,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "saga_recovery" + recovers sagas assigned to this Nexus + + task: "service_firewall_rule_propagation" propagates VPC firewall rules for Omicron services with external network connectivity @@ -254,6 +258,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "saga_recovery" + recovers sagas assigned to this Nexus + + task: "service_firewall_rule_propagation" propagates VPC firewall rules for Omicron services with external network connectivity @@ -377,6 +385,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: "saga_recovery" + recovers sagas assigned to this Nexus + + task: "service_firewall_rule_propagation" propagates VPC firewall rules for Omicron services with external network connectivity diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index a65098d7aa..982a9d8403 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -319,6 +319,10 @@ task: "region_replacement_driver" drive region replacements forward to completion +task: 
"saga_recovery" + recovers sagas assigned to this Nexus + + task: "service_firewall_rule_propagation" propagates VPC firewall rules for Omicron services with external network connectivity @@ -534,6 +538,27 @@ task: "region_replacement_driver" number of region replacement finish sagas started ok: 0 number of errors: 0 +task: "saga_recovery" + configured period: every 10m + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + since Nexus started: + sagas recovered: 0 + sagas recovery errors: 0 + sagas observed started: 0 + sagas inferred finished: 0 + missing from SEC: 0 + bad state in SEC: 0 + last pass: + found sagas: 0 (in-progress, assigned to this Nexus) + recovered: 0 (successfully) + failed: 0 + skipped: 0 (already running) + removed: 0 (newly finished) + no recovered sagas + no saga recovery failures + task: "service_firewall_rule_propagation" configured period: every 5m currently executing: no diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 4bdee4ab4e..3bc3a36126 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -383,6 +383,8 @@ pub struct BackgroundTaskConfig { pub v2p_mapping_propagation: V2PMappingPropagationConfig, /// configuration for abandoned VMM reaper task pub abandoned_vmm_reaper: AbandonedVmmReaperConfig, + /// configuration for saga recovery task + pub saga_recovery: SagaRecoveryConfig, /// configuration for lookup region port task pub lookup_region_port: LookupRegionPortConfig, } @@ -566,6 +568,14 @@ pub struct AbandonedVmmReaperConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct SagaRecoveryConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct RegionReplacementDriverConfig { @@ -824,6 +834,7 @@ mod test { service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 + saga_recovery.period_secs = 60 lookup_region_port.period_secs = 60 [default_region_allocation_strategy] type = "random" @@ -972,6 +983,9 @@ mod test { abandoned_vmm_reaper: AbandonedVmmReaperConfig { period_secs: Duration::from_secs(60), }, + saga_recovery: SagaRecoveryConfig { + period_secs: Duration::from_secs(60), + }, lookup_region_port: LookupRegionPortConfig { period_secs: Duration::from_secs(60), }, @@ -1047,6 +1061,7 @@ mod test { service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 + saga_recovery.period_secs = 60 lookup_region_port.period_secs = 60 [default_region_allocation_strategy] type = "random" diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 359ea616d4..8d256aad5a 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -47,6 +47,7 @@ nexus-client.workspace = true nexus-config.workspace = true nexus-internal-api.workspace = true nexus-networking.workspace = true +nexus-saga-recovery.workspace = true nexus-test-interface.workspace = true num-integer.workspace = true once_cell.workspace = true diff --git a/nexus/auth/src/context.rs b/nexus/auth/src/context.rs index 0aac0900c5..161ce6493b 100644 --- a/nexus/auth/src/context.rs +++ b/nexus/auth/src/context.rs @@ -236,6 +236,25 @@ impl OpContext { } } + /// Creates a new `OpContext` just like 
the given one, but with a different + /// identity. + /// + /// This is only intended for tests. + pub fn child_with_authn(&self, authn: authn::Context) -> OpContext { + let created_instant = Instant::now(); + let created_walltime = SystemTime::now(); + + OpContext { + log: self.log.clone(), + authn: Arc::new(authn), + authz: self.authz.clone(), + created_instant, + created_walltime, + metadata: self.metadata.clone(), + kind: self.kind, + } + } + /// Check whether the actor performing this request is authorized for /// `action` on `resource`. pub async fn authorize( diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 30dc82965d..f28f886f6c 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -176,6 +176,7 @@ pub use region_replacement_step::*; pub use region_snapshot::*; pub use role_assignment::*; pub use role_builtin::*; +pub use saga_types::*; pub use schema_versions::*; pub use semver_version::*; pub use service_kind::*; diff --git a/nexus/db-queries/src/db/datastore/saga.rs b/nexus/db-queries/src/db/datastore/saga.rs index c42d14d0d7..e632bce881 100644 --- a/nexus/db-queries/src/db/datastore/saga.rs +++ b/nexus/db-queries/src/db/datastore/saga.rs @@ -5,14 +5,19 @@ //! [`DataStore`] methods on [`db::saga_types::Saga`]s. use super::DataStore; +use super::SQL_BATCH_SIZE; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::Generation; +use crate::db::pagination::paginated; +use crate::db::pagination::paginated_multicolumn; +use crate::db::pagination::Paginator; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use nexus_auth::context::OpContext; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; @@ -99,4 +104,266 @@ impl DataStore { )), } } + + /// Returns a list of unfinished sagas assigned to SEC `sec_id`, making as + /// many queries as needed (in batches) to get them all + pub async fn saga_list_recovery_candidates_batched( + &self, + opctx: &OpContext, + sec_id: db::saga_types::SecId, + ) -> Result, Error> { + let mut sagas = vec![]; + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = self.pool_connection_authorized(opctx).await?; + while let Some(p) = paginator.next() { + use db::schema::saga::dsl; + + let mut batch = + paginated(dsl::saga, dsl::id, &p.current_pagparams()) + .filter(dsl::saga_state.ne( + db::saga_types::SagaCachedState( + steno::SagaCachedState::Done, + ), + )) + .filter(dsl::current_sec.eq(sec_id)) + .select(db::saga_types::Saga::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + + paginator = p.found_batch(&batch, &|row| row.id); + sagas.append(&mut batch); + } + Ok(sagas) + } + + /// Returns a list of all saga log entries for the given saga, making as + /// many queries as needed (in batches) to get them all + pub async fn saga_fetch_log_batched( + &self, + opctx: &OpContext, + saga_id: db::saga_types::SagaId, + ) -> Result, Error> { + let mut events = vec![]; + let mut paginator = Paginator::new(SQL_BATCH_SIZE); + let conn = self.pool_connection_authorized(opctx).await?; + while let Some(p) = paginator.next() { + use db::schema::saga_node_event::dsl; + let batch = paginated_multicolumn( + dsl::saga_node_event, + (dsl::node_id, dsl::event_type), + &p.current_pagparams(), + ) 
+ .filter(dsl::saga_id.eq(saga_id)) + .select(db::saga_types::SagaNodeEvent::as_select()) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + paginator = p.found_batch(&batch, &|row| { + (row.node_id, row.event_type.clone()) + }); + + let mut batch = batch + .into_iter() + .map(|event| steno::SagaNodeEvent::try_from(event)) + .collect::, Error>>()?; + + events.append(&mut batch); + } + + Ok(events) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::db::datastore::test_utils::datastore_test; + use nexus_test_utils::db::test_setup_database; + use omicron_test_utils::dev; + use rand::seq::SliceRandom; + use uuid::Uuid; + + // Tests pagination in listing sagas that are candidates for recovery + #[tokio::test] + async fn test_list_candidate_sagas() { + // Test setup + let logctx = dev::test_setup_log("test_list_candidate_sagas"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let sec_id = db::SecId(uuid::Uuid::new_v4()); + + // Create a couple batches of sagas. + let new_running_db_saga = || { + let params = steno::SagaCreateParams { + id: steno::SagaId(Uuid::new_v4()), + name: steno::SagaName::new("test saga"), + dag: serde_json::value::Value::Null, + state: steno::SagaCachedState::Running, + }; + + db::model::saga_types::Saga::new(sec_id, params) + }; + let mut inserted_sagas = (0..SQL_BATCH_SIZE.get() * 2) + .map(|_| new_running_db_saga()) + .collect::>(); + + // Shuffle these sagas into a random order to check that the pagination + // order is working as intended on the read path, which we'll do later + // in this test. + inserted_sagas.shuffle(&mut rand::thread_rng()); + + // Insert the batches of unfinished sagas into the database + let conn = datastore + .pool_connection_unauthorized() + .await + .expect("Failed to access db connection"); + diesel::insert_into(db::schema::saga::dsl::saga) + .values(inserted_sagas.clone()) + .execute_async(&*conn) + .await + .expect("Failed to insert test setup data"); + + // List them, expect to see them all in order by ID. + let mut observed_sagas = datastore + .saga_list_recovery_candidates_batched(&opctx, sec_id) + .await + .expect("Failed to list unfinished sagas"); + inserted_sagas.sort_by_key(|a| a.id); + + // Timestamps can change slightly when we insert them. + // + // Sanitize them to make input/output equality checks easier. 
+ let sanitize_timestamps = |sagas: &mut Vec| { + for saga in sagas { + saga.time_created = chrono::DateTime::UNIX_EPOCH; + saga.adopt_time = chrono::DateTime::UNIX_EPOCH; + } + }; + sanitize_timestamps(&mut observed_sagas); + sanitize_timestamps(&mut inserted_sagas); + + assert_eq!( + inserted_sagas, observed_sagas, + "Observed sagas did not match inserted sagas" + ); + + // Test cleanup + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // Tests pagination in loading a saga log + #[tokio::test] + async fn test_list_unfinished_nodes() { + // Test setup + let logctx = dev::test_setup_log("test_list_unfinished_nodes"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let sec_id = db::SecId(uuid::Uuid::new_v4()); + let saga_id = steno::SagaId(Uuid::new_v4()); + + // Create a couple batches of saga events + let new_db_saga_nodes = + |node_id: u32, event_type: steno::SagaNodeEventType| { + let event = steno::SagaNodeEvent { + saga_id, + node_id: steno::SagaNodeId::from(node_id), + event_type, + }; + + db::model::saga_types::SagaNodeEvent::new(event, sec_id) + }; + let mut inserted_nodes = (0..SQL_BATCH_SIZE.get() * 2) + .flat_map(|i| { + // This isn't an exhaustive list of event types, but gives us a + // few options to pick from. Since this is a pagination key, + // it's important to include a variety here. + use steno::SagaNodeEventType::*; + [ + new_db_saga_nodes(i, Started), + new_db_saga_nodes(i, UndoStarted), + new_db_saga_nodes(i, UndoFinished), + ] + }) + .collect::>(); + + // Shuffle these nodes into a random order to check that the pagination + // order is working as intended on the read path, which we'll do later + // in this test. + inserted_nodes.shuffle(&mut rand::thread_rng()); + + // Insert them into the database + let conn = datastore + .pool_connection_unauthorized() + .await + .expect("Failed to access db connection"); + diesel::insert_into(db::schema::saga_node_event::dsl::saga_node_event) + .values(inserted_nodes.clone()) + .execute_async(&*conn) + .await + .expect("Failed to insert test setup data"); + + // List them, expect to see them all in order by ID. + let observed_nodes = datastore + .saga_fetch_log_batched( + &opctx, + nexus_db_model::saga_types::SagaId::from(saga_id), + ) + .await + .expect("Failed to list nodes of unfinished saga"); + inserted_nodes.sort_by_key(|a| (a.node_id, a.event_type.clone())); + + let inserted_nodes = inserted_nodes + .into_iter() + .map(|node| steno::SagaNodeEvent::try_from(node)) + .collect::, _>>() + .expect("Couldn't convert DB nodes to steno nodes"); + + // The steno::SagaNodeEvent type doesn't implement PartialEq, so we need + // to do this a little manually. 
+ assert_eq!(inserted_nodes.len(), observed_nodes.len()); + for (inserted, observed) in + inserted_nodes.iter().zip(observed_nodes.iter()) + { + assert_eq!(inserted.saga_id, observed.saga_id); + assert_eq!(inserted.node_id, observed.node_id); + assert_eq!( + inserted.event_type.label(), + observed.event_type.label() + ); + } + + // Test cleanup + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // Tests the special case of listing an empty saga log + #[tokio::test] + async fn test_list_no_unfinished_nodes() { + // Test setup + let logctx = dev::test_setup_log("test_list_no_unfinished_nodes"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let saga_id = steno::SagaId(Uuid::new_v4()); + + // Test that this returns "no nodes" rather than throwing some "not + // found" error. + let observed_nodes = datastore + .saga_fetch_log_batched( + &opctx, + nexus_db_model::saga_types::SagaId::from(saga_id), + ) + .await + .expect("Failed to list nodes of unfinished saga"); + assert_eq!(observed_nodes.len(), 0); + + // Test cleanup + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/mod.rs b/nexus/db-queries/src/db/mod.rs index c8c8860901..fc44a2f27b 100644 --- a/nexus/db-queries/src/db/mod.rs +++ b/nexus/db-queries/src/db/mod.rs @@ -27,7 +27,6 @@ mod pool_connection; // sagas. pub mod queries; mod raw_query_builder; -mod saga_recovery; mod sec_store; pub(crate) mod true_or_cast_error; mod update_and_check; @@ -37,8 +36,7 @@ mod update_and_check; // full table scans the same way pooled connections do. pub use pool_connection::DISALLOW_FULL_TABLE_SCAN_SQL; -#[cfg(test)] -mod test_utils; +pub mod test_utils; pub use nexus_db_fixed_data as fixed_data; pub use nexus_db_model as model; @@ -50,7 +48,6 @@ pub use config::Config; pub use datastore::DataStore; pub use on_conflict_ext::IncompleteOnConflictExt; pub use pool::{DbConnection, Pool}; -pub use saga_recovery::{recover, CompletionTask, RecoveryTask}; pub use saga_types::SecId; pub use sec_store::CockroachDbSecStore; diff --git a/nexus/db-queries/src/db/saga_recovery.rs b/nexus/db-queries/src/db/saga_recovery.rs deleted file mode 100644 index e85011f60f..0000000000 --- a/nexus/db-queries/src/db/saga_recovery.rs +++ /dev/null @@ -1,805 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Handles recovery of sagas - -use crate::context::OpContext; -use crate::db; -use crate::db::datastore::SQL_BATCH_SIZE; -use crate::db::error::public_error_from_diesel; -use crate::db::error::ErrorHandler; -use crate::db::pagination::{paginated, paginated_multicolumn, Paginator}; -use async_bb8_diesel::AsyncRunQueryDsl; -use diesel::prelude::*; -use diesel::ExpressionMethods; -use diesel::SelectableHelper; -use futures::{future::BoxFuture, TryFutureExt}; -use omicron_common::api::external::Error; -use omicron_common::api::external::LookupType; -use omicron_common::api::external::ResourceType; -use omicron_common::backoff::retry_notify; -use omicron_common::backoff::retry_policy_internal_service; -use omicron_common::backoff::BackoffError; -use std::future::Future; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -/// Result type of a [`RecoveryTask`]. 
-pub type RecoveryResult = Result; - -/// A future which completes once sagas have been loaded and resumed. -/// Note that this does not necessarily mean the sagas have completed -/// execution. -/// -/// Returns a Result of either: -/// - A [`CompletionTask`] to track the completion of the resumed sagas, or -/// - An [`Error`] encountered when attempting to load and resume sagas. -pub struct RecoveryTask(BoxFuture<'static, RecoveryResult>); - -impl Future for RecoveryTask { - type Output = RecoveryResult; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - Pin::new(&mut self.get_mut().0).poll(cx) - } -} - -/// Result type from a [`CompletionTask`]. -pub type CompletionResult = Result<(), Error>; - -/// A future which completes once loaded and resumed sagas have also completed. -pub struct CompletionTask(BoxFuture<'static, CompletionResult>); - -impl Future for CompletionTask { - type Output = CompletionResult; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - Pin::new(&mut self.get_mut().0).poll(cx) - } -} - -/// Starts an asynchronous task to recover sagas (as after a crash or restart) -/// -/// More specifically, this task queries the database to list all uncompleted -/// sagas that are assigned to SEC `sec_id` and for each one: -/// -/// * loads the saga DAG and log from `datastore` -/// * uses [`steno::SecClient::saga_resume`] to prepare to resume execution of -/// the saga using the persistent saga log -/// * resumes execution of each saga -/// -/// The returned [`RecoveryTask`] completes once all sagas have been loaded -/// and resumed, and itself returns a [`CompletionTask`] which completes -/// when those resumed sagas have finished. -pub fn recover( - opctx: OpContext, - sec_id: db::SecId, - uctx: Arc, - datastore: Arc, - sec_client: Arc, - registry: Arc>, -) -> RecoveryTask -where - T: steno::SagaType, -{ - let join_handle = tokio::spawn(async move { - info!(&opctx.log, "start saga recovery"); - - // We perform the initial list of sagas using a standard retry policy. - // We treat all errors as transient because there's nothing we can do - // about any of them except try forever. As a result, we never expect - // an error from the overall operation. - // TODO-monitoring we definitely want a way to raise a big red flag if - // saga recovery is not completing. - // TODO-robustness It would be better to retry the individual database - // operations within this operation than retrying the overall operation. - // As this is written today, if the listing requires a bunch of pages - // and the operation fails partway through, we'll re-fetch all the pages - // we successfully fetched before. If the database is overloaded and - // only N% of requests are completing, the probability of this operation - // succeeding decreases considerably as the number of separate queries - // (pages) goes up. We'd be much more likely to finish the overall - // operation if we didn't throw away the results we did get each time. - let found_sagas = retry_notify( - retry_policy_internal_service(), - || async { - list_unfinished_sagas(&opctx, &datastore, &sec_id) - .await - .map_err(BackoffError::transient) - }, - |error, duration| { - warn!( - &opctx.log, - "failed to list sagas (will retry after {:?}): {:#}", - duration, - error - ) - }, - ) - .await - .unwrap(); - - info!(&opctx.log, "listed sagas ({} total)", found_sagas.len()); - - let recovery_futures = found_sagas.into_iter().map(|saga| async { - // TODO-robustness We should put this into a retry loop. 
We may - // also want to take any failed sagas and put them at the end of the - // queue. It shouldn't really matter, in that the transient - // failures here are likely to affect recovery of all sagas. - // However, it's conceivable we misclassify a permanent failure as a - // transient failure, or that a transient failure is more likely to - // affect some sagas than others (e.g, data on a different node, or - // it has a larger log that requires more queries). To avoid one - // bad saga ruining the rest, we should try to recover the rest - // before we go back to one that's failed. - // TODO-debugging want visibility into "abandoned" sagas - let saga_id: steno::SagaId = saga.id.into(); - recover_saga( - &opctx, - Arc::clone(&uctx), - &datastore, - &sec_client, - Arc::clone(®istry), - saga, - ) - .map_err(|error| { - warn!( - &opctx.log, - "failed to recover saga {}: {:#}", saga_id, error - ); - error - }) - .await - }); - - let mut completion_futures = Vec::with_capacity(recovery_futures.len()); - // Loads and resumes all sagas in serial. - for recovery_future in recovery_futures { - let saga_complete_future = recovery_future.await?; - completion_futures.push(saga_complete_future); - } - // Returns a future that awaits the completion of all resumed sagas. - Ok(CompletionTask(Box::pin(async move { - futures::future::try_join_all(completion_futures).await?; - Ok(()) - }))) - }); - - RecoveryTask(Box::pin(async move { - // Unwraps join-related errors. - join_handle.await.unwrap() - })) -} - -/// Queries the database to return a list of uncompleted sagas assigned to SEC -/// `sec_id` -// For now, we do the simplest thing: we fetch all the sagas that the -// caller's going to need before returning any of them. This is easier to -// implement than, say, using a channel or some other stream. In principle -// we're giving up some opportunity for parallelism. The caller could be -// going off and fetching the saga log for the first sagas that we find -// while we're still listing later sagas. Doing that properly would require -// concurrency limits to prevent overload or starvation of other database -// consumers. -async fn list_unfinished_sagas( - opctx: &OpContext, - datastore: &db::DataStore, - sec_id: &db::SecId, -) -> Result, Error> { - trace!(&opctx.log, "listing sagas"); - - // Read all sagas in batches. - // - // Although we could read them all into memory simultaneously, this - // risks blocking the DB for an unreasonable amount of time. Instead, - // we paginate to avoid cutting off availability to the DB. - let mut sagas = vec![]; - let mut paginator = Paginator::new(SQL_BATCH_SIZE); - let conn = datastore.pool_connection_authorized(opctx).await?; - while let Some(p) = paginator.next() { - use db::schema::saga::dsl; - - let mut batch = paginated(dsl::saga, dsl::id, &p.current_pagparams()) - .filter(dsl::saga_state.ne(db::saga_types::SagaCachedState( - steno::SagaCachedState::Done, - ))) - .filter(dsl::current_sec.eq(*sec_id)) - .select(db::saga_types::Saga::as_select()) - .load_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByLookup( - ResourceType::SagaDbg, - LookupType::ById(sec_id.0), - ), - ) - })?; - - paginator = p.found_batch(&batch, &|row| row.id); - sagas.append(&mut batch); - } - Ok(sagas) -} - -/// Recovers an individual saga -/// -/// This function loads the saga log and uses `sec_client` to resume execution. -/// -/// This function returns a future that completes when the resumed saga -/// has completed. 
The saga executor will attempt to execute the saga -/// regardless of this future - it is for notification purposes only, -/// and does not need to be polled. -async fn recover_saga<'a, T>( - opctx: &'a OpContext, - uctx: Arc, - datastore: &'a db::DataStore, - sec_client: &'a steno::SecClient, - registry: Arc>, - saga: db::saga_types::Saga, -) -> Result< - impl core::future::Future> + 'static, - Error, -> -where - T: steno::SagaType, -{ - let saga_id: steno::SagaId = saga.id.into(); - let saga_name = saga.name.clone(); - trace!(opctx.log, "recovering saga: start"; - "saga_id" => saga_id.to_string(), - "saga_name" => saga_name.clone(), - ); - - let log_events = load_saga_log(&opctx, datastore, &saga).await?; - trace!( - opctx.log, - "recovering saga: loaded log"; - "saga_id" => ?saga_id, - "saga_name" => saga_name.clone() - ); - let saga_completion = sec_client - .saga_resume( - saga_id, - Arc::clone(&uctx), - saga.saga_dag, - registry, - log_events, - ) - .await - .map_err(|error| { - // TODO-robustness We want to differentiate between retryable and - // not here - Error::internal_error(&format!( - "failed to resume saga: {:#}", - error - )) - })?; - sec_client.saga_start(saga_id).await.map_err(|error| { - Error::internal_error(&format!("failed to start saga: {:#}", error)) - })?; - - Ok(async { - saga_completion.await.kind.map_err(|e| { - Error::internal_error(&format!("Saga failure: {:?}", e)) - })?; - Ok(()) - }) -} - -/// Queries the database to load the full log for the specified saga -async fn load_saga_log( - opctx: &OpContext, - datastore: &db::DataStore, - saga: &db::saga_types::Saga, -) -> Result, Error> { - // Read all events in batches. - // - // Although we could read them all into memory simultaneously, this - // risks blocking the DB for an unreasonable amount of time. Instead, - // we paginate to avoid cutting off availability. - let mut events = vec![]; - let mut paginator = Paginator::new(SQL_BATCH_SIZE); - let conn = datastore.pool_connection_authorized(opctx).await?; - while let Some(p) = paginator.next() { - use db::schema::saga_node_event::dsl; - let batch = paginated_multicolumn( - dsl::saga_node_event, - (dsl::node_id, dsl::event_type), - &p.current_pagparams(), - ) - .filter(dsl::saga_id.eq(saga.id)) - .select(db::saga_types::SagaNodeEvent::as_select()) - .load_async(&*conn) - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) - .await?; - paginator = - p.found_batch(&batch, &|row| (row.node_id, row.event_type.clone())); - - let mut batch = batch - .into_iter() - .map(|event| steno::SagaNodeEvent::try_from(event)) - .collect::, Error>>()?; - - events.append(&mut batch); - } - Ok(events) -} - -#[cfg(test)] -mod test { - use super::*; - use crate::context::OpContext; - use crate::db::test_utils::UnpluggableCockroachDbSecStore; - use nexus_test_utils::db::test_setup_database; - use omicron_test_utils::dev; - use once_cell::sync::Lazy; - use pretty_assertions::assert_eq; - use rand::seq::SliceRandom; - use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; - use steno::{ - new_action_noop_undo, Action, ActionContext, ActionError, - ActionRegistry, DagBuilder, Node, SagaDag, SagaId, SagaName, SagaType, - SecClient, - }; - use uuid::Uuid; - - // Returns a cockroach DB, as well as a "datastore" interface (which is the - // one more frequently used by Nexus). - // - // The caller is responsible for calling "cleanup().await" on the returned - // CockroachInstance - we would normally wrap this in a drop method, but it - // is async. 
- async fn new_db( - log: &slog::Logger, - ) -> (dev::db::CockroachInstance, Arc) { - let db = test_setup_database(&log).await; - let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(log, &cfg)); - let db_datastore = Arc::new( - db::DataStore::new(&log, Arc::clone(&pool), None).await.unwrap(), - ); - (db, db_datastore) - } - - // The following is our "saga-under-test". It's a simple two-node operation - // that tracks how many times it has been called, and provides a mechanism - // for detaching storage, to simulate power failure (and meaningfully - // recover). - - #[derive(Debug)] - struct TestContext { - log: slog::Logger, - - // Storage, and instructions on whether or not to detach it - // when executing the first saga action. - storage: Arc, - do_unplug: AtomicBool, - - // Tracks of how many times each node has been reached. - n1_count: AtomicU32, - n2_count: AtomicU32, - } - - impl TestContext { - fn new( - log: &slog::Logger, - storage: Arc, - ) -> Self { - TestContext { - log: log.clone(), - storage, - do_unplug: AtomicBool::new(false), - - // Counters of how many times the nodes have been invoked. - n1_count: AtomicU32::new(0), - n2_count: AtomicU32::new(0), - } - } - } - - #[derive(Debug)] - struct TestOp; - impl SagaType for TestOp { - type ExecContextType = TestContext; - } - - static ACTION_N1: Lazy>> = - Lazy::new(|| new_action_noop_undo("n1_action", node_one)); - static ACTION_N2: Lazy>> = - Lazy::new(|| new_action_noop_undo("n2_action", node_two)); - - fn registry_create() -> Arc> { - let mut registry = ActionRegistry::new(); - registry.register(Arc::clone(&ACTION_N1)); - registry.register(Arc::clone(&ACTION_N2)); - Arc::new(registry) - } - - fn saga_object_create() -> Arc { - let mut builder = DagBuilder::new(SagaName::new("test-saga")); - builder.append(Node::action("n1_out", "NodeOne", ACTION_N1.as_ref())); - builder.append(Node::action("n2_out", "NodeTwo", ACTION_N2.as_ref())); - let dag = builder.build().unwrap(); - Arc::new(SagaDag::new(dag, serde_json::Value::Null)) - } - - async fn node_one(ctx: ActionContext) -> Result { - let uctx = ctx.user_data(); - uctx.n1_count.fetch_add(1, Ordering::SeqCst); - info!(&uctx.log, "ACTION: node_one"); - // If "do_unplug" is true, we detach storage. - // - // This prevents the SEC from successfully recording that - // this node completed, and acts like a crash. - if uctx.do_unplug.load(Ordering::SeqCst) { - info!(&uctx.log, "Unplugged storage"); - uctx.storage.set_unplug(true); - } - Ok(1) - } - - async fn node_two(ctx: ActionContext) -> Result { - let uctx = ctx.user_data(); - uctx.n2_count.fetch_add(1, Ordering::SeqCst); - info!(&uctx.log, "ACTION: node_two"); - Ok(2) - } - - // Helper function for setting up storage, SEC, and a test context object. 
- fn create_storage_sec_and_context( - log: &slog::Logger, - db_datastore: Arc, - sec_id: db::SecId, - ) -> (Arc, SecClient, Arc) - { - let storage = Arc::new(UnpluggableCockroachDbSecStore::new( - sec_id, - db_datastore, - log.new(o!("component" => "SecStore")), - )); - let sec_client = - steno::sec(log.new(o!("component" => "SEC")), storage.clone()); - let uctx = Arc::new(TestContext::new(&log, storage.clone())); - (storage, sec_client, uctx) - } - - #[tokio::test] - async fn test_failure_during_saga_can_be_recovered() { - // Test setup - let logctx = - dev::test_setup_log("test_failure_during_saga_can_be_recovered"); - let log = logctx.log.new(o!()); - let (mut db, db_datastore) = new_db(&log).await; - let sec_id = db::SecId(uuid::Uuid::new_v4()); - let (storage, sec_client, uctx) = - create_storage_sec_and_context(&log, db_datastore.clone(), sec_id); - let sec_log = log.new(o!("component" => "SEC")); - let opctx = OpContext::for_tests( - log, - Arc::clone(&db_datastore) as Arc, - ); - - // Create and start a saga. - // - // Because "do_unplug" is set to true, we should detach storage within - // the first node operation. - // - // We expect the saga will complete successfully, because the - // storage subsystem returns "OK" rather than an error. - uctx.do_unplug.store(true, Ordering::SeqCst); - let saga_id = SagaId(Uuid::new_v4()); - let future = sec_client - .saga_create( - saga_id, - uctx.clone(), - saga_object_create(), - registry_create(), - ) - .await - .unwrap(); - sec_client.saga_start(saga_id).await.unwrap(); - let result = future.await; - let output = result.kind.unwrap(); - assert_eq!(output.lookup_node_output::("n1_out").unwrap(), 1); - assert_eq!(output.lookup_node_output::("n2_out").unwrap(), 2); - assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 1); - assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 1); - - // Now we "reboot", by terminating the SEC and creating a new one - // using the same storage system. - // - // We update uctx to prevent the storage system from detaching again. - sec_client.shutdown().await; - let sec_client = steno::sec(sec_log, storage.clone()); - uctx.storage.set_unplug(false); - uctx.do_unplug.store(false, Ordering::SeqCst); - - // Recover the saga, observing that it re-runs operations and completes. - let sec_client = Arc::new(sec_client); - recover( - opctx, - sec_id, - uctx.clone(), - db_datastore, - sec_client.clone(), - registry_create(), - ) - .await // Await the loading and resuming of the sagas - .unwrap() - .await // Awaits the completion of the resumed sagas - .unwrap(); - assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 2); - assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 2); - - // Test cleanup - let sec_client = Arc::try_unwrap(sec_client).unwrap(); - sec_client.shutdown().await; - db.cleanup().await.unwrap(); - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_successful_saga_does_not_replay_during_recovery() { - // Test setup - let logctx = dev::test_setup_log( - "test_successful_saga_does_not_replay_during_recovery", - ); - let log = logctx.log.new(o!()); - let (mut db, db_datastore) = new_db(&log).await; - let sec_id = db::SecId(uuid::Uuid::new_v4()); - let (storage, sec_client, uctx) = - create_storage_sec_and_context(&log, db_datastore.clone(), sec_id); - let sec_log = log.new(o!("component" => "SEC")); - let opctx = OpContext::for_tests( - log, - Arc::clone(&db_datastore) as Arc, - ); - - // Create and start a saga, which we expect to complete successfully. 
- let saga_id = SagaId(Uuid::new_v4()); - let future = sec_client - .saga_create( - saga_id, - uctx.clone(), - saga_object_create(), - registry_create(), - ) - .await - .unwrap(); - sec_client.saga_start(saga_id).await.unwrap(); - let result = future.await; - let output = result.kind.unwrap(); - assert_eq!(output.lookup_node_output::("n1_out").unwrap(), 1); - assert_eq!(output.lookup_node_output::("n2_out").unwrap(), 2); - assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 1); - assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 1); - - // Now we "reboot", by terminating the SEC and creating a new one - // using the same storage system. - sec_client.shutdown().await; - let sec_client = steno::sec(sec_log, storage.clone()); - - // Recover the saga, observing that it does not replay the nodes. - let sec_client = Arc::new(sec_client); - recover( - opctx, - sec_id, - uctx.clone(), - db_datastore, - sec_client.clone(), - registry_create(), - ) - .await - .unwrap() - .await - .unwrap(); - assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 1); - assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 1); - - // Test cleanup - let sec_client = Arc::try_unwrap(sec_client).unwrap(); - sec_client.shutdown().await; - db.cleanup().await.unwrap(); - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_list_unfinished_sagas() { - // Test setup - let logctx = dev::test_setup_log("test_list_unfinished_sagas"); - let log = logctx.log.new(o!()); - let (mut db, db_datastore) = new_db(&log).await; - let sec_id = db::SecId(uuid::Uuid::new_v4()); - let opctx = OpContext::for_tests( - log, - Arc::clone(&db_datastore) as Arc, - ); - - // Create a couple batches of sagas. - let new_running_db_saga = || { - let params = steno::SagaCreateParams { - id: steno::SagaId(Uuid::new_v4()), - name: steno::SagaName::new("test saga"), - dag: serde_json::value::Value::Null, - state: steno::SagaCachedState::Running, - }; - - db::model::saga_types::Saga::new(sec_id, params) - }; - let mut inserted_sagas = (0..SQL_BATCH_SIZE.get() * 2) - .map(|_| new_running_db_saga()) - .collect::>(); - - // Shuffle these sagas into a random order to check that the pagination - // order is working as intended on the read path, which we'll do later - // in this test. - inserted_sagas.shuffle(&mut rand::thread_rng()); - - // Insert the batches of unfinished sagas into the database - let conn = db_datastore - .pool_connection_unauthorized() - .await - .expect("Failed to access db connection"); - diesel::insert_into(db::schema::saga::dsl::saga) - .values(inserted_sagas.clone()) - .execute_async(&*conn) - .await - .expect("Failed to insert test setup data"); - - // List them, expect to see them all in order by ID. - let mut observed_sagas = - list_unfinished_sagas(&opctx, &db_datastore, &sec_id) - .await - .expect("Failed to list unfinished sagas"); - inserted_sagas.sort_by_key(|a| a.id); - - // Timestamps can change slightly when we insert them. - // - // Sanitize them to make input/output equality checks easier. 
- let sanitize_timestamps = |sagas: &mut Vec| { - for saga in sagas { - saga.time_created = chrono::DateTime::UNIX_EPOCH; - saga.adopt_time = chrono::DateTime::UNIX_EPOCH; - } - }; - sanitize_timestamps(&mut observed_sagas); - sanitize_timestamps(&mut inserted_sagas); - - assert_eq!( - inserted_sagas, observed_sagas, - "Observed sagas did not match inserted sagas" - ); - - // Test cleanup - db.cleanup().await.unwrap(); - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_list_unfinished_nodes() { - // Test setup - let logctx = dev::test_setup_log("test_list_unfinished_nodes"); - let log = logctx.log.new(o!()); - let (mut db, db_datastore) = new_db(&log).await; - let sec_id = db::SecId(uuid::Uuid::new_v4()); - let opctx = OpContext::for_tests( - log, - Arc::clone(&db_datastore) as Arc, - ); - let saga_id = steno::SagaId(Uuid::new_v4()); - - // Create a couple batches of saga events - let new_db_saga_nodes = - |node_id: u32, event_type: steno::SagaNodeEventType| { - let event = steno::SagaNodeEvent { - saga_id, - node_id: steno::SagaNodeId::from(node_id), - event_type, - }; - - db::model::saga_types::SagaNodeEvent::new(event, sec_id) - }; - let mut inserted_nodes = (0..SQL_BATCH_SIZE.get() * 2) - .flat_map(|i| { - // This isn't an exhaustive list of event types, but gives us a few - // options to pick from. Since this is a pagination key, it's - // important to include a variety here. - use steno::SagaNodeEventType::*; - [ - new_db_saga_nodes(i, Started), - new_db_saga_nodes(i, UndoStarted), - new_db_saga_nodes(i, UndoFinished), - ] - }) - .collect::>(); - - // Shuffle these nodes into a random order to check that the pagination - // order is working as intended on the read path, which we'll do later - // in this test. - inserted_nodes.shuffle(&mut rand::thread_rng()); - - // Insert them into the database - let conn = db_datastore - .pool_connection_unauthorized() - .await - .expect("Failed to access db connection"); - diesel::insert_into(db::schema::saga_node_event::dsl::saga_node_event) - .values(inserted_nodes.clone()) - .execute_async(&*conn) - .await - .expect("Failed to insert test setup data"); - - // List them, expect to see them all in order by ID. - // - // Note that we need to make up a saga to see this, but the - // part of it that actually matters is the ID. - let params = steno::SagaCreateParams { - id: saga_id, - name: steno::SagaName::new("test saga"), - dag: serde_json::value::Value::Null, - state: steno::SagaCachedState::Running, - }; - let saga = db::model::saga_types::Saga::new(sec_id, params); - let observed_nodes = load_saga_log(&opctx, &db_datastore, &saga) - .await - .expect("Failed to list unfinished nodes"); - inserted_nodes.sort_by_key(|a| (a.node_id, a.event_type.clone())); - - let inserted_nodes = inserted_nodes - .into_iter() - .map(|node| steno::SagaNodeEvent::try_from(node)) - .collect::, _>>() - .expect("Couldn't convert DB nodes to steno nodes"); - - // The steno::SagaNodeEvent type doesn't implement PartialEq, so we need to do this - // a little manually. 
- assert_eq!(inserted_nodes.len(), observed_nodes.len()); - for i in 0..inserted_nodes.len() { - assert_eq!(inserted_nodes[i].saga_id, observed_nodes[i].saga_id); - assert_eq!(inserted_nodes[i].node_id, observed_nodes[i].node_id); - assert_eq!( - inserted_nodes[i].event_type.label(), - observed_nodes[i].event_type.label() - ); - } - - // Test cleanup - db.cleanup().await.unwrap(); - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_list_no_unfinished_nodes() { - // Test setup - let logctx = dev::test_setup_log("test_list_no_unfinished_nodes"); - let log = logctx.log.new(o!()); - let (mut db, db_datastore) = new_db(&log).await; - let sec_id = db::SecId(uuid::Uuid::new_v4()); - let opctx = OpContext::for_tests( - log, - Arc::clone(&db_datastore) as Arc, - ); - let saga_id = steno::SagaId(Uuid::new_v4()); - - let params = steno::SagaCreateParams { - id: saga_id, - name: steno::SagaName::new("test saga"), - dag: serde_json::value::Value::Null, - state: steno::SagaCachedState::Running, - }; - let saga = db::model::saga_types::Saga::new(sec_id, params); - - // Test that this returns "no nodes" rather than throwing some "not - // found" error. - let observed_nodes = load_saga_log(&opctx, &db_datastore, &saga) - .await - .expect("Failed to list unfinished nodes"); - assert_eq!(observed_nodes.len(), 0); - - // Test cleanup - db.cleanup().await.unwrap(); - logctx.cleanup_successful(); - } -} diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 5dadb329cd..ef67749a4b 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -134,6 +134,7 @@ instance_watcher.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 +saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 [default_region_allocation_strategy] diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 8c1ab5ca5f..6ec80359ab 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -120,6 +120,7 @@ instance_watcher.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 +saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 [default_region_allocation_strategy] diff --git a/nexus/saga-recovery/Cargo.toml b/nexus/saga-recovery/Cargo.toml new file mode 100644 index 0000000000..4356cc9789 --- /dev/null +++ b/nexus/saga-recovery/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "nexus-saga-recovery" +version = "0.1.0" +edition = "2021" + +[lints] +workspace = true + +[build-dependencies] +omicron-rpaths.workspace = true + +[dependencies] +chrono.workspace = true +futures.workspace = true +nexus-db-queries.workspace = true +nexus-db-model.workspace = true +omicron-common.workspace = true +# See omicron-rpaths for more about the "pq-sys" dependency. 
+pq-sys = "*" +serde.workspace = true +serde_json.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +steno.workspace = true +tokio.workspace = true + +omicron-workspace-hack.workspace = true + +[dev-dependencies] +nexus-auth.workspace = true +nexus-db-queries.workspace = true +nexus-test-utils.workspace = true +nexus-test-utils-macros.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +omicron-test-utils.workspace = true +once_cell.workspace = true +pretty_assertions.workspace = true +steno.workspace = true +tokio.workspace = true +uuid.workspace = true diff --git a/nexus/saga-recovery/build.rs b/nexus/saga-recovery/build.rs new file mode 100644 index 0000000000..1ba9acd41c --- /dev/null +++ b/nexus/saga-recovery/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/saga-recovery/src/lib.rs b/nexus/saga-recovery/src/lib.rs new file mode 100644 index 0000000000..a83fc28774 --- /dev/null +++ b/nexus/saga-recovery/src/lib.rs @@ -0,0 +1,682 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! # Saga recovery bookkeeping +//! +//! If you're reading this, you first want to read the big block comment in the +//! saga recovery background task. It explains important background about what +//! we're trying to do here. The rest of this comment assumes you've read all +//! that. +//! +//! ## Saga recovery passes +//! +//! For the reasons mentioned in that comment, saga recovery is done in +//! **passes**. The state that's kept between passes (when the task is "at +//! rest") is called the [`RestState`]. +//! +//! Each saga recovery pass looks like this: +//! +//! 1. Start with initial [`RestState`] and [`Report`]. +//! 2. List in-progress sagas from the database. +//! 3. Collect list of sagas that have been started by the rest of Nexus. +//! 4. Use [`Plan::new()`] to construct a plan. The plan examines all the sagas +//! reported by the database as well as all the sagas we knew about before +//! and determines for each one exactly what's needed. Each saga falls into +//! one of a few buckets: +//! * It's clearly not running, but should be, so it needs to be recovered. +//! * It's clearly running, so it does not need to be recovered. +//! * It has finished running and we can stop keeping track of it. +//! * It _may_ be running but we cannot tell because of the intrinsic race +//! between steps 2 and 3 above. We'll keep track of these and resolve the +//! ambiguity on the next pass. +//! 5. Carry out recovery for whatever sagas need to be recovered. Use +//! [`ExecutionBuilder::new()`] to construct a description of what happened. +//! 6. Update the [`RestState`] and [`Report`] to reflect what happened in this +//! pass. +//! +//! This process can be repeated forever, as often as wanted, but should not be +//! run concurrently. +//! +//! ## Saga recovery task vs. this crate +//! +//! This process is driven by the caller (the saga recovery background task), +//! 
with helpers provided by this crate: +//! +//! ```text +//! Saga recovery task | nexus-saga-recovery crate +//! ------------------------------------------------------------------------ +//! | +//! 1. initial `RestState` and ---> provides `RestState`, `Report` +//! `Report` | +//! | +//! 2. list in-progress sagas | +//! | +//! 3. collect list of sagas ------> use +//! started by Nexus | `RestState::update_started_sagas()` +//! | +//! 4. make a plan ----------------> use `Plan::new()` +//! | This is where all the decisions +//! | about saga recovery get made. +//! | +//! 5. follow the plan -------------> use `Plan::sagas_needing_recovery()` +//! | +//! fetch details from db | +//! load sagas into Steno | +//! | use `ExecutionBuilder::new()` to +//! | report what's going on +//! | +//! 6. update `RestState` and -----> use `RestState::update_after_pass()` +//! `Report` | and `Report::update_after_pass()` +//! ``` +//! +//! We do it this way to separate all the tricky planning logic from the +//! mechanics of loading saga state from the database and handing it over to +//! Steno (which is simple by comparison). This crate handles the planning and +//! reporting. The saga recovery task handles the database/Steno stuff. This +//! is an example of the ["plan-execute" pattern][1] and it makes it much easier +//! for us to exercise all the different cases in automated tests. It also +//! makes it easy to keep status objects for runtime observability and +//! debugging. These get exposed to `omdb` and should also be visible in core +//! files. +//! +//! [1]: https://mmapped.blog/posts/29-plan-execute + +mod recovery; +mod status; + +pub use recovery::Execution; +pub use recovery::ExecutionBuilder; +pub use recovery::Plan; +pub use recovery::RestState; +pub use status::DebuggingHistory; +pub use status::LastPass; +pub use status::LastPassSuccess; +pub use status::RecoveryFailure; +pub use status::RecoverySuccess; +pub use status::Report; + +#[cfg(test)] +mod test { + use super::*; + use omicron_common::api::external::Error; + use omicron_test_utils::dev::test_setup_log; + use slog::o; + use std::collections::BTreeMap; + use std::collections::BTreeSet; + use steno::SagaId; + use tokio::sync::mpsc; + use uuid::Uuid; + + const FAKE_SEC_ID: &str = "03082281-fb2e-4bfd-bce3-997c89a0db2d"; + pub fn make_fake_saga(saga_id: SagaId) -> nexus_db_model::Saga { + let sec_id = + nexus_db_model::SecId::from(FAKE_SEC_ID.parse::().unwrap()); + nexus_db_model::Saga::new( + sec_id, + steno::SagaCreateParams { + id: saga_id, + name: steno::SagaName::new("dummy"), + state: steno::SagaCachedState::Running, + dag: serde_json::Value::Null, + }, + ) + } + + pub fn make_saga_ids(count: usize) -> Vec { + let mut rv = Vec::with_capacity(count); + for _ in 0..count { + rv.push(SagaId(Uuid::new_v4())); + } + // Essentially by coincidence, the values we're checking against + // are going to be sorted. Sort this here for convenience. + rv.sort(); + rv + } + + /// Simple simulator for saga recovery state + /// + /// This type exposes functions to simulate things that would happen in + /// Nexus (e.g., saga started, saga finished, etc.). It keeps track of + /// what little simulated database state and in-memory state is required to + /// exercise all the bookkeeping in this crate. 
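    /// A typical test drives the simulator roughly like this (sketch only; the
    /// helper methods are defined below):
    ///
    /// ```text
    /// let mut sim = Simulator::new(log.clone());
    /// let saga_id = sim.sim_normal_saga_start();   // Nexus starts a saga
    /// let (_, _, last_pass, _) = sim.sim_recovery_pass();
    /// assert_eq!(1, last_pass.nskipped);           // it's running, so it's skipped
    /// sim.sim_normal_saga_done(saga_id);           // the saga finishes
    /// let _ = sim.sim_recovery_pass();             // "maybe done" on this pass
    /// let _ = sim.sim_recovery_pass();             // inferred done on the next
    /// ```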
+ struct Simulator { + log: slog::Logger, + rest_state: recovery::RestState, + started_sagas: Vec, + db_list: BTreeMap, + snapshot_db_list: Option>, + injected_recovery_errors: BTreeSet, + } + + impl Simulator { + pub fn new(log: slog::Logger) -> Simulator { + Simulator { + log, + rest_state: recovery::RestState::new(), + started_sagas: Vec::new(), + db_list: BTreeMap::new(), + snapshot_db_list: None, + injected_recovery_errors: BTreeSet::new(), + } + } + + /// Pretend that a particular saga was running in a previous Nexus + /// lifetime (and so needs to be recovered). + pub fn sim_previously_running_saga(&mut self) -> SagaId { + let saga_id = SagaId(Uuid::new_v4()); + println!("sim: recording previously-running saga {saga_id}"); + self.db_list.insert(saga_id, make_fake_saga(saga_id)); + saga_id + } + + /// Pretend that Nexus started a new saga (e.g., in response to an API + /// request) + pub fn sim_normal_saga_start(&mut self) -> SagaId { + let saga_id = SagaId(Uuid::new_v4()); + println!("sim: starting saga {saga_id}"); + self.db_list.insert(saga_id, make_fake_saga(saga_id)); + self.started_sagas.push(saga_id); + saga_id + } + + /// Pretend that Nexus finished running the given saga + pub fn sim_normal_saga_done(&mut self, saga_id: SagaId) { + println!("sim: finishing saga {saga_id}"); + assert!( + self.db_list.remove(&saga_id).is_some(), + "simulated saga finished, but it wasn't running" + ); + } + + /// Configure simulation so that recovery for the specified saga will + /// succeed or fail, depending on `fail`. This will affect all recovery + /// passes until the function is called again with a different value. + /// + /// If this function is not called for a saga, the default behavior is + /// that recovery succeeds. + pub fn sim_config_recovery_result( + &mut self, + saga_id: SagaId, + fail: bool, + ) { + println!( + "sim: configuring saga {saga_id} recovery to {}", + if fail { "fail" } else { "succeed" } + ); + if fail { + self.injected_recovery_errors.insert(saga_id); + } else { + self.injected_recovery_errors.remove(&saga_id); + } + } + + /// Snapshot the simulated database state and use that state for the + /// next recovery pass. + /// + /// As an example, this can be used to exercise both sides of the race + /// between Nexus starting a saga and listing in-progress sagas. If you + /// want to test "listing in-progress" happens first, use this function + /// to snapshot the database state, then start a saga, and then do a + /// recovery pass. That recovery pass will act on the snapshotted + /// database state. + /// + /// After the next recovery pass, the snapshotted state will be removed. + /// The _next_ recovery pass will use the latest database state unless + /// this function is called again. + pub fn snapshot_db(&mut self) { + println!("sim: snapshotting database"); + assert!( + self.snapshot_db_list.is_none(), + "multiple snapshots created between recovery passes" + ); + self.snapshot_db_list = Some(self.db_list.clone()); + } + + /// Simulate a saga recovery pass + pub fn sim_recovery_pass( + &mut self, + ) -> (recovery::Plan, recovery::Execution, status::LastPassSuccess, usize) + { + let log = &self.log; + + println!("sim: starting recovery pass"); + + // Simulate processing messages that the `new_sagas_started` sagas + // just started. + let nstarted = self.started_sagas.len(); + let (tx, mut rx) = mpsc::unbounded_channel(); + for saga_id in self.started_sagas.drain(..) 
{ + tx.send(saga_id).unwrap(); + } + self.rest_state.update_started_sagas(log, &mut rx); + + // Start the recovery pass by planning what to do. + let db_sagas = self + .snapshot_db_list + .take() + .unwrap_or_else(|| self.db_list.clone()); + let plan = recovery::Plan::new(log, &self.rest_state, db_sagas); + + // Simulate execution using the callback to determine whether + // recovery for each saga succeeds or not. + // + // There are a lot of ways we could interleave execution here. But + // in practice, the implementation we care about does these all + // serially. So that's what we test here. + let mut execution_builder = recovery::ExecutionBuilder::new(); + let mut nok = 0; + let mut nerrors = 0; + for (saga_id, saga) in plan.sagas_needing_recovery() { + let saga_log = log.new(o!( + "saga_name" => saga.name.clone(), + "saga_id" => saga_id.to_string(), + )); + + execution_builder.saga_recovery_start(*saga_id, saga_log); + if self.injected_recovery_errors.contains(saga_id) { + nerrors += 1; + execution_builder.saga_recovery_failure( + *saga_id, + &Error::internal_error("test error"), + ); + } else { + nok += 1; + execution_builder.saga_recovery_success(*saga_id); + } + } + + let execution = execution_builder.build(); + let last_pass = status::LastPassSuccess::new(&plan, &execution); + assert_eq!(last_pass.nrecovered, nok); + assert_eq!(last_pass.nfailed, nerrors); + + self.rest_state.update_after_pass(&plan, &execution); + + println!("sim: recovery pass result: {:?}", last_pass); + + // We can't tell from the information we have how many were skipped, + // removed, or ambiguous. The caller verifies that. + (plan, execution, last_pass, nstarted) + } + } + + // End-to-end test of the saga recovery bookkeeping, which is basically + // everything *except* loading the sagas from the database and restoring + // them in Steno. See the block comment above -- that stuff lives outside + // of this crate. + // + // Tests the following structures used together: + // + // - RestState + // - Plan + // - Execution + // - Report + // + // These are hard to test in isolation since they're intended to be used + // together in a loop (and so don't export public interfaces for mucking + // with internal). + #[tokio::test] + async fn test_basic() { + let logctx = test_setup_log("saga_recovery_basic"); + let log = &logctx.log; + + // Start with a blank slate. + let mut sim = Simulator::new(log.clone()); + let initial_rest_state = sim.rest_state.clone(); + let mut report = status::Report::new(); + + // + // Now, go through a no-op recovery. + // + let (plan, execution, last_pass_result, nstarted) = + sim.sim_recovery_pass(); + assert_eq!(last_pass_result.nfound, 0); + assert_eq!(last_pass_result.nskipped, 0); + assert_eq!(last_pass_result.nremoved, 0); + assert_eq!(sim.rest_state, initial_rest_state); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 0); + assert_eq!(report.ntotal_failures, 0); + assert_eq!(report.ntotal_started, 0); + assert_eq!(report.ntotal_finished, 0); + + // + // Now, go through a somewhat general case of recovery. + // + // First, add a couple of sagas that just showed up in the database. + // This covers the case of sagas that were either from a previous Nexus + // lifetime or re-assigned from some other Nexus that has been expunged. + // We create two so we can exercise success and failure cases for + // recovery. 
+ // + println!("test: general recovery case"); + let saga_recover_ok = sim.sim_previously_running_saga(); + let saga_recover_fail = sim.sim_previously_running_saga(); + sim.sim_config_recovery_result(saga_recover_fail, true); + + // Simulate Nexus starting a couple of sagas in the usual way. This one + // will appear in the database as well as in our set of sagas started. + let saga_started_normally_1 = sim.sim_normal_saga_start(); + let saga_started_normally_2 = sim.sim_normal_saga_start(); + + // Start a saga and then finish it immediately. This is a tricky case + // because the recovery pass will see that it started, but not see in + // the database, and it won't be able to tell if it finished or just + // started. + let saga_started_and_finished = sim.sim_normal_saga_start(); + sim.sim_normal_saga_done(saga_started_and_finished); + + // Take a snapshot. Subsequent changes will not affect the database + // state that's used for the next recovery pass. We'll use this to + // simulate Nexus having started a saga immediately after the database + // listing that's used for a recovery pass. + sim.snapshot_db(); + let saga_started_after_listing = sim.sim_normal_saga_start(); + + // We're finally ready to carry out a simulation pass and verify what + // happened with each of these sagas. + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + // In the end, there should have been four sagas found in the database: + // all of the above except for the one that finished. + assert_eq!(4, last_pass_success.nfound); + // Two of these needed to be recovered (because they had been previously + // running). One succeeded. + assert_eq!(1, last_pass_success.nrecovered); + assert_eq!(1, execution.succeeded.len()); + assert_eq!(saga_recover_ok, execution.succeeded[0].saga_id); + + assert_eq!(1, last_pass_success.nfailed); + assert_eq!(1, execution.failed.len()); + assert_eq!(saga_recover_fail, execution.failed[0].saga_id); + // Two sagas should have been found in the database that corresponded to + // sagas that had been started normally and did not need to be + // recovered. They would have been skipped. + assert_eq!(2, last_pass_success.nskipped); + assert_eq!(2, plan.nskipped()); + // No sagas were removed yet -- we can't do that with only one pass. + assert_eq!(0, last_pass_success.nremoved); + assert_eq!(0, plan.ninferred_done()); + // From what the pass could tell, two sagas might be done: the one that + // actually finished and the one that started after the database + // listing. + let mut maybe_done = plan.sagas_maybe_done().collect::>(); + maybe_done.sort(); + let mut expected_maybe_done = + vec![saga_started_and_finished, saga_started_after_listing]; + expected_maybe_done.sort(); + assert_eq!(maybe_done, expected_maybe_done); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 1); + assert_eq!(report.ntotal_failures, 1); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 0); + + // + // Change nothing and run another pass. + // This pass allows the system to determine that some sagas are now + // done. + // + println!("test: recovery pass after no changes (1)"); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + // There's now five sagas in-progress in the database: the same four as + // above, plus the one that was started after the snapshot. + assert_eq!(5, last_pass_success.nfound); + // One of these needs to be recovered because it failed last time. 
It + // fails again this time. + assert_eq!(0, last_pass_success.nrecovered); + assert_eq!(0, execution.succeeded.len()); + assert_eq!(1, last_pass_success.nfailed); + assert_eq!(1, execution.failed.len()); + assert_eq!(saga_recover_fail, execution.failed[0].saga_id); + // This time, four sagas should have been found in the database that + // correspond to sagas that were started normally and did not need to be + // recovered: the two from last time, plus the one that was recovered, + // plus the one that was started after the previous snapshot. These + // would have been skipped. + assert_eq!(4, last_pass_success.nskipped); + assert_eq!(4, plan.nskipped()); + // This time, the saga that was actually finished should have been + // removed. We could tell this time. + assert_eq!(1, last_pass_success.nremoved); + assert_eq!( + vec![saga_started_and_finished], + plan.sagas_inferred_done().collect::>() + ); + // This time, there are no sagas that might be done. The one we thought + // might have been done last time is now clearly running because it + // appears in this database listing. + assert_eq!(0, plan.sagas_maybe_done().count()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 1); + assert_eq!(report.ntotal_failures, 2); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 1); + + // + // Again, change nothing and run another pass. This should be a steady + // state: if we keep running passes from here, nothing should change. + // + println!("test: recovery pass after no changes (2)"); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + // Same as above. + assert_eq!(5, last_pass_success.nfound); + assert_eq!(0, last_pass_success.nrecovered); + assert_eq!(0, execution.succeeded.len()); + assert_eq!(1, last_pass_success.nfailed); + assert_eq!(1, execution.failed.len()); + assert_eq!(saga_recover_fail, execution.failed[0].saga_id); + assert_eq!(4, last_pass_success.nskipped); + assert_eq!(4, plan.nskipped()); + assert_eq!(0, plan.sagas_maybe_done().count()); + // Here's the only thing that differs from last time. We removed a saga + // before, so this time there's nothing to remove. + // removed. We could tell this time. + assert_eq!(0, last_pass_success.nremoved); + assert_eq!(0, plan.sagas_inferred_done().count()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 1); + assert_eq!(report.ntotal_failures, 3); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 1); + + // + // Once more and make sure nothing changes. + // + println!("test: recovery pass after no changes (3)"); + let previous_rest_state = sim.rest_state.clone(); + let previous_last_pass_success = last_pass_success.clone(); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + assert_eq!(previous_rest_state, sim.rest_state); + assert_eq!(previous_last_pass_success, last_pass_success); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 1); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 1); + + // + // This time, fix that saga whose recovery has been failing. + // + println!("test: recovery pass after removing injected error"); + sim.sim_config_recovery_result(saga_recover_fail, false); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + // Same as above. 
+ assert_eq!(5, last_pass_success.nfound); + assert_eq!(4, last_pass_success.nskipped); + assert_eq!(4, plan.nskipped()); + assert_eq!(0, last_pass_success.nremoved); + assert_eq!(0, plan.sagas_inferred_done().count()); + assert_eq!(0, plan.sagas_maybe_done().count()); + // Here's what's different from before. + assert_eq!(1, last_pass_success.nrecovered); + assert_eq!(1, execution.succeeded.len()); + assert_eq!(saga_recover_fail, execution.succeeded[0].saga_id); + assert_eq!(0, last_pass_success.nfailed); + assert_eq!(0, execution.failed.len()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 2); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 1); + + // + // After the next pass, we should have one more saga that seems to be + // running. + // + println!("test: recovery pass after no changes (4)"); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + // Same as above. + assert_eq!(5, last_pass_success.nfound); + assert_eq!(0, last_pass_success.nremoved); + assert_eq!(0, plan.sagas_inferred_done().count()); + assert_eq!(0, plan.sagas_maybe_done().count()); + assert_eq!(0, last_pass_success.nfailed); + assert_eq!(0, execution.failed.len()); + // Here's what's different from before. + assert_eq!(5, last_pass_success.nskipped); + assert_eq!(5, plan.nskipped()); + assert_eq!(0, last_pass_success.nrecovered); + assert_eq!(0, execution.succeeded.len()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 2); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 1); + + // + // With another pass, nothing should differ. + // + println!("test: recovery pass after no changes (5)"); + let previous_rest_state = sim.rest_state.clone(); + let previous_last_pass_success = last_pass_success.clone(); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + assert_eq!(previous_rest_state, sim.rest_state); + assert_eq!(previous_last_pass_success, last_pass_success); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 2); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 1); + + // + // Now let's complete a couple of different sagas. + // It'll take two passes for the system to be sure they're done. + // + println!("test: recovery pass after completing some sagas"); + sim.sim_normal_saga_done(saga_started_normally_1); + sim.sim_normal_saga_done(saga_started_after_listing); + sim.sim_normal_saga_done(saga_recover_fail); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + assert_eq!(2, last_pass_success.nfound); + assert_eq!(0, last_pass_success.nremoved); + assert_eq!(0, plan.sagas_inferred_done().count()); + assert_eq!(3, plan.sagas_maybe_done().count()); + assert_eq!(0, last_pass_success.nfailed); + assert_eq!(0, execution.failed.len()); + assert_eq!(2, last_pass_success.nskipped); + assert_eq!(2, plan.nskipped()); + assert_eq!(0, last_pass_success.nrecovered); + assert_eq!(0, execution.succeeded.len()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 2); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 1); + + // + // With another pass, we can remove those three that finished. 
+ // + println!("test: recovery pass after no changes (6)"); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + assert_eq!(2, last_pass_success.nfound); + assert_eq!(3, last_pass_success.nremoved); + assert_eq!(3, plan.sagas_inferred_done().count()); + assert_eq!(0, plan.sagas_maybe_done().count()); + assert_eq!(0, last_pass_success.nfailed); + assert_eq!(0, execution.failed.len()); + assert_eq!(2, last_pass_success.nskipped); + assert_eq!(2, plan.nskipped()); + assert_eq!(0, last_pass_success.nrecovered); + assert_eq!(0, execution.succeeded.len()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 2); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 4); + + // + // Finish the last two sagas. + // + println!("test: recovery pass after completing remaining sagas"); + sim.sim_normal_saga_done(saga_started_normally_2); + sim.sim_normal_saga_done(saga_recover_ok); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + assert_eq!(0, last_pass_success.nfound); + assert_eq!(0, last_pass_success.nremoved); + assert_eq!(0, plan.sagas_inferred_done().count()); + assert_eq!(2, plan.sagas_maybe_done().count()); + assert_eq!(0, last_pass_success.nfailed); + assert_eq!(0, execution.failed.len()); + assert_eq!(0, last_pass_success.nskipped); + assert_eq!(0, plan.nskipped()); + assert_eq!(0, last_pass_success.nrecovered); + assert_eq!(0, execution.succeeded.len()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 2); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 4); + + // + // With another pass, remove those last two. + // + println!("test: recovery pass after no changes (7)"); + let (plan, execution, last_pass_success, nstarted) = + sim.sim_recovery_pass(); + assert_eq!(0, last_pass_success.nfound); + assert_eq!(2, last_pass_success.nremoved); + assert_eq!(2, plan.sagas_inferred_done().count()); + assert_eq!(0, plan.sagas_maybe_done().count()); + assert_eq!(0, last_pass_success.nfailed); + assert_eq!(0, execution.failed.len()); + assert_eq!(0, last_pass_success.nskipped); + assert_eq!(0, plan.nskipped()); + assert_eq!(0, last_pass_success.nrecovered); + assert_eq!(0, execution.succeeded.len()); + report.update_after_pass(&plan, execution, nstarted); + assert_eq!(report.ntotal_recovered, 2); + assert_eq!(report.ntotal_failures, 4); + assert_eq!(report.ntotal_started, 4); + assert_eq!(report.ntotal_finished, 6); + + // At this point, the rest state should match our existing rest state. + // This is an extra check to make sure we're not leaking memory related + // to old sagas. 
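        // Concretely, the initial rest state was captured before any sagas ran,
        // so matching it means both the set of started sagas and the
        // "maybe done" set have drained back to empty.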
+ assert_eq!(sim.rest_state, initial_rest_state); + + // + // At this point, we've exercised: + // - recovering a saga that we didn't start + // (basic "recovery" path after a crash, plus re-assignment path) + // - retrying a saga whose recovery failed (multiple times) + // - *not* trying to recover: + // - a newly-started saga + // - a saga that was recovered before + // - not hanging on forever to sagas that have finished + // - the edge case built into our implementation where we learned that a + // saga was started before it appeared in the database + // + logctx.cleanup_successful(); + } +} diff --git a/nexus/saga-recovery/src/recovery.rs b/nexus/saga-recovery/src/recovery.rs new file mode 100644 index 0000000000..0b13e68a49 --- /dev/null +++ b/nexus/saga-recovery/src/recovery.rs @@ -0,0 +1,705 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Guts of the saga recovery bookkeeping + +use super::status::RecoveryFailure; +use super::status::RecoverySuccess; +use chrono::{DateTime, Utc}; +use omicron_common::api::external::Error; +use slog::{debug, error, info, warn}; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use steno::SagaId; +use tokio::sync::mpsc; + +/// Describes state related to saga recovery that needs to be maintained across +/// multiple passes +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct RestState { + /// set of sagas that we believe may be running + /// + /// See the big block comment in the saga recovery background task for more + /// on how this works and why. + sagas_started: BTreeMap, + remove_next: BTreeSet, +} + +/// Describes how we learned that a particular saga might be running +/// +/// This is only intended for debugging. +#[derive(Debug, Clone, Eq, PartialEq)] +#[allow(dead_code)] +struct SagaStartInfo { + time_observed: DateTime, + source: SagaStartSource, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +enum SagaStartSource { + StartChannel, + Recovered, +} + +impl RestState { + /// Returns initial at-rest state related to saga recovery + pub fn new() -> RestState { + RestState { + sagas_started: BTreeMap::new(), + remove_next: BTreeSet::new(), + } + } + + /// Read messages from the channel (signaling sagas that have started + /// running) and update the set of sagas that we believe may be running. + /// + /// See the big block comment in the saga recovery background task for more + /// on how this works and why. + pub fn update_started_sagas( + &mut self, + log: &slog::Logger, + sagas_started_rx: &mut mpsc::UnboundedReceiver, + ) -> usize { + let (new_sagas, disconnected) = read_all_from_channel(sagas_started_rx); + if disconnected { + warn!( + log, + "sagas_started_rx disconnected (is Nexus shutting down?)" + ); + } + + let rv = new_sagas.len(); + let time_observed = Utc::now(); + for saga_id in new_sagas { + info!(log, "observed saga start"; "saga_id" => %saga_id); + assert!(self + .sagas_started + .insert( + saga_id, + SagaStartInfo { + time_observed, + source: SagaStartSource::StartChannel, + } + ) + .is_none()); + } + rv + } + + /// Update the at-rest state based on the results of a recovery pass. 
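    /// Concretely: sagas that the plan inferred to be done are dropped from the
    /// started-sagas set, sagas that were just recovered successfully are added
    /// to it, and the plan's "maybe done" set is saved so that the next pass can
    /// resolve it.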
+ pub fn update_after_pass(&mut self, plan: &Plan, execution: &Execution) { + let time_observed = Utc::now(); + + for saga_id in plan.sagas_inferred_done() { + assert!(self.sagas_started.remove(&saga_id).is_some()); + } + + for saga_id in execution.sagas_recovered_successfully() { + assert!(self + .sagas_started + .insert( + saga_id, + SagaStartInfo { + time_observed, + source: SagaStartSource::Recovered, + } + ) + .is_none()); + } + + self.remove_next = plan.sagas_maybe_done().collect(); + } +} + +/// Read all message that are currently available on the given channel (without +/// blocking or waiting) +/// +/// Returns the list of messages (as a `Vec`) plus a boolean that's true iff the +/// channel is now disconnected. +fn read_all_from_channel( + rx: &mut mpsc::UnboundedReceiver, +) -> (Vec, bool) { + let mut values = Vec::new(); + let mut disconnected = false; + + loop { + match rx.try_recv() { + Ok(value) => { + values.push(value); + } + + Err(mpsc::error::TryRecvError::Empty) => break, + Err(mpsc::error::TryRecvError::Disconnected) => { + disconnected = true; + break; + } + } + } + + (values, disconnected) +} + +/// Describes what should happen during a particular recovery pass +/// +/// This is constructed by the saga recovery background task via +/// [`Plan::new()`]. +/// +/// This structure is also much more detailed than it needs to be to support +/// better observability and testing. +pub struct Plan { + /// sagas that need to be recovered + needs_recovery: BTreeMap, + + /// sagas that were found in the database to be in-progress, but that don't + /// need to be recovered because they are either already running or have + /// actually finished + skipped_running: BTreeSet, + + /// sagas that we infer have finished because they were missing from two + /// consecutive database queries for in-progress sagas with no intervening + /// message indicating that they had been started + inferred_done: BTreeSet, + + /// sagas that may be done, but we can't tell yet. These are sagas where we + /// previously had them running in this process and the database state now + /// says that they're not running, but the database snapshot was potentially + /// from before the time that the saga started, so we cannot tell yet + /// whether the saga finished or just started. We'll be able to tell during + /// the next pass and if it's done at that point then these sagas will move + /// to `inferred_done`. + maybe_done: BTreeSet, +} + +impl Plan { + /// For a given saga recovery pass, determine what to do with each found + /// saga + /// + /// This function accepts: + /// + /// * `rest_state`: the at-rest saga recovery state from the end of the + /// previous pass + /// * `running_sagas_found`: a list of sagas that the database reports + /// to be in progress + /// + /// It determines: + /// + /// * which in-progresss sagas we don't need to do anything about because + /// they're already running in this process (those sagas that are in both + /// `sagas_started` and `running_sagas_found`) + /// * which sagas need to be recovered (those sagas in `running_sagas_found` + /// but not in `sagas_started`) + /// * which sagas can be removed from `sagas_started` because they have + /// finished (those in `previously_maybe_done` and *not* in + /// `running_sagas_found`) + pub fn new( + log: &slog::Logger, + rest_state: &RestState, + mut running_sagas_found: BTreeMap, + ) -> Plan { + let mut builder = PlanBuilder::new(log); + + // First of all, remove finished sagas from our "ignore" set. 
+ // + // `previously_maybe_done` was computed the last time we ran and + // contains sagas that either just started or already finished. We + // couldn't tell. All we knew is that they were running in-memory but + // were not included in our database query for in-progress sagas. At + // this point, though, we've done a second database query for + // in-progress sagas. Any items that aren't in that list either cannot + // still be running, so we can safely remove them from our ignore set. + let previously_maybe_done = &rest_state.remove_next; + for saga_id in previously_maybe_done { + if !running_sagas_found.contains_key(saga_id) { + builder.saga_infer_done(*saga_id); + } + } + + // Figure out which of the candidate sagas can clearly be skipped. + // Correctness here requires that the caller has already updated the set + // of sagas that we're ignoring to include any that may have been + // created up to the beginning of the database query. (They do that by + // doing the database query first and then updating this set.) Since we + // now have the list of sagas that were not-finished in the database, we + // can compare these two sets. + let sagas_started = &rest_state.sagas_started; + for running_saga_id in sagas_started.keys() { + match running_sagas_found.remove(running_saga_id) { + None => { + // If this saga is in `previously_maybe_done`, then we + // processed it above already. We know it's done. + // + // Otherwise, the saga is in the ignore set, but not the + // database list of running sagas. It's possible that the + // saga has simply finished. And if the saga is definitely + // not running any more, then we can remove it from the + // ignore set. This is important to keep that set from + // growing without bound. + // + // But it's also possible that the saga started immediately + // after the database query's snapshot, in which case we + // don't really know if it's still running. + // + // The way to resolve this is to do another database query + // for unfinished sagas. If it's not in that list, the saga + // must have finished. Rather than do that now, we'll just + // keep track of this list and take care of it on the next + // pass. + if !previously_maybe_done.contains(running_saga_id) { + builder.saga_maybe_done(*running_saga_id) + } + } + + Some(_found_saga) => { + // The saga is in the ignore set and the database list of + // running sagas. It may have been created in the lifetime + // of this program or we may have recovered it previously, + // but either way, we don't have to do anything else with + // this one. + builder.saga_recovery_not_needed(*running_saga_id); + } + } + } + + // Whatever's left in `running_sagas_found` at this point was found in + // the database list of running sagas but is not in the ignore set. We + // must recover it. (It's not possible that we already did recover it + // because we would have added it to our ignore set. It's not possible + // that it was newly started because the starter sends a message to add + // this to the ignore set (and waits for it to make it to the channel) + // before writing the database record, and we read everything off that + // channel and added it to the set before calling this function. 
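        // Summarizing the classification: a saga in both the database listing
        // and the in-memory started set is skipped (already running); a saga in
        // the started set but missing from the listing is either inferred done
        // (if it was already "maybe done" on the last pass) or marked "maybe
        // done" for the next pass; and a saga that appears only in the database
        // listing (handled just below) needs to be recovered.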
+ for (saga_id, saga) in running_sagas_found.into_iter() { + builder.saga_recovery_needed(saga_id, saga); + } + + builder.build() + } + + /// Iterate over the sagas that need to be recovered + pub fn sagas_needing_recovery( + &self, + ) -> impl Iterator + '_ { + self.needs_recovery.iter() + } + + /// Iterate over the sagas that were inferred to be done + pub fn sagas_inferred_done(&self) -> impl Iterator + '_ { + self.inferred_done.iter().copied() + } + + /// Iterate over the sagas that should be checked on the next pass to see if + /// they're done + pub fn sagas_maybe_done(&self) -> impl Iterator + '_ { + self.maybe_done.iter().copied() + } + + /// Returns how many in-progress sagas we ignored because they were already + /// running + pub fn nskipped(&self) -> usize { + self.skipped_running.len() + } + + /// Returns how many previously-in-progress sagas we now believe are done + pub fn ninferred_done(&self) -> usize { + self.inferred_done.len() + } +} + +/// Internal helper used to construct `Plan` +struct PlanBuilder<'a> { + log: &'a slog::Logger, + needs_recovery: BTreeMap, + skipped_running: BTreeSet, + inferred_done: BTreeSet, + maybe_done: BTreeSet, +} + +impl<'a> PlanBuilder<'a> { + /// Begin building a `Plan` + fn new(log: &'a slog::Logger) -> PlanBuilder { + PlanBuilder { + log, + needs_recovery: BTreeMap::new(), + skipped_running: BTreeSet::new(), + inferred_done: BTreeSet::new(), + maybe_done: BTreeSet::new(), + } + } + + /// Turn this into a `Plan` + fn build(self) -> Plan { + Plan { + needs_recovery: self.needs_recovery, + skipped_running: self.skipped_running, + inferred_done: self.inferred_done, + maybe_done: self.maybe_done, + } + } + + /// Record that this saga appears to be done, based on it being missing from + /// two different database queries for in-progress sagas with no intervening + /// indication that a saga with this id was started in the meantime + fn saga_infer_done(&mut self, saga_id: SagaId) { + info!( + self.log, + "found saga that appears to be done \ + (missing from two database listings)"; + "saga_id" => %saga_id + ); + assert!(!self.needs_recovery.contains_key(&saga_id)); + assert!(!self.skipped_running.contains(&saga_id)); + assert!(!self.maybe_done.contains(&saga_id)); + assert!(self.inferred_done.insert(saga_id)); + } + + /// Record that no action is needed for this saga in this recovery pass + /// because it appears to already be running + fn saga_recovery_not_needed(&mut self, saga_id: SagaId) { + debug!( + self.log, + "found saga that can be ignored (already running)"; + "saga_id" => %saga_id, + ); + assert!(!self.needs_recovery.contains_key(&saga_id)); + assert!(!self.inferred_done.contains(&saga_id)); + assert!(!self.maybe_done.contains(&saga_id)); + assert!(self.skipped_running.insert(saga_id)); + } + + /// Record that this saga might be done, but we won't be able to tell for + /// sure until we complete the next recovery pass + /// + /// This sounds a little goofy but there's a race in comparing what our + /// in-memory state reports is running vs. what's in the database. Our + /// solution is to only consider sagas done that are missing for two + /// consecutive database queries with no intervening report that a saga with + /// that id has just started. 
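    /// For example: suppose this pass's database listing was taken at time T and
    /// a new saga started at T+1. That saga shows up in the started-sagas set
    /// but not in the listing, so this pass can only mark it "maybe done". If
    /// the next pass's listing includes it, it is clearly still running and gets
    /// skipped; if it is still missing, it is inferred to be done.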
+ fn saga_maybe_done(&mut self, saga_id: SagaId) { + debug!( + self.log, + "found saga that may be done (will be sure on the next pass)"; + "saga_id" => %saga_id + ); + assert!(!self.needs_recovery.contains_key(&saga_id)); + assert!(!self.skipped_running.contains(&saga_id)); + assert!(!self.inferred_done.contains(&saga_id)); + assert!(self.maybe_done.insert(saga_id)); + } + + /// Record that this saga needs to be recovered, based on it being "in + /// progress" according to the database but not yet resumed in this process + fn saga_recovery_needed( + &mut self, + saga_id: SagaId, + saga: nexus_db_model::Saga, + ) { + info!( + self.log, + "found saga that needs to be recovered"; + "saga_id" => %saga_id + ); + assert!(!self.skipped_running.contains(&saga_id)); + assert!(!self.inferred_done.contains(&saga_id)); + assert!(!self.maybe_done.contains(&saga_id)); + assert!(self.needs_recovery.insert(saga_id, saga).is_none()); + } +} + +/// Summarizes the results of executing a single saga recovery pass +/// +/// This is constructed by the saga recovery background task (in +/// `recovery_execute()`) via [`ExecutionBuilder::new()`]. +pub struct Execution { + /// list of sagas that were successfully recovered + pub succeeded: Vec, + /// list of sagas that failed to be recovered + pub failed: Vec, +} + +impl Execution { + /// Iterate over the sagas that were successfully recovered during this pass + pub fn sagas_recovered_successfully( + &self, + ) -> impl Iterator + '_ { + self.succeeded.iter().map(|s| s.saga_id) + } + + pub fn into_results(self) -> (Vec, Vec) { + (self.succeeded, self.failed) + } +} + +pub struct ExecutionBuilder { + in_progress: BTreeMap, + succeeded: Vec, + failed: Vec, +} + +impl ExecutionBuilder { + pub fn new() -> ExecutionBuilder { + ExecutionBuilder { + in_progress: BTreeMap::new(), + succeeded: Vec::new(), + failed: Vec::new(), + } + } + + pub fn build(self) -> Execution { + assert!( + self.in_progress.is_empty(), + "attempted to build execution result while some recoveries are \ + still in progress" + ); + Execution { succeeded: self.succeeded, failed: self.failed } + } + + /// Record that we've started recovering this saga + pub fn saga_recovery_start( + &mut self, + saga_id: SagaId, + saga_logger: slog::Logger, + ) { + info!(&saga_logger, "recovering saga: start"); + assert!(self.in_progress.insert(saga_id, saga_logger).is_none()); + } + + /// Record that we've successfully recovered this saga + pub fn saga_recovery_success(&mut self, saga_id: SagaId) { + let saga_logger = self + .in_progress + .remove(&saga_id) + .expect("recovered saga should have previously started"); + info!(saga_logger, "recovered saga"); + self.succeeded.push(RecoverySuccess { time: Utc::now(), saga_id }); + } + + /// Record that we failed to recover this saga + pub fn saga_recovery_failure(&mut self, saga_id: SagaId, error: &Error) { + let saga_logger = self + .in_progress + .remove(&saga_id) + .expect("recovered saga should have previously started"); + error!(saga_logger, "failed to recover saga"; error); + self.failed.push(RecoveryFailure { + time: Utc::now(), + saga_id, + message: InlineErrorChain::new(error).to_string(), + }); + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::status; + use crate::test::make_fake_saga; + use crate::test::make_saga_ids; + use omicron_test_utils::dev::test_setup_log; + + #[test] + fn test_read_all_from_channel() { + let (tx, mut rx) = mpsc::unbounded_channel(); + + // If we send nothing on the channel, reading from it should return + // immediately, 
having found nothing. + let (numbers, disconnected) = read_all_from_channel::(&mut rx); + assert!(numbers.is_empty()); + assert!(!disconnected); + + // Send some numbers and make sure we get them back. + let expected_numbers = vec![1, 7, 0, 1]; + for n in &expected_numbers { + tx.send(*n).unwrap(); + } + let (numbers, disconnected) = read_all_from_channel(&mut rx); + assert_eq!(expected_numbers, numbers); + assert!(!disconnected); + + // Send some more numbers and make sure we get them back. + let expected_numbers = vec![9, 7, 2, 0, 0, 6]; + for n in &expected_numbers { + tx.send(*n).unwrap(); + } + + let (numbers, disconnected) = read_all_from_channel(&mut rx); + assert_eq!(expected_numbers, numbers); + assert!(!disconnected); + + // Send just a few more, then disconnect the channel. + tx.send(128).unwrap(); + drop(tx); + let (numbers, disconnected) = read_all_from_channel(&mut rx); + assert_eq!(vec![128], numbers); + assert!(disconnected); + + // Also exercise the trivial case where the channel is disconnected + // before we read anything. + let (tx, mut rx) = mpsc::unbounded_channel(); + drop(tx); + let (numbers, disconnected) = read_all_from_channel::(&mut rx); + assert!(numbers.is_empty()); + assert!(disconnected); + } + + /// Creates a `Plan` for testing that covers a variety of cases + pub struct BasicPlanTestCase { + pub plan: Plan, + pub to_recover: Vec, + pub to_skip: Vec, + pub to_mark_done: Vec, + pub to_mark_maybe: Vec, + } + + impl BasicPlanTestCase { + pub fn new(log: &slog::Logger) -> BasicPlanTestCase { + let to_recover = make_saga_ids(4); + let to_skip = make_saga_ids(3); + let to_mark_done = make_saga_ids(2); + let to_mark_maybe = make_saga_ids(1); + + info!(log, "test setup"; + "to_recover" => ?to_recover, + "to_skip" => ?to_skip, + "to_mark_done" => ?to_mark_done, + "to_mark_maybe" => ?to_mark_maybe, + ); + + let mut plan_builder = PlanBuilder::new(log); + for saga_id in &to_recover { + plan_builder + .saga_recovery_needed(*saga_id, make_fake_saga(*saga_id)); + } + for saga_id in &to_skip { + plan_builder.saga_recovery_not_needed(*saga_id); + } + for saga_id in &to_mark_done { + plan_builder.saga_infer_done(*saga_id); + } + for saga_id in &to_mark_maybe { + plan_builder.saga_maybe_done(*saga_id); + } + let plan = plan_builder.build(); + + BasicPlanTestCase { + plan, + to_recover, + to_skip, + to_mark_done, + to_mark_maybe, + } + } + } + + #[test] + fn test_plan_basic() { + let logctx = test_setup_log("saga_recovery_plan_basic"); + + // Trivial initial case + let plan_builder = PlanBuilder::new(&logctx.log); + let plan = plan_builder.build(); + assert_eq!(0, plan.sagas_needing_recovery().count()); + assert_eq!(0, plan.sagas_inferred_done().count()); + assert_eq!(0, plan.sagas_maybe_done().count()); + + // Basic case + let BasicPlanTestCase { + plan, + to_recover, + to_skip: _, + to_mark_done, + to_mark_maybe, + } = BasicPlanTestCase::new(&logctx.log); + + let found_to_recover = + plan.sagas_needing_recovery().collect::>(); + assert_eq!(to_recover.len(), found_to_recover.len()); + for (expected_saga_id, (found_saga_id, found_saga_record)) in + to_recover.into_iter().zip(found_to_recover.into_iter()) + { + assert_eq!(expected_saga_id, *found_saga_id); + assert_eq!(expected_saga_id, found_saga_record.id.0); + assert_eq!("dummy", found_saga_record.name); + } + assert_eq!( + to_mark_done, + plan.sagas_inferred_done().collect::>(), + ); + assert_eq!(to_mark_maybe, plan.sagas_maybe_done().collect::>(),); + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn 
test_execution_basic() { + let logctx = test_setup_log("saga_recovery_execution_basic"); + + // Trivial initial case + let plan_builder = PlanBuilder::new(&logctx.log); + let plan = plan_builder.build(); + let execution_builder = ExecutionBuilder::new(); + let execution = execution_builder.build(); + + assert_eq!(0, execution.sagas_recovered_successfully().count()); + let last_pass = status::LastPassSuccess::new(&plan, &execution); + assert_eq!(0, last_pass.nfound); + assert_eq!(0, last_pass.nrecovered); + assert_eq!(0, last_pass.nfailed); + assert_eq!(0, last_pass.nskipped); + assert_eq!(0, last_pass.nremoved); + + // Test a non-trivial ExecutionDone + let BasicPlanTestCase { + plan, + mut to_recover, + to_skip, + to_mark_done, + to_mark_maybe: _, + } = BasicPlanTestCase::new(&logctx.log); + let mut execution_builder = ExecutionBuilder::new(); + assert!(to_recover.len() >= 3, "someone changed the test case"); + + // Start recovery backwards, just to make sure there's not some implicit + // dependency on the order. (We could shuffle, but then the test would + // be non-deterministic.) + for saga_id in to_recover.iter().rev() { + execution_builder.saga_recovery_start(*saga_id, logctx.log.clone()); + } + + // "Finish" recovery, in yet a different order (for the same reason as + // above). We want to test the success and failure cases. + // + // Act like: + // - recovery for the last saga failed + // - recovery for the other sagas completes successfully + to_recover.rotate_left(2); + for (i, saga_id) in to_recover.iter().enumerate() { + if i == to_recover.len() - 1 { + execution_builder.saga_recovery_failure( + *saga_id, + &Error::internal_error("test error"), + ); + } else { + execution_builder.saga_recovery_success(*saga_id); + } + } + + let execution = execution_builder.build(); + assert_eq!( + to_recover.len() - 1, + execution.sagas_recovered_successfully().count() + ); + let last_pass = status::LastPassSuccess::new(&plan, &execution); + assert_eq!(to_recover.len() + to_skip.len(), last_pass.nfound); + assert_eq!(to_recover.len() - 1, last_pass.nrecovered); + assert_eq!(1, last_pass.nfailed); + assert_eq!(to_skip.len(), last_pass.nskipped); + assert_eq!(to_mark_done.len(), last_pass.nremoved); + + logctx.cleanup_successful(); + } + + // More interesting tests are done at the crate level because they include + // stuff from the `status` module, too. +} diff --git a/nexus/saga-recovery/src/status.rs b/nexus/saga-recovery/src/status.rs new file mode 100644 index 0000000000..d9b0ce242d --- /dev/null +++ b/nexus/saga-recovery/src/status.rs @@ -0,0 +1,175 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Report status for the saga recovery background task + +use super::recovery; +use chrono::{DateTime, Utc}; +use omicron_common::api::external::Error; +use serde::{Deserialize, Serialize}; +use slog_error_chain::InlineErrorChain; +use std::collections::VecDeque; +use steno::SagaId; + +// These values are chosen to be large enough to likely cover the complete +// history of saga recoveries, successful and otherwise. They just need to be +// finite so that this system doesn't use an unbounded amount of memory. 
+/// Maximum number of successful recoveries to keep track of for debugging +const N_SUCCESS_SAGA_HISTORY: usize = 128; +/// Maximum number of recent failures to keep track of for debugging +const N_FAILED_SAGA_HISTORY: usize = 128; + +/// Summarizes the status of saga recovery for debugging +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +pub struct Report { + pub recent_recoveries: DebuggingHistory, + pub recent_failures: DebuggingHistory, + pub last_pass: LastPass, + + pub ntotal_recovered: usize, + pub ntotal_failures: usize, + pub ntotal_started: usize, + pub ntotal_finished: usize, + pub ntotal_sec_errors_missing: usize, + pub ntotal_sec_errors_bad_state: usize, +} + +impl Report { + pub fn new() -> Report { + Report { + recent_recoveries: DebuggingHistory::new(N_SUCCESS_SAGA_HISTORY), + recent_failures: DebuggingHistory::new(N_FAILED_SAGA_HISTORY), + last_pass: LastPass::NeverStarted, + ntotal_recovered: 0, + ntotal_failures: 0, + ntotal_started: 0, + ntotal_finished: 0, + ntotal_sec_errors_missing: 0, + ntotal_sec_errors_bad_state: 0, + } + } + + /// Update the report after a single saga recovery pass where we at least + /// successfully constructed a plan + pub fn update_after_pass( + &mut self, + plan: &recovery::Plan, + execution: recovery::Execution, + nstarted: usize, + ) { + self.last_pass = + LastPass::Success(LastPassSuccess::new(plan, &execution)); + + let (succeeded, failed) = execution.into_results(); + + for success in succeeded { + self.recent_recoveries.append(success); + self.ntotal_recovered += 1; + } + + for failure in failed { + self.recent_failures.append(failure); + self.ntotal_failures += 1; + } + + self.ntotal_started += nstarted; + self.ntotal_finished += plan.ninferred_done(); + } + + /// Update the report after a saga recovery pass where we couldn't even + /// construct a plan (usually because we couldn't load state from the + /// database) + pub fn update_after_failure(&mut self, error: &Error, nstarted: usize) { + self.ntotal_started += nstarted; + self.last_pass = LastPass::Failed { + message: InlineErrorChain::new(error).to_string(), + }; + } +} + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct RecoverySuccess { + pub time: DateTime, + pub saga_id: SagaId, +} + +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct RecoveryFailure { + pub time: DateTime, + pub saga_id: SagaId, + pub message: String, +} + +/// Describes what happened during the last saga recovery pass +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub enum LastPass { + /// There has not been a saga recovery pass yet + NeverStarted, + /// This pass failed to even construct a plan (usually because we couldn't + /// load state from the database) + Failed { message: String }, + /// This pass was at least partially successful + Success(LastPassSuccess), +} + +/// Describes what happened during a saga recovery pass where we at least +/// managed to construct a plan +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +pub struct LastPassSuccess { + pub nfound: usize, + pub nrecovered: usize, + pub nfailed: usize, + pub nskipped: usize, + pub nremoved: usize, +} + +impl LastPassSuccess { + pub fn new( + plan: &recovery::Plan, + execution: &recovery::Execution, + ) -> LastPassSuccess { + let nfound = plan.sagas_needing_recovery().count() + plan.nskipped(); + LastPassSuccess { + nfound, + nrecovered: execution.succeeded.len(), + nfailed: execution.failed.len(), + nskipped: plan.nskipped(), + nremoved: 
plan.ninferred_done(), + } + } +} + +/// Debugging ringbuffer, storing arbitrary objects of type `T` +// There surely exist faster and richer implementations. At least this one's +// pretty simple. +#[derive(Clone, Debug, Eq, PartialEq, Deserialize, Serialize)] +#[serde(transparent)] +pub struct DebuggingHistory { + ring: VecDeque, +} + +impl DebuggingHistory { + pub fn new(size: usize) -> DebuggingHistory { + DebuggingHistory { ring: VecDeque::with_capacity(size) } + } + + pub fn append(&mut self, t: T) { + if self.ring.len() == self.ring.capacity() { + let _ = self.ring.pop_front(); + } + self.ring.push_back(t); + } + + pub fn len(&self) -> usize { + self.ring.len() + } + + pub fn is_empty(&self) -> bool { + self.ring.is_empty() + } + + pub fn iter(&self) -> impl Iterator { + self.ring.iter() + } +} diff --git a/nexus/src/app/background/driver.rs b/nexus/src/app/background/driver.rs index c93729a335..be09ccb21f 100644 --- a/nexus/src/app/background/driver.rs +++ b/nexus/src/app/background/driver.rs @@ -382,6 +382,9 @@ impl TaskExec { // Do it! let details = self.imp.activate(&self.opctx).await; + let details_str = serde_json::to_string(&details).unwrap_or_else(|e| { + format!("<>", e) + }); let elapsed = start_instant.elapsed(); @@ -407,6 +410,7 @@ impl TaskExec { "activation complete"; "elapsed" => ?elapsed, "iteration" => iteration, + "status" => details_str, ); } } diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 5f420773e0..3e79c42978 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -106,6 +106,7 @@ use super::tasks::phantom_disks; use super::tasks::physical_disk_adoption; use super::tasks::region_replacement; use super::tasks::region_replacement_driver; +use super::tasks::saga_recovery; use super::tasks::service_firewall_rules; use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; @@ -115,6 +116,7 @@ use super::Activator; use super::Driver; use crate::app::oximeter::PRODUCER_LEASE_DURATION; use crate::app::saga::StartSaga; +use crate::Nexus; use nexus_config::BackgroundTaskConfig; use nexus_config::DnsTasksConfig; use nexus_db_model::DnsGroup; @@ -153,6 +155,7 @@ pub struct BackgroundTasks { pub task_service_firewall_propagation: Activator, pub task_abandoned_vmm_reaper: Activator, pub task_vpc_route_manager: Activator, + pub task_saga_recovery: Activator, pub task_lookup_region_port: Activator, // Handles to activate background tasks that do not get used by Nexus @@ -231,6 +234,7 @@ impl BackgroundTasksInitializer { task_service_firewall_propagation: Activator::new(), task_abandoned_vmm_reaper: Activator::new(), task_vpc_route_manager: Activator::new(), + task_saga_recovery: Activator::new(), task_lookup_region_port: Activator::new(), task_internal_dns_propagation: Activator::new(), @@ -246,22 +250,20 @@ impl BackgroundTasksInitializer { /// /// This function will wire up the `Activator`s in `background_tasks` to the /// corresponding tasks once they've been started. 
- #[allow(clippy::too_many_arguments)] pub fn start( self, background_tasks: &'_ BackgroundTasks, - opctx: OpContext, - datastore: Arc, - config: BackgroundTaskConfig, - rack_id: Uuid, - nexus_id: Uuid, - resolver: internal_dns::resolver::Resolver, - sagas: Arc, - producer_registry: ProducerRegistry, + args: BackgroundTasksData, ) -> Driver { let mut driver = self.driver; - let opctx = &opctx; - let producer_registry = &producer_registry; + let opctx = &args.opctx; + let datastore = args.datastore; + let config = args.config; + let rack_id = args.rack_id; + let nexus_id = args.nexus_id; + let resolver = args.resolver; + let sagas = args.saga_starter; + let producer_registry = &args.producer_registry; // This "let" construction helps catch mistakes where someone forgets to // wire up an activator to its corresponding background task. @@ -291,6 +293,7 @@ impl BackgroundTasksInitializer { task_service_firewall_propagation, task_abandoned_vmm_reaper, task_vpc_route_manager, + task_saga_recovery, task_lookup_region_port, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires @@ -651,6 +654,25 @@ impl BackgroundTasksInitializer { activator: task_abandoned_vmm_reaper, }); + // Background task: saga recovery + { + let task_impl = Box::new(saga_recovery::SagaRecovery::new( + datastore.clone(), + nexus_db_model::SecId(args.nexus_id), + args.saga_recovery, + )); + + driver.register(TaskDefinition { + name: "saga_recovery", + description: "recovers sagas assigned to this Nexus", + period: config.saga_recovery.period_secs, + task_impl, + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_saga_recovery, + }); + } + driver.register(TaskDefinition { name: "lookup_region_port", description: "fill in missing ports for region records", @@ -667,6 +689,28 @@ impl BackgroundTasksInitializer { } } +pub struct BackgroundTasksData { + /// root `OpContext` used for background tasks + pub opctx: OpContext, + /// handle to `DataStore`, provided directly to many background tasks + pub datastore: Arc, + /// background task configuration + pub config: BackgroundTaskConfig, + /// rack identifier + pub rack_id: Uuid, + /// nexus identifier + pub nexus_id: Uuid, + /// internal DNS DNS resolver, used when tasks need to contact other + /// internal services + pub resolver: internal_dns::resolver::Resolver, + /// handle to saga subsystem for starting sagas + pub saga_starter: Arc, + /// Oximeter producer registry (for metrics) + pub producer_registry: ProducerRegistry, + /// Helpers for saga recovery + pub saga_recovery: saga_recovery::SagaRecoveryHelpers>, +} + /// Starts the three DNS-propagation-related background tasks for either /// internal or external DNS (depending on the arguments) #[allow(clippy::too_many_arguments)] diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 1bd7a323c3..5b24907b0f 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -136,7 +136,9 @@ mod tasks; pub use driver::Activator; pub use driver::Driver; pub use init::BackgroundTasks; +pub use init::BackgroundTasksData; pub use init::BackgroundTasksInitializer; +pub use tasks::saga_recovery::SagaRecoveryHelpers; use futures::future::BoxFuture; use nexus_auth::context::OpContext; diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 5eb44ed7c3..a5204588d8 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ 
b/nexus/src/app/background/tasks/mod.rs @@ -23,6 +23,7 @@ pub mod phantom_disks; pub mod physical_disk_adoption; pub mod region_replacement; pub mod region_replacement_driver; +pub mod saga_recovery; pub mod service_firewall_rules; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; diff --git a/nexus/src/app/background/tasks/saga_recovery.rs b/nexus/src/app/background/tasks/saga_recovery.rs new file mode 100644 index 0000000000..7b0fe1b331 --- /dev/null +++ b/nexus/src/app/background/tasks/saga_recovery.rs @@ -0,0 +1,927 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Saga recovery +//! +//! ## Review of distributed sagas +//! +//! Nexus uses distributed sagas via [`steno`] to manage multi-step operations +//! that have their own unwinding or cleanup steps. While executing sagas, +//! critical state is durably stored in the **saga log** such that after a +//! crash, the saga can be resumed while maintaining certain guarantees: +//! +//! - During normal execution, each **action** will be executed at least once. +//! - If an action B depends on action A, then once B has started, A will not +//! run again. +//! - Once any action has failed, the saga is **unwound**, meaning that the undo +//! actions are executed for any action that has successfully completed. +//! - The saga will not come to rest until one of these three things has +//! happened: +//! 1. All actions complete successfully. This is the normal case of saga +//! completion. +//! 2. Any number of actions complete successfully, at least one action +//! failed, and the undo actions complete successfully for any actions that +//! *did* run. This is the normal case of clean saga failure where +//! intuitively the state of the world is unwound to match whatever it was +//! before the saga ran. +//! 3. Any number of actions complete successfully, at least one action +//! failed, and at least one undo action also failed. This is a nebulous +//! "stuck" state where the world may be partially changed by the saga. +//! +//! There's more to all this (see the Steno docs), but the important thing here +//! is that the persistent state is critical for ensuring these properties +//! across a Nexus crash. The process of resuming in-progress sagas after a +//! crash is called **saga recovery**. Fortunately, Steno handles the details +//! of those constraints. All we have to do is provide Steno with the +//! persistent state of any sagas that it needs to resume. +//! +//! +//! ## Saga recovery and persistent state +//! +//! Everything needed to recover a saga is stored in: +//! +//! 1. a **saga** record, which is mostly immutable +//! 2. the **saga log**, an append-only description of exactly what happened +//! during execution +//! +//! Responsibility for persisting this state is divided across Steno and Nexus: +//! +//! 1. Steno tells its consumer (Nexus) precisely what information needs to be +//! stored and when. It does this by invoking methods on the `SecStore` +//! trait at key points in the saga's execution. Steno does not care how +//! this information is stored or where it is stored. +//! +//! 2. Nexus serializes the given state and stores it into CockroachDB using +//! the `saga` and `saga_node_event` tables. +//! +//! After a crash, Nexus is then responsible for: +//! +//! 1. Identifying what sagas were in progress before the crash, +//! 2. 
Loading all the information about them from the database (namely, the +//! `saga` record and the full saga log in the form of records from the +//! `saga_node_event` table), and +//! 3. Providing all this information to Steno so that it can resume running the +//! saga. +//! +//! +//! ## Saga recovery: not just at startup +//! +//! So far, this is fairly straightforward. What makes it tricky is that there +//! are situations where we want to carry out saga recovery after Nexus has +//! already started and potentially recovered other sagas and started its own +//! sagas. Specifically, when a Nexus instance gets **expunged** (removed +//! permanently), it may have outstanding sagas that need to be re-assigned to +//! another Nexus instance, which then needs to resume them. To do this, we run +//! saga recovery in a Nexus background task so that it runs both periodically +//! and on-demand when activated. (This approach is also useful for other +//! reasons, like retrying recovery for sagas whose recovery failed due to +//! transient errors.) +//! +//! Why does this make things tricky? When Nexus goes to identify what sagas +//! it needs to recover, it lists sagas that are (1) assigned to it (as opposed +//! to a different Nexus) and (2) not yet finished. But that could include +//! sagas in one of three groups: +//! +//! 1. Sagas from a previous Nexus lifetime (i.e., a different Unix process) +//! that have not yet been recovered in this lifetime. These **should** be +//! recovered. +//! 2. Sagas from a previous Nexus lifetime (i.e., a different Unix process) +//! that have already been recovered in this lifetime. These **should not** +//! be recovered. +//! 3. Sagas that were created in this Nexus lifetime. These **should not** be +//! recovered. +//! +//! There are a bunch of ways to attack this problem. We do it by keeping track +//! in-memory of the set of sagas that might be running in the current process +//! and then ignoring those when we do recovery. Even this is easier said than +//! done! It's easy enough to insert new sagas into the set whenever a saga is +//! successfully recovered as well as any time a saga is created for the first +//! time (though that requires a structure that's modifiable from multiple +//! different contexts). But to avoid this set growing unbounded, we should +//! remove entries when a saga finishes running. When exactly can we do that? +//! We have to be careful of the intrinsic race between when the recovery +//! process queries the database to list candidate sagas for recovery (i.e., +//! unfinished sagas assigned to this Nexus) and when it checks the set of sagas +//! that should be ignored. Suppose a saga is running, the recovery process +//! finds it, then the saga finishes, it gets removed from the set, and then the +//! recovery process checks the set. We'll think it wasn't running and start it +//! again -- very bad. We can't remove anything from the set until we know that +//! the saga recovery task _doesn't_ have a stale list of candidate sagas to be +//! recovered. +//! +//! This constraint suggests the solution: the set will be owned and managed +//! entirely by the task that's doing saga recovery. We'll use a channel to +//! trigger inserts when sagas are created elsewhere in Nexus. What about +//! deletes? The recovery process can actually figure out on its own when a +//! saga can be removed: if a saga that was previously in the list of candidates +//! to be recovered and is now no longer in that list, then that means it's +//! 
finished, and that means it can be deleted from the set. Care must be taken +//! to process things in the right order. These details are mostly handled by +//! the separate [`nexus_saga_recovery`] crate. + +use crate::app::background::BackgroundTask; +use crate::app::sagas::NexusSagaType; +use crate::saga_interface::SagaContext; +use crate::Nexus; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db; +use nexus_db_queries::db::DataStore; +use omicron_common::api::external::Error; +use omicron_common::api::external::InternalContext; +use std::collections::BTreeMap; +use std::sync::Arc; +use steno::SagaId; +use steno::SagaStateView; +use tokio::sync::mpsc; + +/// Helpers used for saga recovery +pub struct SagaRecoveryHelpers { + pub recovery_opctx: OpContext, + pub maker: N, + pub sec_client: Arc, + pub registry: Arc>, + pub sagas_started_rx: mpsc::UnboundedReceiver, +} + +/// Background task that recovers sagas assigned to this Nexus +/// +/// Normally, this task only does anything of note once, when Nexus starts up. +/// But it runs periodically and can be activated explicitly for the rare case +/// when a saga has been re-assigned to this Nexus (e.g., because some other +/// Nexus has been expunged) and to handle retries for sagas whose previous +/// recovery failed. +pub struct SagaRecovery { + datastore: Arc, + /// Unique identifier for this Saga Execution Coordinator + /// + /// This always matches the Nexus id. + sec_id: db::SecId, + /// OpContext used for saga recovery + saga_recovery_opctx: OpContext, + + // state required to resume a saga + /// handle to Steno, which actually resumes the saga + sec_client: Arc, + /// generates the SagaContext for the saga + maker: N, + /// registry of actions that we need to provide to Steno + registry: Arc>, + + // state that we use during each recovery pass + /// channel on which we listen for sagas being started elsewhere in Nexus + sagas_started_rx: mpsc::UnboundedReceiver, + /// recovery state persisted between passes + rest_state: nexus_saga_recovery::RestState, + + /// status reporting + status: nexus_saga_recovery::Report, +} + +impl BackgroundTask for SagaRecovery { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + // We don't need the future that's returned by activate_internal(). + // That's only used by the test suite. + let _ = self.activate_internal(opctx).await; + serde_json::to_value(&self.status).unwrap() + } + .boxed() + } +} + +impl SagaRecovery { + pub fn new( + datastore: Arc, + sec_id: db::SecId, + helpers: SagaRecoveryHelpers, + ) -> SagaRecovery { + SagaRecovery { + datastore, + sec_id, + saga_recovery_opctx: helpers.recovery_opctx, + maker: helpers.maker, + sec_client: helpers.sec_client, + registry: helpers.registry, + sagas_started_rx: helpers.sagas_started_rx, + rest_state: nexus_saga_recovery::RestState::new(), + status: nexus_saga_recovery::Report::new(), + } + } + + /// Invoked for each activation of the background task + /// + /// This internal version exists solely to expose some information about + /// what was recovered for testing. + async fn activate_internal( + &mut self, + opctx: &OpContext, + ) -> Option<( + BoxFuture<'static, Result<(), Error>>, + nexus_saga_recovery::LastPassSuccess, + )> { + let log = &opctx.log; + let datastore = &self.datastore; + + // Fetch the list of not-yet-finished sagas that are assigned to + // this Nexus instance. 
+ let result = list_sagas_in_progress( + &self.saga_recovery_opctx, + datastore, + self.sec_id, + ) + .await; + + // Process any newly-created sagas, adding them to our set of sagas + // to ignore during recovery. We never want to try to recover a + // saga that was created within this Nexus's lifetime. + // + // We do this even if the previous step failed in order to avoid + // letting the channel queue build up. In practice, it shouldn't + // really matter. + // + // But given that we're doing this, it's critical that we do it + // *after* having fetched the candidate sagas from the database. + // It's okay if one of these newly-created sagas doesn't show up in + // the candidate list (because it hadn't actually started at the + // point where we fetched the candidate list). The reverse is not + // okay: if we did this step before fetching candidates, and a saga + // was immediately created and showed up in our candidate list, we'd + // erroneously conclude that it needed to be recovered when in fact + // it was already running. + let nstarted = self + .rest_state + .update_started_sagas(log, &mut self.sagas_started_rx); + + match result { + Ok(db_sagas) => { + let plan = nexus_saga_recovery::Plan::new( + log, + &self.rest_state, + db_sagas, + ); + self.recovery_check_done(log, &plan).await; + let (execution, future) = + self.recovery_execute(log, &plan).await; + self.rest_state.update_after_pass(&plan, &execution); + let last_pass_success = + nexus_saga_recovery::LastPassSuccess::new( + &plan, &execution, + ); + self.status.update_after_pass(&plan, execution, nstarted); + Some((future, last_pass_success)) + } + Err(error) => { + self.status.update_after_failure(&error, nstarted); + None + } + } + } + + /// Check that for each saga that we inferred was done, Steno agrees + /// + /// This is not strictly necessary because this should always be true. But + /// if for some reason it's not, that would be a serious issue and we'd want + /// to know that. + async fn recovery_check_done( + &mut self, + log: &slog::Logger, + plan: &nexus_saga_recovery::Plan, + ) { + for saga_id in plan.sagas_inferred_done() { + match self.sec_client.saga_get(saga_id).await { + Err(_) => { + self.status.ntotal_sec_errors_missing += 1; + error!( + log, + "SEC does not know about saga that we thought \ + had finished"; + "saga_id" => %saga_id + ); + } + Ok(saga_state) => match saga_state.state { + SagaStateView::Done { .. } => (), + _ => { + self.status.ntotal_sec_errors_bad_state += 1; + error!( + log, + "we thought saga was done, but SEC reports a \ + different state"; + "saga_id" => %saga_id, + "sec_state" => ?saga_state.state + ); + } + }, + } + } + } + + /// Recovers the sagas described in `plan` + async fn recovery_execute( + &self, + bgtask_log: &slog::Logger, + plan: &nexus_saga_recovery::Plan, + ) -> (nexus_saga_recovery::Execution, BoxFuture<'static, Result<(), Error>>) + { + let mut builder = nexus_saga_recovery::ExecutionBuilder::new(); + let mut completion_futures = Vec::new(); + + // Load and resume all these sagas serially. Too much parallelism here + // could overload the database. It wouldn't buy us much anyway to + // parallelize this since these operations should generally be quick, + // and there shouldn't be too many sagas outstanding, and Nexus has + // already crashed so they've experienced a bit of latency already. 
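(The serial loop that follows is a deliberate choice, as the comment above explains.) Purely to illustrate the trade-off being weighed here, a hypothetical bounded-concurrency variant might look like the sketch below. This is not the code in this patch: `recover_one` is a stand-in for loading a saga's log and resuming it, and the limit of four is arbitrary.

```rust
use futures::stream::{self, StreamExt};

// Stand-in for fetching one saga's log and resuming it via Steno.
async fn recover_one(saga_id: u64) {
    println!("recovering saga {saga_id}");
}

// Recover sagas with at most four recoveries in flight at once, capping the
// load placed on the database while still overlapping some latency.
async fn recover_all(saga_ids: Vec<u64>) {
    stream::iter(saga_ids)
        .for_each_concurrent(4, |saga_id| recover_one(saga_id))
        .await;
}

#[tokio::main]
async fn main() {
    recover_all(vec![1, 2, 3]).await;
}
```

Given how few sagas are typically outstanding after a crash, the strictly serial approach keeps the recovery pass simpler at essentially no cost, which is why the patch does not do anything like the above.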
+ for (saga_id, saga) in plan.sagas_needing_recovery() { + let saga_log = self.maker.make_saga_log(*saga_id, &saga.name); + builder.saga_recovery_start(*saga_id, saga_log.clone()); + match self.recover_one_saga(bgtask_log, &saga_log, saga).await { + Ok(completion_future) => { + builder.saga_recovery_success(*saga_id); + completion_futures.push(completion_future); + } + Err(error) => { + // It's essential that we not bail out early just because we + // hit an error here. We want to recover all the sagas that + // we can. + builder.saga_recovery_failure(*saga_id, &error); + } + } + } + + let future = async { + futures::future::try_join_all(completion_futures).await?; + Ok(()) + } + .boxed(); + (builder.build(), future) + } + + async fn recover_one_saga( + &self, + bgtask_logger: &slog::Logger, + saga_logger: &slog::Logger, + saga: &nexus_db_model::Saga, + ) -> Result>, Error> { + let datastore = &self.datastore; + let saga_id: SagaId = saga.id.into(); + + let log_events = datastore + .saga_fetch_log_batched(&self.saga_recovery_opctx, saga.id) + .await + .with_internal_context(|| format!("recovering saga {saga_id}"))?; + trace!(bgtask_logger, "recovering saga: loaded log"; + "nevents" => log_events.len(), + "saga_id" => %saga_id, + ); + + let saga_context = self.maker.make_saga_context(saga_logger.clone()); + let saga_completion = self + .sec_client + .saga_resume( + saga_id, + saga_context, + saga.saga_dag.clone(), + self.registry.clone(), + log_events, + ) + .await + .map_err(|error| { + // TODO-robustness We want to differentiate between retryable and + // not here + Error::internal_error(&format!( + "failed to resume saga: {:#}", + error + )) + })?; + + trace!(&bgtask_logger, "recovering saga: starting the saga"; + "saga_id" => %saga_id + ); + self.sec_client.saga_start(saga_id).await.map_err(|error| { + Error::internal_error(&format!("failed to start saga: {:#}", error)) + })?; + + Ok(async { + saga_completion.await.kind.map_err(|e| { + Error::internal_error(&format!("Saga failure: {:?}", e)) + })?; + Ok(()) + } + .boxed()) + } +} + +/// List all in-progress sagas assigned to the given SEC +async fn list_sagas_in_progress( + opctx: &OpContext, + datastore: &DataStore, + sec_id: db::SecId, +) -> Result, Error> { + let log = &opctx.log; + debug!(log, "listing candidate sagas for recovery"); + let result = datastore + .saga_list_recovery_candidates_batched(&opctx, sec_id) + .await + .internal_context("listing in-progress sagas for saga recovery") + .map(|list| { + list.into_iter() + .map(|saga| (saga.id.into(), saga)) + .collect::>() + }); + match &result { + Ok(list) => { + info!(log, "listed in-progress sagas"; "count" => list.len()); + } + Err(error) => { + warn!(log, "failed to list in-progress sagas"; error); + } + }; + result +} + +/// Encapsulates the tiny bit of behavior associated with constructing a new +/// saga context +/// +/// This type exists so that the rest of the `SagaRecovery` task can avoid +/// knowing directly about Nexus, which in turn allows us to test it with sagas +/// that we control. +pub trait MakeSagaContext: Send + Sync { + type SagaType: steno::SagaType; + + fn make_saga_context( + &self, + log: slog::Logger, + ) -> Arc<::ExecContextType>; + + fn make_saga_log(&self, id: SagaId, name: &str) -> slog::Logger; +} + +impl MakeSagaContext for Arc { + type SagaType = NexusSagaType; + fn make_saga_context(&self, log: slog::Logger) -> Arc> { + // The extra `Arc` is a little ridiculous. 
The problem is that Steno + // expects (in `sec_client.saga_resume()`) that the user-defined context + // will be wrapped in an `Arc`. But we already use `Arc` + // for our type. Hence we need two Arcs. + Arc::new(Arc::new(SagaContext::new(self.clone(), log))) + } + + fn make_saga_log(&self, id: SagaId, name: &str) -> slog::Logger { + self.log.new(o!( + "saga_name" => name.to_owned(), + "saga_id" => id.to_string(), + )) + } +} + +#[cfg(test)] +mod test { + use super::*; + use nexus_auth::authn; + use nexus_db_queries::context::OpContext; + use nexus_db_queries::db::test_utils::UnpluggableCockroachDbSecStore; + use nexus_test_utils::{ + db::test_setup_database, resource_helpers::create_project, + }; + use nexus_test_utils_macros::nexus_test; + use nexus_types::internal_api::views::LastResult; + use omicron_test_utils::dev::{ + self, + poll::{wait_for_condition, CondCheckError}, + }; + use once_cell::sync::Lazy; + use pretty_assertions::assert_eq; + use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; + use steno::{ + new_action_noop_undo, Action, ActionContext, ActionError, + ActionRegistry, DagBuilder, Node, SagaDag, SagaId, SagaName, + SagaResult, SagaType, SecClient, + }; + use uuid::Uuid; + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + // Returns a Cockroach DB, as well as a "datastore" interface (which is the + // one more frequently used by Nexus). + // + // The caller is responsible for calling "cleanup().await" on the returned + // CockroachInstance - we would normally wrap this in a drop method, but it + // is async. + async fn new_db( + log: &slog::Logger, + ) -> (dev::db::CockroachInstance, Arc) { + let db = test_setup_database(&log).await; + let cfg = nexus_db_queries::db::Config { url: db.pg_config().clone() }; + let pool = Arc::new(db::Pool::new(log, &cfg)); + let db_datastore = Arc::new( + db::DataStore::new(&log, Arc::clone(&pool), None).await.unwrap(), + ); + (db, db_datastore) + } + + // The following is our "saga-under-test". It's a simple two-node operation + // that tracks how many times it has been called, and provides a mechanism + // for detaching storage to simulate power failure (and meaningfully + // recover). + + #[derive(Debug)] + struct TestContext { + log: slog::Logger, + + // Storage, and instructions on whether or not to detach it + // when executing the first saga action. + storage: Arc, + do_unplug: AtomicBool, + + // Tracks of how many times each node has been reached. + n1_count: AtomicU32, + n2_count: AtomicU32, + } + + impl TestContext { + fn new( + log: &slog::Logger, + storage: Arc, + ) -> Self { + TestContext { + log: log.clone(), + storage, + do_unplug: AtomicBool::new(false), + + // Counters of how many times the nodes have been invoked. 
+ n1_count: AtomicU32::new(0), + n2_count: AtomicU32::new(0), + } + } + } + + #[derive(Debug)] + struct TestOp; + impl SagaType for TestOp { + type ExecContextType = TestContext; + } + + impl MakeSagaContext for Arc { + type SagaType = TestOp; + fn make_saga_context(&self, _log: slog::Logger) -> Arc { + self.clone() + } + + fn make_saga_log(&self, id: SagaId, name: &str) -> slog::Logger { + self.log.new(o!( + "saga_name" => name.to_owned(), + "saga_id" => id.to_string(), + )) + } + } + + static ACTION_N1: Lazy>> = + Lazy::new(|| new_action_noop_undo("n1_action", node_one)); + static ACTION_N2: Lazy>> = + Lazy::new(|| new_action_noop_undo("n2_action", node_two)); + + fn registry_create() -> Arc> { + let mut registry = ActionRegistry::new(); + registry.register(Arc::clone(&ACTION_N1)); + registry.register(Arc::clone(&ACTION_N2)); + Arc::new(registry) + } + + fn saga_object_create() -> Arc { + let mut builder = DagBuilder::new(SagaName::new("test-saga")); + builder.append(Node::action("n1_out", "NodeOne", ACTION_N1.as_ref())); + builder.append(Node::action("n2_out", "NodeTwo", ACTION_N2.as_ref())); + let dag = builder.build().unwrap(); + Arc::new(SagaDag::new(dag, serde_json::Value::Null)) + } + + async fn node_one(ctx: ActionContext) -> Result { + let uctx = ctx.user_data(); + uctx.n1_count.fetch_add(1, Ordering::SeqCst); + info!(&uctx.log, "ACTION: node_one"); + // If "do_unplug" is true, we detach storage. + // + // This prevents the SEC from successfully recording that + // this node completed, and acts like a crash. + if uctx.do_unplug.load(Ordering::SeqCst) { + info!(&uctx.log, "Unplugged storage"); + uctx.storage.set_unplug(true); + } + Ok(1) + } + + async fn node_two(ctx: ActionContext) -> Result { + let uctx = ctx.user_data(); + uctx.n2_count.fetch_add(1, Ordering::SeqCst); + info!(&uctx.log, "ACTION: node_two"); + Ok(2) + } + + // Helper function for setting up storage, SEC, and a test context object. + fn create_storage_sec_and_context( + log: &slog::Logger, + db_datastore: Arc, + sec_id: db::SecId, + ) -> (Arc, SecClient, Arc) + { + let storage = Arc::new(UnpluggableCockroachDbSecStore::new( + sec_id, + db_datastore, + log.new(o!("component" => "SecStore")), + )); + let sec_client = + steno::sec(log.new(o!("component" => "SEC")), storage.clone()); + let uctx = Arc::new(TestContext::new(&log, storage.clone())); + (storage, sec_client, uctx) + } + + // Helper function to run a basic saga that we can use to see which nodes + // ran and how many times. + async fn run_test_saga( + uctx: &Arc, + sec_client: &SecClient, + ) -> (SagaId, SagaResult) { + let saga_id = SagaId(Uuid::new_v4()); + let future = sec_client + .saga_create( + saga_id, + uctx.clone(), + saga_object_create(), + registry_create(), + ) + .await + .unwrap(); + sec_client.saga_start(saga_id).await.unwrap(); + (saga_id, future.await) + } + + // Tests the basic case: recovery of a saga that appears (from its log) to + // be still running, and which is not currently running already. In Nexus, + // this corresponds to the basic case where a saga was created in a previous + // Nexus lifetime and the current process knows nothing about it. 
+ #[tokio::test] + async fn test_failure_during_saga_can_be_recovered() { + // Test setup + let logctx = + dev::test_setup_log("test_failure_during_saga_can_be_recovered"); + let log = logctx.log.new(o!()); + let (mut db, db_datastore) = new_db(&log).await; + let sec_id = db::SecId(uuid::Uuid::new_v4()); + let (storage, sec_client, uctx) = + create_storage_sec_and_context(&log, db_datastore.clone(), sec_id); + let sec_log = log.new(o!("component" => "SEC")); + let opctx = OpContext::for_tests( + log, + Arc::clone(&db_datastore) as Arc, + ); + let saga_recovery_opctx = + opctx.child_with_authn(authn::Context::internal_saga_recovery()); + + // In order to recover a partially-created saga, we need a partial log. + // To create one, we'll run the saga normally, but configure it to + // unplug the datastore partway through so that the later log entries + // don't get written. Note that the unplugged datastore completes + // operations successfully so that the saga will appeaer to complete + // successfully. + uctx.do_unplug.store(true, Ordering::SeqCst); + let (_, result) = run_test_saga(&uctx, &sec_client).await; + let output = result.kind.unwrap(); + assert_eq!(output.lookup_node_output::("n1_out").unwrap(), 1); + assert_eq!(output.lookup_node_output::("n2_out").unwrap(), 2); + assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 1); + assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 1); + + // Simulate a crash by terminating the SEC and creating a new one using + // the same storage system. + // + // Update uctx to prevent the storage system from detaching again. + sec_client.shutdown().await; + let sec_client = steno::sec(sec_log, storage.clone()); + uctx.storage.set_unplug(false); + uctx.do_unplug.store(false, Ordering::SeqCst); + + // Use our background task to recover the saga. Observe that it re-runs + // operations and completes. + let sec_client = Arc::new(sec_client); + let (_, sagas_started_rx) = tokio::sync::mpsc::unbounded_channel(); + let mut task = SagaRecovery::new( + db_datastore.clone(), + sec_id, + SagaRecoveryHelpers { + recovery_opctx: saga_recovery_opctx, + maker: uctx.clone(), + sec_client: sec_client.clone(), + registry: registry_create(), + sagas_started_rx, + }, + ); + + let Some((completion_future, last_pass_success)) = + task.activate_internal(&opctx).await + else { + panic!("saga recovery failed"); + }; + + assert_eq!(last_pass_success.nrecovered, 1); + assert_eq!(last_pass_success.nfailed, 0); + assert_eq!(last_pass_success.nskipped, 0); + + // Wait for the recovered saga to complete and make sure it re-ran the + // operations that we expected it to. + completion_future + .await + .expect("recovered saga to complete successfully"); + assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 2); + assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 2); + + // Test cleanup + drop(task); + let sec_client = Arc::try_unwrap(sec_client).unwrap(); + sec_client.shutdown().await; + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // Tests that a saga that has finished (as reflected in the database state) + // does not get recovered. 
+ #[tokio::test] + async fn test_successful_saga_does_not_replay_during_recovery() { + // Test setup + let logctx = dev::test_setup_log( + "test_successful_saga_does_not_replay_during_recovery", + ); + let log = logctx.log.new(o!()); + let (mut db, db_datastore) = new_db(&log).await; + let sec_id = db::SecId(uuid::Uuid::new_v4()); + let (storage, sec_client, uctx) = + create_storage_sec_and_context(&log, db_datastore.clone(), sec_id); + let sec_log = log.new(o!("component" => "SEC")); + let opctx = OpContext::for_tests( + log, + Arc::clone(&db_datastore) as Arc, + ); + let saga_recovery_opctx = + opctx.child_with_authn(authn::Context::internal_saga_recovery()); + + // Create and start a saga, which we expect to complete successfully. + let (_, result) = run_test_saga(&uctx, &sec_client).await; + let output = result.kind.unwrap(); + assert_eq!(output.lookup_node_output::("n1_out").unwrap(), 1); + assert_eq!(output.lookup_node_output::("n2_out").unwrap(), 2); + assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 1); + assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 1); + + // Simulate a crash by terminating the SEC and creating a new one using + // the same storage system. + sec_client.shutdown().await; + let sec_client = steno::sec(sec_log, storage.clone()); + + // Go through recovery. We should not find or recover this saga. + let sec_client = Arc::new(sec_client); + let (_, sagas_started_rx) = tokio::sync::mpsc::unbounded_channel(); + let mut task = SagaRecovery::new( + db_datastore.clone(), + sec_id, + SagaRecoveryHelpers { + recovery_opctx: saga_recovery_opctx, + maker: uctx.clone(), + sec_client: sec_client.clone(), + registry: registry_create(), + sagas_started_rx, + }, + ); + + let Some((_, last_pass_success)) = task.activate_internal(&opctx).await + else { + panic!("saga recovery failed"); + }; + + assert_eq!(last_pass_success.nrecovered, 0); + assert_eq!(last_pass_success.nfailed, 0); + assert_eq!(last_pass_success.nskipped, 0); + + // The nodes should not have been replayed. + assert_eq!(uctx.n1_count.load(Ordering::SeqCst), 1); + assert_eq!(uctx.n2_count.load(Ordering::SeqCst), 1); + + // Test cleanup + drop(task); + let sec_client = Arc::try_unwrap(sec_client).unwrap(); + sec_client.shutdown().await; + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // Verify the plumbing that exists between regular saga creation and saga + // recovery. + #[nexus_test(server = crate::Server)] + async fn test_nexus_recovery(cptestctx: &ControlPlaneTestContext) { + let nexus = &cptestctx.server.server_context().nexus; + + // This is tricky to do. We're trying to make sure the plumbing is + // hooked up so that when a saga is created, the saga recovery task + // learns about it. The purpose of that plumbing is to ensure that we + // don't try to recover a task that's already running. It'd be ideal to + // test that directly, but we can't easily control execution well enough + // to ensure that the background task runs while the saga is still + // running. However, even if we miss it (i.e., the background task only + // runs after the saga completes successfully), there's a side effect we + // can look for: the task should report the completed saga as "maybe + // done". On the next activation, it should report that it's removed a + // saga from its internal state (because it saw that it was done). + + // Wait for the task to run once. 
+ let driver = nexus.background_tasks_driver.get().unwrap(); + let task_name = driver + .tasks() + .find(|task_name| task_name.as_str() == "saga_recovery") + .expect("expected background task called \"saga_recovery\""); + let first_completed = wait_for_condition( + || async { + let status = driver.task_status(task_name); + let LastResult::Completed(completed) = status.last else { + return Err(CondCheckError::<()>::NotYet); + }; + Ok(completed) + }, + &std::time::Duration::from_millis(250), + &std::time::Duration::from_secs(15), + ) + .await + .unwrap(); + + // Make sure that it didn't find anything to do. + let status_raw = first_completed.details; + let status: nexus_saga_recovery::Report = + serde_json::from_value(status_raw).unwrap(); + let nexus_saga_recovery::LastPass::Success(last_pass_success) = + status.last_pass + else { + panic!("wrong last pass variant"); + }; + assert_eq!(last_pass_success.nfound, 0); + assert_eq!(last_pass_success.nrecovered, 0); + assert_eq!(last_pass_success.nfailed, 0); + assert_eq!(last_pass_success.nskipped, 0); + + // Now kick off a saga -- any saga will do. We don't even care if it + // works or not. In practice, it will have finished by the time this + // call completes. + let _ = create_project(&cptestctx.external_client, "test").await; + + // Activate the background task. Wait for one pass. + nexus.background_tasks.task_saga_recovery.activate(); + let _ = wait_for_condition( + || async { + let status = driver.task_status(task_name); + let LastResult::Completed(completed) = status.last else { + panic!("task had completed before; how has it not now?"); + }; + if completed.iteration <= first_completed.iteration { + return Err(CondCheckError::<()>::NotYet); + } + Ok(completed) + }, + &std::time::Duration::from_millis(250), + &std::time::Duration::from_secs(15), + ) + .await + .unwrap(); + + // Activate it again. This should be enough for it to report having + // removed a saga from its state. 
+ nexus.background_tasks.task_saga_recovery.activate(); + let last_pass_success = wait_for_condition( + || async { + let status = driver.task_status(task_name); + let LastResult::Completed(completed) = status.last else { + panic!("task had completed before; how has it not now?"); + }; + + let status: nexus_saga_recovery::Report = + serde_json::from_value(completed.details).unwrap(); + let nexus_saga_recovery::LastPass::Success(last_pass_success) = + status.last_pass + else { + panic!("wrong last pass variant"); + }; + if last_pass_success.nremoved > 0 { + return Ok(last_pass_success); + } + + Err(CondCheckError::<()>::NotYet) + }, + &std::time::Duration::from_millis(250), + &std::time::Duration::from_secs(15), + ) + .await + .unwrap(); + + assert!(last_pass_success.nremoved > 0); + } +} diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index cee62f1107..60ed611bd7 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -6,11 +6,11 @@ use self::external_endpoints::NexusCertResolver; use self::saga::SagaExecutor; +use crate::app::background::BackgroundTasksData; use crate::app::oximeter::LazyTimeseriesClient; use crate::populate::populate_start; use crate::populate::PopulateArgs; use crate::populate::PopulateStatus; -use crate::saga_interface::SagaContext; use crate::DropshotServer; use ::oximeter::types::ProducerRegistry; use anyhow::anyhow; @@ -91,8 +91,10 @@ pub(crate) mod sagas; pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE; +use crate::app::background::SagaRecoveryHelpers; use nexus_db_model::AllSchemaVersions; pub(crate) use nexus_db_model::MAX_NICS_PER_INSTANCE; +use tokio::sync::mpsc; // XXX: Might want to recast as max *floating* IPs, we have at most one // ephemeral (so bounded in saga by design). @@ -132,12 +134,9 @@ pub struct Nexus { /// handle to global authz information authz: Arc, - /// saga execution coordinator + /// saga execution coordinator (SEC) sagas: Arc, - /// Task representing completion of recovered Sagas - recovery_task: std::sync::Mutex>, - /// External dropshot servers external_server: std::sync::Mutex>, @@ -248,9 +247,34 @@ impl Nexus { sec_store, )); + // It's a bit of a red flag to use an unbounded channel. + // + // This particular channel is used to send a Uuid from the saga executor + // to the saga recovery background task each time a saga is started. + // + // The usual argument for keeping a channel bounded is to ensure + // backpressure. But we don't really want that here. These items don't + // represent meaningful work for the saga recovery task, such that if it + // were somehow processing these slowly, we'd want to slow down the saga + // dispatch process. Under normal conditions, we'd expect this queue to + // grow as we dispatch new sagas until the saga recovery task runs, at + // which point the queue will quickly be drained. The only way this + // could really grow without bound is if the saga recovery task gets + // completely wedged and stops receiving these messages altogether. In + // this case, the maximum size this queue could grow over time is the + // number of sagas we can launch in that time. That's not ever likely + // to be a significant amount of memory. + // + // We could put our money where our mouth is: pick a sufficiently large + // bound and panic if we reach it. But "sufficiently large" depends on + // the saga creation rate and the period of the saga recovery background + // task. If someone changed the config, they'd have to remember to + // update this here. 
This doesn't seem worth it. + let (saga_create_tx, saga_recovery_rx) = mpsc::unbounded_channel(); let sagas = Arc::new(SagaExecutor::new( Arc::clone(&sec_client), log.new(o!("component" => "SagaExecutor")), + saga_create_tx, )); let client_state = dpd_client::ClientState { @@ -420,7 +444,6 @@ impl Nexus { db_datastore: Arc::clone(&db_datastore), authz: Arc::clone(&authz), sagas, - recovery_task: std::sync::Mutex::new(None), external_server: std::sync::Mutex::new(None), techport_external_server: std::sync::Mutex::new(None), internal_server: std::sync::Mutex::new(None), @@ -462,26 +485,12 @@ impl Nexus { // TODO-cleanup all the extra Arcs here seems wrong let nexus = Arc::new(nexus); nexus.sagas.set_nexus(nexus.clone()); - let opctx = OpContext::for_background( + let saga_recovery_opctx = OpContext::for_background( log.new(o!("component" => "SagaRecoverer")), Arc::clone(&authz), authn::Context::internal_saga_recovery(), Arc::clone(&db_datastore) as Arc, ); - let saga_logger = nexus.log.new(o!("saga_type" => "recovery")); - let recovery_task = db::recover( - opctx, - my_sec_id, - Arc::new(Arc::new(SagaContext::new( - Arc::clone(&nexus), - saga_logger, - ))), - Arc::clone(&db_datastore), - Arc::clone(&sec_client), - sagas::ACTION_REGISTRY.clone(), - ); - - *nexus.recovery_task.lock().unwrap() = Some(recovery_task); // Wait to start background tasks until after the populate step // finishes. Among other things, the populate step installs role @@ -508,14 +517,24 @@ impl Nexus { let driver = background_tasks_initializer.start( &task_nexus.background_tasks, - background_ctx, - db_datastore, - task_config.pkg.background_tasks, - rack_id, - task_config.deployment.id, - resolver, - task_nexus.sagas.clone(), - task_registry, + BackgroundTasksData { + opctx: background_ctx, + datastore: db_datastore, + config: task_config.pkg.background_tasks, + rack_id, + nexus_id: task_config.deployment.id, + resolver, + saga_starter: task_nexus.sagas.clone(), + producer_registry: task_registry, + + saga_recovery: SagaRecoveryHelpers { + recovery_opctx: saga_recovery_opctx, + maker: task_nexus.clone(), + sec_client: sec_client.clone(), + registry: sagas::ACTION_REGISTRY.clone(), + sagas_started_rx: saga_recovery_rx, + }, + }, ); if let Err(_) = task_nexus.background_tasks_driver.set(driver) { diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index ed4ccf44fd..2b510a0f12 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -70,6 +70,7 @@ use steno::SagaDag; use steno::SagaId; use steno::SagaResult; use steno::SagaResultOk; +use tokio::sync::mpsc; use uuid::Uuid; /// Given a particular kind of Nexus saga (the type parameter `N`) and @@ -111,14 +112,16 @@ pub(crate) struct SagaExecutor { sec_client: Arc, log: slog::Logger, nexus: OnceLock>, + saga_create_tx: mpsc::UnboundedSender, } impl SagaExecutor { pub(crate) fn new( sec_client: Arc, log: slog::Logger, + saga_create_tx: mpsc::UnboundedSender, ) -> SagaExecutor { - SagaExecutor { sec_client, log, nexus: OnceLock::new() } + SagaExecutor { sec_client, log, nexus: OnceLock::new(), saga_create_tx } } // This is a little gross. We want to hang the SagaExecutor off of Nexus, @@ -190,6 +193,19 @@ impl SagaExecutor { saga_logger.clone(), ))); + // Tell the recovery task about this. It's critical that we send this + // message before telling Steno about this saga. It's not critical that + // the task _receive_ this message synchronously. See the comments in + // the recovery task implementation for details. 
+ self.saga_create_tx.send(saga_id).map_err( + |_: mpsc::error::SendError| { + Error::internal_error( + "cannot create saga: recovery task not listening \ + (is Nexus shutting down?)", + ) + }, + )?; + // Tell Steno about it. This does not start it running yet. info!(saga_logger, "preparing saga"); let saga_completion_future = self diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index d278fb5600..17f43b4950 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -50,7 +50,7 @@ pub mod common_storage; mod test_helpers; #[derive(Debug)] -pub(crate) struct NexusSagaType; +pub struct NexusSagaType; impl steno::SagaType for NexusSagaType { type ExecContextType = Arc; } diff --git a/nexus/src/saga_interface.rs b/nexus/src/saga_interface.rs index 5a828ff0ec..aef7044408 100644 --- a/nexus/src/saga_interface.rs +++ b/nexus/src/saga_interface.rs @@ -13,7 +13,7 @@ use std::sync::Arc; // TODO-design Should this be the same thing as ServerContext? It's // very analogous, but maybe there's utility in having separate views for the // HTTP server and sagas. -pub(crate) struct SagaContext { +pub struct SagaContext { nexus: Arc, log: Logger, } diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index dfcaec2157..415727693b 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -119,6 +119,7 @@ instance_watcher.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 +saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 [default_region_allocation_strategy] diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 92d3d6e392..50f9bf646e 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -61,6 +61,7 @@ service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 instance_watcher.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 +saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 [default_region_allocation_strategy] diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 8de9b6cb79..31db278616 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -61,6 +61,7 @@ service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 instance_watcher.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 +saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 [default_region_allocation_strategy] From f6d9e3c389ce228306680c56419e30506e643785 Mon Sep 17 00:00:00 2001 From: "oxide-renovate[bot]" <146848827+oxide-renovate[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 04:18:01 +0000 Subject: [PATCH 25/27] Update taiki-e/install-action digest to 3e71e71 (#6091) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [taiki-e/install-action](https://togithub.com/taiki-e/install-action) | action | digest | [`996330b` -> `3e71e71`](https://togithub.com/taiki-e/install-action/compare/996330b...3e71e71) | --- ### Configuration 📅 **Schedule**: Branch creation - "after 8pm,before 6am" in timezone America/Los_Angeles, Automerge - "after 8pm,before 6am" in timezone America/Los_Angeles. 🚦 **Automerge**: Enabled. 
â™» **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 🔕 **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [Renovate Bot](https://togithub.com/renovatebot/renovate). Co-authored-by: oxide-renovate[bot] <146848827+oxide-renovate[bot]@users.noreply.github.com> --- .github/workflows/hakari.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/hakari.yml b/.github/workflows/hakari.yml index 6e847ce8c4..980acc33dc 100644 --- a/.github/workflows/hakari.yml +++ b/.github/workflows/hakari.yml @@ -24,7 +24,7 @@ jobs: with: toolchain: stable - name: Install cargo-hakari - uses: taiki-e/install-action@996330bfc2ff267dc45a3d59354705b61547df0b # v2 + uses: taiki-e/install-action@3e71e7135de310b70bc22dccb4d275acde8e055a # v2 with: tool: cargo-hakari - name: Check workspace-hack Cargo.toml is up-to-date From 76dc293d78bbc5e517bad089ad81937dd9bde837 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 15 Jul 2024 21:53:06 -0700 Subject: [PATCH 26/27] Support deleting timeseries by name during schema upgrades (#6040) This adds support for listing timeseries by name in a schema upgrade directory, and deleting all records (schema and data) from those timeseries during an offline ClickHouse database upgrade. The main goal here is a relatively simple but effective mechanism to clean up abandoned timeseries, while we figure out how to implement breaking changes more robustly. We alreay have examples of these abandonded timeseries in some existing installations. The existing effort to move timeseries to TOML also presents an opportunity to make one-time breaking changes for individual timeseries. Both of these can be supported with this mechanism. Fixes #5266 --- oximeter/db/src/client/mod.rs | 458 ++++++++++++++++++++++++++++++++++ oximeter/db/src/lib.rs | 20 ++ 2 files changed, 478 insertions(+) diff --git a/oximeter/db/src/client/mod.rs b/oximeter/db/src/client/mod.rs index 2d6212971e..517c52f11e 100644 --- a/oximeter/db/src/client/mod.rs +++ b/oximeter/db/src/client/mod.rs @@ -43,6 +43,7 @@ use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::convert::TryFrom; +use std::io::ErrorKind; use std::net::SocketAddr; use std::num::NonZeroU32; use std::ops::Bound; @@ -490,6 +491,29 @@ impl Client { } } } + + // Check if we have a list of timeseries that should be deleted, and + // remove them from the history books. + let to_delete = Self::read_timeseries_to_delete( + replicated, + next_version, + schema_dir, + ) + .await?; + if to_delete.is_empty() { + debug!( + self.log, + "schema upgrade contained timeseries list file, \ + but it did not contain any timeseries names", + ); + } else { + debug!( + self.log, + "schema upgrade includes list of timeseries to be deleted"; + "n_timeseries" => to_delete.len(), + ); + self.expunge_timeseries_by_name(replicated, &to_delete).await?; + } Ok(()) } @@ -961,6 +985,128 @@ impl Client { } Ok(()) } + + /// Given a list of timeseries by name, delete their schema and any + /// associated data records from all tables. + async fn expunge_timeseries_by_name( + &self, + replicated: bool, + to_delete: &[TimeseriesName], + ) -> Result<(), Error> { + // The version table should not have any matching data, but let's avoid + // it entirely anyway. 
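As a rough illustration (not output from any real deployment), the table listing that backs this step -- `list_oximeter_database_tables`, shown a bit further down in this hunk -- boils down to a query against ClickHouse's `system.tables`, which in the replicated case looks approximately like:

```sql
SELECT name FROM system.tables
    WHERE database = 'oximeter'
    AND name != 'version'
    AND engine = 'ReplicatedMergeTree'
```

In the single-node case the `engine` filter is dropped, so the query returns the ordinary `oximeter` tables directly. In both cases the `version` table is excluded here, matching the comment above.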
+ let tables = self + .list_oximeter_database_tables(ListDetails { + include_version: false, + replicated, + }) + .await?; + + // This size is arbitrary, and just something to avoid enormous requests + // to ClickHouse. It's unlikely that we'll hit this in practice anyway, + // given that we have far fewer than 1000 timeseries today. + const DELETE_BATCH_SIZE: usize = 1000; + let maybe_on_cluster = if replicated { + format!("ON CLUSTER {}", crate::CLUSTER_NAME) + } else { + String::new() + }; + for chunk in to_delete.chunks(DELETE_BATCH_SIZE) { + let names = chunk + .iter() + .map(|name| format!("'{name}'")) + .collect::>() + .join(","); + debug!( + self.log, + "deleting chunk of timeseries"; + "timeseries_names" => &names, + "n_timeseries" => chunk.len(), + ); + for table in tables.iter() { + let sql = format!( + "ALTER TABLE {}.{} \ + {} \ + DELETE WHERE timeseries_name in ({})", + crate::DATABASE_NAME, + table, + maybe_on_cluster, + names, + ); + debug!( + self.log, + "deleting timeseries from next table"; + "table_name" => table, + "n_timeseries" => chunk.len(), + ); + self.execute(sql).await?; + } + } + Ok(()) + } + + async fn read_timeseries_to_delete( + replicated: bool, + next_version: u64, + schema_dir: &Path, + ) -> Result, Error> { + let version_schema_dir = + Self::full_upgrade_path(replicated, next_version, schema_dir); + let filename = + version_schema_dir.join(crate::TIMESERIES_TO_DELETE_FILE); + match fs::read_to_string(&filename).await { + Ok(contents) => contents + .lines() + .map(|line| line.trim().parse().map_err(Error::from)) + .collect(), + Err(e) if e.kind() == ErrorKind::NotFound => Ok(vec![]), + Err(err) => Err(Error::ReadTimeseriesToDeleteFile { err }), + } + } + + /// List tables in the oximeter database. + async fn list_oximeter_database_tables( + &self, + ListDetails { include_version, replicated }: ListDetails, + ) -> Result, Error> { + let mut sql = format!( + "SELECT name FROM system.tables WHERE database = '{}'", + crate::DATABASE_NAME, + ); + if !include_version { + sql.push_str(" AND name != '"); + sql.push_str(crate::VERSION_TABLE_NAME); + sql.push('\''); + } + // On a cluster, we need to operate on the "local" replicated tables. + if replicated { + sql.push_str(" AND engine = 'ReplicatedMergeTree'"); + } + self.execute_with_body(sql).await.map(|(_summary, body)| { + body.lines().map(ToString::to_string).collect() + }) + } +} + +/// Helper argument to `Client::list_oximeter_database_tables`. +#[derive(Clone, Copy, Debug, PartialEq)] +struct ListDetails { + /// If true, include the version table in the output. + include_version: bool, + /// If true, list tables to operate on in a replicated cluster configuration. + /// + /// NOTE: We would like to always operate on the "top-level table", e.g. + /// `oximeter.measurements_u64`, regardless of whether we're working on the + /// cluster or a single-node setup. Otherwise, we need to know which cluster + /// we're working with, and then query either `measurements_u64` or + /// `measurements_u64_local` based on that. + /// + /// However, while that works for the local tables (even replicated ones), + /// it does _not_ work for the `Distributed` tables that we use as those + /// "top-level tables" in a cluster setup. That table engine does not + /// support mutations. Instead, we need to run those operations on the + /// `*_local` tables. + replicated: bool, } // A regex used to validate supported schema updates. 
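To make the deletion itself concrete: for each batch of at most 1,000 timeseries names and for each data table, `expunge_timeseries_by_name` above issues an `ALTER TABLE ... DELETE` mutation. With hypothetical timeseries names, a single such statement in the single-node case would look roughly like:

```sql
ALTER TABLE oximeter.measurements_u64
    DELETE WHERE timeseries_name in ('fake_target:fake_metric','other_target:other_metric')
```

In the replicated case the same statement additionally carries `ON CLUSTER oximeter_cluster` and, per the `ListDetails` note above, targets the `*_local` replicated tables rather than the `Distributed` front tables, since the latter engine does not support mutations.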
@@ -4423,4 +4569,316 @@ mod tests { }) .collect() } + + // Helper to write a test file containing timeseries to delete. + async fn write_timeseries_to_delete_file( + schema_dir: &Path, + replicated: bool, + version: u64, + names: &[TimeseriesName], + ) { + let subdir = schema_dir + .join(if replicated { "replicated" } else { "single-node" }) + .join(version.to_string()); + tokio::fs::create_dir_all(&subdir) + .await + .expect("failed to make subdirectories"); + let filename = subdir.join(crate::TIMESERIES_TO_DELETE_FILE); + let contents = names + .iter() + .map(ToString::to_string) + .collect::>() + .join("\n"); + tokio::fs::write(&filename, contents) + .await + .expect("failed to write test timeseries to delete file"); + } + + #[tokio::test] + async fn test_read_timeseries_to_delete() { + let names: Vec = + vec!["a:b".parse().unwrap(), "c:d".parse().unwrap()]; + let schema_dir = + tempfile::TempDir::new().expect("failed to make temp dir"); + const VERSION: u64 = 7; + write_timeseries_to_delete_file( + schema_dir.path(), + false, + VERSION, + &names, + ) + .await; + let read = Client::read_timeseries_to_delete( + false, + VERSION, + schema_dir.path(), + ) + .await + .expect("Failed to read timeseries to delete"); + assert_eq!(names, read, "Read incorrect list of timeseries to delete",); + } + + #[tokio::test] + async fn test_read_timeseries_to_delete_empty_file_is_ok() { + let schema_dir = + tempfile::TempDir::new().expect("failed to make temp dir"); + const VERSION: u64 = 7; + write_timeseries_to_delete_file(schema_dir.path(), false, VERSION, &[]) + .await; + let read = Client::read_timeseries_to_delete( + false, + VERSION, + schema_dir.path(), + ) + .await + .expect("Failed to read timeseries to delete"); + assert!(read.is_empty(), "Read incorrect list of timeseries to delete",); + } + + #[tokio::test] + async fn test_read_timeseries_to_delete_nonexistent_file_is_ok() { + let path = PathBuf::from("/this/file/better/not/exist"); + let read = Client::read_timeseries_to_delete(false, 1000000, &path) + .await + .expect("Failed to read timeseries to delete"); + assert!(read.is_empty(), "Read incorrect list of timeseries to delete",); + } + + #[tokio::test] + async fn test_expunge_timeseries_by_name_single_node() { + const TEST_NAME: &str = "test_expunge_timeseries_by_name_single_node"; + let logctx = test_setup_log(TEST_NAME); + let log = &logctx.log; + let mut db = ClickHouseInstance::new_single_node(&logctx, 0) + .await + .expect("Failed to start ClickHouse"); + let address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), db.port()); + test_expunge_timeseries_by_name_impl(log, address, false).await; + db.cleanup().await.expect("Failed to cleanup ClickHouse server"); + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_expunge_timeseries_by_name_replicated() { + const TEST_NAME: &str = "test_expunge_timeseries_by_name_replicated"; + let logctx = test_setup_log(TEST_NAME); + let mut cluster = create_cluster(&logctx).await; + let address = cluster.replica_1.address; + test_expunge_timeseries_by_name_impl(&logctx.log, address, true).await; + + // TODO-cleanup: These should be arrays. + // See https://github.com/oxidecomputer/omicron/issues/4460. 
+ cluster
+ .keeper_1
+ .cleanup()
+ .await
+ .expect("Failed to cleanup ClickHouse keeper 1");
+ cluster
+ .keeper_2
+ .cleanup()
+ .await
+ .expect("Failed to cleanup ClickHouse keeper 2");
+ cluster
+ .keeper_3
+ .cleanup()
+ .await
+ .expect("Failed to cleanup ClickHouse keeper 3");
+ cluster
+ .replica_1
+ .cleanup()
+ .await
+ .expect("Failed to cleanup ClickHouse server 1");
+ cluster
+ .replica_2
+ .cleanup()
+ .await
+ .expect("Failed to cleanup ClickHouse server 2");
+ logctx.cleanup_successful();
+ }
+
+ // Implementation of the test for expunging timeseries by name during an
+ // upgrade.
+ async fn test_expunge_timeseries_by_name_impl(
+ log: &Logger,
+ address: SocketAddr,
+ replicated: bool,
+ ) {
+ usdt::register_probes().unwrap();
+ let client = Client::new(address, &log);
+
+ const STARTING_VERSION: u64 = 1;
+ const NEXT_VERSION: u64 = 2;
+ const VERSIONS: [u64; 2] = [STARTING_VERSION, NEXT_VERSION];
+
+ // We need to actually have the oximeter DB here, and the version table,
+ // since `ensure_schema()` writes out versions to the DB as they're
+ // applied.
+ client
+ .initialize_db_with_version(replicated, STARTING_VERSION)
+ .await
+ .expect("failed to initialize test DB");
+
+ // Let's insert a few samples from two different timeseries. The
+ // timeseries share some field types and have others that are distinct
+ // between them, so that we can test that we don't touch tables we
+ // shouldn't, and only delete the parts we should.
+ let samples = generate_expunge_timeseries_samples();
+ client
+ .insert_samples(&samples)
+ .await
+ .expect("failed to insert test samples");
+ let all_timeseries: BTreeSet<TimeseriesName> = samples
+ .iter()
+ .map(|s| s.timeseries_name.parse().unwrap())
+ .collect();
+ assert_eq!(all_timeseries.len(), 2);
+
+ // Count the number of records in all tables, by timeseries.
+ let mut records_by_timeseries: BTreeMap<_, Vec<_>> = BTreeMap::new();
+ let all_tables = client
+ .list_oximeter_database_tables(ListDetails {
+ include_version: false,
+ replicated,
+ })
+ .await
+ .unwrap();
+ for table in all_tables.iter() {
+ let sql = format!(
+ "SELECT * FROM {}.{} FORMAT JSONEachRow",
+ crate::DATABASE_NAME,
+ table,
+ );
+ let body = client.execute_with_body(sql).await.unwrap().1;
+ for line in body.lines() {
+ let json: serde_json::Value =
+ serde_json::from_str(line.trim()).unwrap();
+ let name = json["timeseries_name"].to_string();
+ records_by_timeseries.entry(name).or_default().push(json);
+ }
+ }
+
+ // Even though we don't need SQL, we need the directory for the first
+ // version too.
+ let (schema_dir, _version_dirs) =
+ create_test_upgrade_schema_directory(replicated, &VERSIONS).await;
+
+ // We don't actually need any SQL files in the version we're upgrading
+ // to. The function `ensure_schema` will apply any SQL and any
+ // timeseries to be deleted independently. We're just testing the
+ // latter.
+ let to_delete = vec![all_timeseries.first().unwrap().clone()];
+ write_timeseries_to_delete_file(
+ schema_dir.path(),
+ replicated,
+ NEXT_VERSION,
+ &to_delete,
+ )
+ .await;
+
+ // Let's run the "schema upgrade", which should only delete these
+ // particular timeseries.
+ client
+ .ensure_schema(replicated, NEXT_VERSION, schema_dir.path())
+ .await
+ .unwrap();
+
+ // Look over all tables.
+ //
+ // First, we should have zero mentions of the timeseries we've deleted.
+ for table in all_tables.iter() {
+ let sql = format!(
+ "SELECT COUNT() \
+ FROM {}.{} \
+ WHERE timeseries_name = '{}'
+ FORMAT CSV",
+ crate::DATABASE_NAME,
+ table,
+ &to_delete[0].to_string(),
+ );
+ let count: u64 = client
+ .execute_with_body(sql)
+ .await
+ .expect("failed to get count of timeseries")
+ .1
+ .trim()
+ .parse()
+ .expect("invalid record count from query");
+ assert_eq!(
+ count, 0,
+ "Should not have any rows associated with the deleted timeseries, \
+ but found {count} records in table {table}",
+ );
+ }
+
+ // We should also still have all the records from the timeseries that we
+ // did _not_ expunge.
+ let mut found: BTreeMap<_, Vec<_>> = BTreeMap::new();
+ for table in all_tables.iter() {
+ let sql = format!(
+ "SELECT * FROM {}.{} FORMAT JSONEachRow",
+ crate::DATABASE_NAME,
+ table,
+ );
+ let body = client.execute_with_body(sql).await.unwrap().1;
+ for line in body.lines() {
+ let json: serde_json::Value =
+ serde_json::from_str(line.trim()).unwrap();
+ let name = json["timeseries_name"].to_string();
+ found.entry(name).or_default().push(json);
+ }
+ }
+
+ // Check that all records we found exist in the previous set of found
+ // records, and that they are identical.
+ for (name, records) in found.iter() {
+ let existing_records = records_by_timeseries
+ .get(name)
+ .expect("expected to find previous records for timeseries");
+ assert_eq!(
+ records, existing_records,
+ "Some records from timeseries {name} were removed, \
+ but should not have been"
+ );
+ }
+ }
+
+ fn generate_expunge_timeseries_samples() -> Vec<Sample> {
+ #[derive(oximeter::Target)]
+ struct FirstTarget {
+ first_field: String,
+ second_field: Uuid,
+ }
+
+ #[derive(oximeter::Target)]
+ struct SecondTarget {
+ first_field: String,
+ second_field: bool,
+ }
+
+ #[derive(oximeter::Metric)]
+ struct SharedMetric {
+ datum: u64,
+ }
+
+ let ft = FirstTarget {
+ first_field: String::from("foo"),
+ second_field: Uuid::new_v4(),
+ };
+ let st = SecondTarget {
+ first_field: String::from("foo"),
+ second_field: false,
+ };
+ let mut m = SharedMetric { datum: 0 };
+
+ let mut out = Vec::with_capacity(8);
+ for i in 0..4 {
+ m.datum = i;
+ out.push(Sample::new(&ft, &m).unwrap());
+ }
+ for i in 4..8 {
+ m.datum = i;
+ out.push(Sample::new(&st, &m).unwrap());
+ }
+ out
+ }
 }
diff --git a/oximeter/db/src/lib.rs b/oximeter/db/src/lib.rs
index c3d2014ad1..d5cafc84f2 100644
--- a/oximeter/db/src/lib.rs
+++ b/oximeter/db/src/lib.rs
@@ -142,6 +142,12 @@ pub enum Error {
 #[error("Schema update versions must be sequential without gaps")]
 NonSequentialSchemaVersions,
 
+ #[error("Could not read timeseries_to_delete file")]
+ ReadTimeseriesToDeleteFile {
+ #[source]
+ err: io::Error,
+ },
+
 #[cfg(any(feature = "sql", test))]
 #[error("SQL error")]
 Sql(#[from] sql::Error),
@@ -317,6 +323,20 @@ const DATABASE_TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S%.9f";
 
 // The name of the database storing all metric information.
 const DATABASE_NAME: &str = "oximeter";
 
+// The name of the oximeter cluster, in the case of a replicated database.
+//
+// This must match what is used in the replicated SQL files when creating the
+// database itself, and the XML files describing the cluster.
+const CLUSTER_NAME: &str = "oximeter_cluster";
+
+// The name of the table storing database version information.
+const VERSION_TABLE_NAME: &str = "version";
+
+// During schema upgrades, it is possible to list timeseries that should be
+// deleted, rather than deleting the entire database.
These must be listed one +// per line, in the file inside the schema version directory with this name. +const TIMESERIES_TO_DELETE_FILE: &str = "timeseries-to-delete.txt"; + // The output format used for the result of select queries // // See https://clickhouse.com/docs/en/interfaces/formats/#jsoneachrow for details. From c352f46357d96ff7aec229bb7616806ec3eff196 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 16 Jul 2024 13:14:51 -0400 Subject: [PATCH 27/27] Remove sprockets (#6087) The approach in the existing sprockets code was never fully implemented. We're going to replace it with something else. Just remove the old code. --- Cargo.lock | 104 ++++-------------------------------------- Cargo.toml | 5 -- sp-sim/Cargo.toml | 1 - sp-sim/src/gimlet.rs | 24 ---------- sp-sim/src/lib.rs | 14 ------ sp-sim/src/rot.rs | 46 ------------------- sp-sim/src/sidecar.rs | 24 ---------- 7 files changed, 8 insertions(+), 210 deletions(-) delete mode 100644 sp-sim/src/rot.rs diff --git a/Cargo.lock b/Cargo.lock index 22e647c69c..95420642e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1220,12 +1220,6 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" -[[package]] -name = "corncobs" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9236877021b66ad90f833d8a73a7acb702b985b64c5986682d9f1f1a184f0fb" - [[package]] name = "cpufeatures" version = "0.2.12" @@ -2123,19 +2117,10 @@ dependencies = [ "digest", "elliptic-curve", "rfc6979", - "signature 2.2.0", + "signature", "spki", ] -[[package]] -name = "ed25519" -version = "1.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91cff35c70bba8a626e3185d8cd48cc11b5437e1a5bcd15b9b5fa3c64b6dfee7" -dependencies = [ - "signature 1.6.4", -] - [[package]] name = "ed25519" version = "2.2.3" @@ -2143,7 +2128,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" dependencies = [ "pkcs8", - "signature 2.2.0", + "signature", ] [[package]] @@ -2153,7 +2138,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" dependencies = [ "curve25519-dalek", - "ed25519 2.2.3", + "ed25519", "rand_core 0.6.4", "serde", "sha2", @@ -2715,7 +2700,7 @@ version = "0.1.0" source = "git+https://github.com/oxidecomputer/management-gateway-service?rev=c85a4ca043aaa389df12aac5348d8a3feda28762#c85a4ca043aaa389df12aac5348d8a3feda28762" dependencies = [ "bitflags 2.5.0", - "hubpack 0.1.2", + "hubpack", "serde", "serde_repr", "smoltcp 0.9.1", @@ -2736,14 +2721,14 @@ dependencies = [ "fxhash", "gateway-messages", "hex", - "hubpack 0.1.2", + "hubpack", "hubtools", "lru-cache", "nix 0.27.1", "once_cell", "paste", "serde", - "serde-big-array 0.5.1", + "serde-big-array", "slog", "slog-error-chain", "socket2 0.5.7", @@ -3225,35 +3210,16 @@ dependencies = [ "tokio", ] -[[package]] -name = "hubpack" -version = "0.1.0" -source = "git+https://github.com/cbiffle/hubpack.git?rev=df08cc3a6e1f97381cd0472ae348e310f0119e25#df08cc3a6e1f97381cd0472ae348e310f0119e25" -dependencies = [ - "hubpack_derive 0.1.0", - "serde", -] - [[package]] name = "hubpack" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61a0b84aeae519f65e0ba3aa998327080993426024edbd5cc38dbaf5ec524303" dependencies = [ - 
"hubpack_derive 0.1.1", + "hubpack_derive", "serde", ] -[[package]] -name = "hubpack_derive" -version = "0.1.0" -source = "git+https://github.com/cbiffle/hubpack.git?rev=df08cc3a6e1f97381cd0472ae348e310f0119e25#df08cc3a6e1f97381cd0472ae348e310f0119e25" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "hubpack_derive" version = "0.1.1" @@ -8035,7 +8001,7 @@ dependencies = [ "rand_core 0.6.4", "serde", "sha2", - "signature 2.2.0", + "signature", "spki", "subtle", "zeroize", @@ -8380,17 +8346,6 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" -[[package]] -name = "salty" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77cdd38ed8bfe51e53ee991aae0791b94349d0a05cfdecd283835a8a965d4c37" -dependencies = [ - "ed25519 1.5.3", - "subtle", - "zeroize", -] - [[package]] name = "samael" version = "0.0.15" @@ -8578,15 +8533,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde-big-array" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3323f09a748af288c3dc2474ea6803ee81f118321775bffa3ac8f7e65c5e90e7" -dependencies = [ - "serde", -] - [[package]] name = "serde-big-array" version = "0.5.1" @@ -8850,12 +8796,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "signature" -version = "1.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" - [[package]] name = "signature" version = "2.2.0" @@ -9263,7 +9203,6 @@ dependencies = [ "serde", "slog", "slog-dtrace", - "sprockets-rot", "thiserror", "tokio", "toml 0.8.14", @@ -9294,33 +9233,6 @@ dependencies = [ "der", ] -[[package]] -name = "sprockets-common" -version = "0.1.0" -source = "git+https://github.com/oxidecomputer/sprockets?rev=77df31efa5619d0767ffc837ef7468101608aee9#77df31efa5619d0767ffc837ef7468101608aee9" -dependencies = [ - "derive_more", - "hubpack 0.1.0", - "salty", - "serde", - "serde-big-array 0.4.1", -] - -[[package]] -name = "sprockets-rot" -version = "0.1.0" -source = "git+https://github.com/oxidecomputer/sprockets?rev=77df31efa5619d0767ffc837ef7468101608aee9#77df31efa5619d0767ffc837ef7468101608aee9" -dependencies = [ - "corncobs", - "derive_more", - "hubpack 0.1.0", - "rand 0.8.5", - "salty", - "serde", - "sprockets-common", - "tinyvec", -] - [[package]] name = "sqlformat" version = "0.2.4" diff --git a/Cargo.toml b/Cargo.toml index 96f962708a..6c67dbd6c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -483,9 +483,6 @@ slog-term = "2.9.1" smf = "0.2" socket2 = { version = "0.5", features = ["all"] } sp-sim = { path = "sp-sim" } -sprockets-common = { git = "https://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } -sprockets-host = { git = "https://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } -sprockets-rot = { git = "https://github.com/oxidecomputer/sprockets", rev = "77df31efa5619d0767ffc837ef7468101608aee9" } sqlformat = "0.2.4" sqlparser = { version = "0.45.0", features = [ "visitor" ] } static_assertions = "1.1.0" @@ -683,8 +680,6 @@ opt-level = 3 opt-level = 3 [profile.dev.package.rsa] opt-level = 3 -[profile.dev.package.salty] -opt-level = 3 [profile.dev.package.signature] opt-level = 3 [profile.dev.package.subtle] diff --git a/sp-sim/Cargo.toml b/sp-sim/Cargo.toml index 35cb791f4c..7270db1a67 100644 --- 
a/sp-sim/Cargo.toml +++ b/sp-sim/Cargo.toml @@ -20,7 +20,6 @@ omicron-common.workspace = true serde.workspace = true slog.workspace = true slog-dtrace.workspace = true -sprockets-rot.workspace = true thiserror.workspace = true tokio = { workspace = true, features = [ "full" ] } toml.workspace = true diff --git a/sp-sim/src/gimlet.rs b/sp-sim/src/gimlet.rs index 4e0b264e64..ac465cb217 100644 --- a/sp-sim/src/gimlet.rs +++ b/sp-sim/src/gimlet.rs @@ -6,7 +6,6 @@ use crate::config::GimletConfig; use crate::config::SpComponentConfig; use crate::helpers::rot_slot_id_from_u16; use crate::helpers::rot_slot_id_to_u16; -use crate::rot::RotSprocketExt; use crate::serial_number_padded; use crate::server; use crate::server::SimSpHandler; @@ -38,9 +37,6 @@ use gateway_messages::{version, MessageKind}; use gateway_messages::{ComponentDetails, Message, MgsError, StartupOptions}; use gateway_messages::{DiscoverResponse, IgnitionState, PowerState}; use slog::{debug, error, info, warn, Logger}; -use sprockets_rot::common::msgs::{RotRequestV1, RotResponseV1}; -use sprockets_rot::common::Ed25519PublicKey; -use sprockets_rot::{RotSprocket, RotSprocketError}; use std::cell::Cell; use std::collections::HashMap; use std::iter; @@ -88,8 +84,6 @@ pub enum SimSpHandledRequest { } pub struct Gimlet { - rot: Mutex, - manufacturing_public_key: Ed25519PublicKey, local_addrs: Option<[SocketAddrV6; 2]>, handler: Option>>, serial_console_addrs: HashMap, @@ -116,10 +110,6 @@ impl SimulatedSp for Gimlet { ) } - fn manufacturing_public_key(&self) -> Ed25519PublicKey { - self.manufacturing_public_key - } - fn local_addr(&self, port: SpPort) -> Option { let i = match port { SpPort::One => 0, @@ -135,13 +125,6 @@ impl SimulatedSp for Gimlet { } } - fn rot_request( - &self, - request: RotRequestV1, - ) -> Result { - self.rot.lock().unwrap().handle_deserialized(request) - } - async fn last_sp_update_data(&self) -> Option> { let handler = self.handler.as_ref()?; let handler = handler.lock().await; @@ -201,16 +184,11 @@ impl Gimlet { let (commands, commands_rx) = mpsc::unbounded_channel(); let last_request_handled = Arc::default(); - let (manufacturing_public_key, rot) = - RotSprocket::bootstrap_from_config(&gimlet.common); - // Weird case - if we don't have any bind addresses, we're only being // created to simulate an RoT, so go ahead and return without actually // starting a simulated SP. let Some(bind_addrs) = gimlet.common.bind_addrs else { return Ok(Self { - rot: Mutex::new(rot), - manufacturing_public_key, local_addrs: None, handler: None, serial_console_addrs, @@ -299,8 +277,6 @@ impl Gimlet { .push(task::spawn(async move { inner.run().await.unwrap() })); Ok(Self { - rot: Mutex::new(rot), - manufacturing_public_key, local_addrs: Some(local_addrs), handler: Some(handler), serial_console_addrs, diff --git a/sp-sim/src/lib.rs b/sp-sim/src/lib.rs index ca9231bec0..868d7ded2c 100644 --- a/sp-sim/src/lib.rs +++ b/sp-sim/src/lib.rs @@ -5,7 +5,6 @@ pub mod config; mod gimlet; mod helpers; -mod rot; mod server; mod sidecar; mod update; @@ -21,10 +20,6 @@ pub use server::logger; pub use sidecar::Sidecar; pub use sidecar::SIM_SIDECAR_BOARD; pub use slog::Logger; -pub use sprockets_rot::common::msgs::RotRequestV1; -pub use sprockets_rot::common::msgs::RotResponseV1; -use sprockets_rot::common::Ed25519PublicKey; -pub use sprockets_rot::RotSprocketError; use std::net::SocketAddrV6; use tokio::sync::mpsc; use tokio::sync::watch; @@ -43,9 +38,6 @@ pub trait SimulatedSp { /// Serial number. 
async fn state(&self) -> omicron_gateway::http_entrypoints::SpState; - /// Public key for the manufacturing cert used to sign this SP's RoT certs. - fn manufacturing_public_key(&self) -> Ed25519PublicKey; - /// Listening UDP address of the given port of this simulated SP, if it was /// configured to listen. fn local_addr(&self, port: SpPort) -> Option; @@ -54,12 +46,6 @@ pub trait SimulatedSp { /// messages. async fn set_responsiveness(&self, r: Responsiveness); - /// Send a request to the (simulated) RoT. - fn rot_request( - &self, - request: RotRequestV1, - ) -> Result; - /// Get the last completed update delivered to this simulated SP. /// /// Only returns data after a simulated reset of the SP. diff --git a/sp-sim/src/rot.rs b/sp-sim/src/rot.rs deleted file mode 100644 index 9f0bf61cc0..0000000000 --- a/sp-sim/src/rot.rs +++ /dev/null @@ -1,46 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Simualting a Root of Trust - -use crate::config::SpCommonConfig; -use sprockets_rot::common::certificates::SerialNumber; -use sprockets_rot::common::Ed25519PublicKey; -use sprockets_rot::salty; -use sprockets_rot::RotConfig; -use sprockets_rot::RotSprocket; - -pub(crate) trait RotSprocketExt { - // Returns the (derived-from-config) manufacturing public key and the - // `RotSprocket`. - fn bootstrap_from_config( - config: &SpCommonConfig, - ) -> (Ed25519PublicKey, Self); -} - -impl RotSprocketExt for RotSprocket { - fn bootstrap_from_config( - config: &SpCommonConfig, - ) -> (Ed25519PublicKey, Self) { - let mut serial_number = [0; 16]; - serial_number - .get_mut(0..config.serial_number.len()) - .expect("simulated serial number too long") - .copy_from_slice(config.serial_number.as_bytes()); - - let manufacturing_keypair = - salty::Keypair::from(&config.manufacturing_root_cert_seed); - let device_id_keypair = - salty::Keypair::from(&config.device_id_cert_seed); - let serial_number = SerialNumber(serial_number); - let config = RotConfig::bootstrap_for_testing( - &manufacturing_keypair, - device_id_keypair, - serial_number, - ); - let manufacturing_public_key = - Ed25519PublicKey(manufacturing_keypair.public.to_bytes()); - (manufacturing_public_key, Self::new(config)) - } -} diff --git a/sp-sim/src/sidecar.rs b/sp-sim/src/sidecar.rs index 696989f791..a6bc49e609 100644 --- a/sp-sim/src/sidecar.rs +++ b/sp-sim/src/sidecar.rs @@ -8,7 +8,6 @@ use crate::config::SimulatedSpsConfig; use crate::config::SpComponentConfig; use crate::helpers::rot_slot_id_from_u16; use crate::helpers::rot_slot_id_to_u16; -use crate::rot::RotSprocketExt; use crate::serial_number_padded; use crate::server; use crate::server::SimSpHandler; @@ -49,16 +48,10 @@ use slog::debug; use slog::info; use slog::warn; use slog::Logger; -use sprockets_rot::common::msgs::RotRequestV1; -use sprockets_rot::common::msgs::RotResponseV1; -use sprockets_rot::common::Ed25519PublicKey; -use sprockets_rot::RotSprocket; -use sprockets_rot::RotSprocketError; use std::iter; use std::net::SocketAddrV6; use std::pin::Pin; use std::sync::Arc; -use std::sync::Mutex; use tokio::select; use tokio::sync::mpsc; use tokio::sync::oneshot; @@ -70,8 +63,6 @@ use tokio::task::JoinHandle; pub const SIM_SIDECAR_BOARD: &str = "SimSidecarSp"; pub struct Sidecar { - rot: Mutex, - manufacturing_public_key: Ed25519PublicKey, local_addrs: Option<[SocketAddrV6; 2]>, handler: Option>>, commands: 
mpsc::UnboundedSender, @@ -96,10 +87,6 @@ impl SimulatedSp for Sidecar { ) } - fn manufacturing_public_key(&self) -> Ed25519PublicKey { - self.manufacturing_public_key - } - fn local_addr(&self, port: SpPort) -> Option { let i = match port { SpPort::One => 0, @@ -117,13 +104,6 @@ impl SimulatedSp for Sidecar { rx.await.unwrap(); } - fn rot_request( - &self, - request: RotRequestV1, - ) -> Result { - self.rot.lock().unwrap().handle_deserialized(request) - } - async fn last_sp_update_data(&self) -> Option> { let handler = self.handler.as_ref()?; let handler = handler.lock().await; @@ -224,11 +204,7 @@ impl Sidecar { (None, None, None, None) }; - let (manufacturing_public_key, rot) = - RotSprocket::bootstrap_from_config(&sidecar.common); Ok(Self { - rot: Mutex::new(rot), - manufacturing_public_key, local_addrs, handler, commands,