diff --git a/Cargo.lock b/Cargo.lock index f2fc13b9f6..6ddb2a90ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -256,12 +256,14 @@ checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" [[package]] name = "async-bb8-diesel" -version = "0.1.0" -source = "git+https://github.com/oxidecomputer/async-bb8-diesel?rev=ed7ab5ef0513ba303d33efd41d3e9e381169d59b#ed7ab5ef0513ba303d33efd41d3e9e381169d59b" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc03a2806f66f36513d65e0a7f34200382230250cadcf8a8397cfbe3f26b795" dependencies = [ "async-trait", "bb8", "diesel", + "futures", "thiserror", "tokio", ] @@ -703,7 +705,7 @@ dependencies = [ name = "bootstrap-agent-api" version = "0.1.0" dependencies = [ - "dropshot", + "dropshot 0.10.2-dev", "nexus-client", "omicron-common", "omicron-uuid-kinds", @@ -973,7 +975,7 @@ version = "0.1.0" dependencies = [ "anyhow", "clap", - "dropshot", + "dropshot 0.10.2-dev", "futures", "libc", "omicron-rpaths", @@ -1117,7 +1119,7 @@ checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" name = "clickhouse-admin-api" version = "0.1.0" dependencies = [ - "dropshot", + "dropshot 0.10.2-dev", "omicron-common", "omicron-uuid-kinds", "omicron-workspace-hack", @@ -1175,7 +1177,7 @@ name = "cockroach-admin-api" version = "0.1.0" dependencies = [ "cockroach-admin-types", - "dropshot", + "dropshot 0.10.2-dev", "omicron-common", "omicron-uuid-kinds", "omicron-workspace-hack", @@ -1402,7 +1404,7 @@ name = "crdb-seed" version = "0.1.0" dependencies = [ "anyhow", - "dropshot", + "dropshot 0.10.2-dev", "omicron-test-utils", "omicron-workspace-hack", "slog", @@ -1554,7 +1556,7 @@ dependencies = [ "anyhow", "atty", "crucible-workspace-hack", - "dropshot", + "dropshot 0.10.2-dev", "nix 0.28.0", "rusqlite", "rustls-pemfile 1.0.4", @@ -2119,7 +2121,7 @@ dependencies = [ "clap", "dns-server-api", "dns-service-client", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "hickory-client", "hickory-proto", @@ -2152,7 +2154,7 @@ name = "dns-server-api" version = "0.1.0" dependencies = [ "chrono", - "dropshot", + "dropshot 0.10.2-dev", "omicron-workspace-hack", "schemars", "serde", @@ -2225,6 +2227,52 @@ dependencies = [ "uuid", ] +[[package]] +name = "dropshot" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a391eeedf8a75a188eb670327c704b7ab10eb2bb890e2ec0880dd21d609fb6e8" +dependencies = [ + "async-stream", + "async-trait", + "base64 0.22.1", + "bytes", + "camino", + "chrono", + "debug-ignore", + "dropshot_endpoint 0.10.1", + "form_urlencoded", + "futures", + "hostname 0.4.0", + "http 0.2.12", + "hyper 0.14.30", + "indexmap 2.4.0", + "multer", + "openapiv3", + "paste", + "percent-encoding", + "rustls 0.22.4", + "rustls-pemfile 2.1.3", + "schemars", + "scopeguard", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sha1", + "slog", + "slog-async", + "slog-bunyan", + "slog-json", + "slog-term", + "tokio", + "tokio-rustls 0.25.0", + "toml 0.8.19", + "uuid", + "version_check", + "waitgroup", +] + [[package]] name = "dropshot" version = "0.10.2-dev" @@ -2237,7 +2285,7 @@ dependencies = [ "camino", "chrono", "debug-ignore", - "dropshot_endpoint", + "dropshot_endpoint 0.10.2-dev", "form_urlencoded", "futures", "hostname 0.4.0", @@ -2271,6 +2319,19 @@ dependencies = [ "waitgroup", ] +[[package]] +name = "dropshot_endpoint" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9058c9c7e4a6b378cd12e71dc155bb15d0d4f8e1e6039ce2cf0a7c0c81043e33" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "serde_tokenstream", + "syn 2.0.74", +] + [[package]] name = "dropshot_endpoint" version = "0.10.2-dev" @@ -2890,7 +2951,7 @@ dependencies = [ name = "gateway-api" version = "0.1.0" dependencies = [ - "dropshot", + "dropshot 0.10.2-dev", "gateway-types", "omicron-common", "omicron-uuid-kinds", @@ -2994,7 +3055,7 @@ name = "gateway-test-utils" version = "0.1.0" dependencies = [ "camino", - "dropshot", + "dropshot 0.10.2-dev", "gateway-messages", "gateway-types", "omicron-gateway", @@ -3631,7 +3692,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.4.10", + "socket2 0.5.7", "tokio", "tower-service", "tracing", @@ -3825,7 +3886,7 @@ dependencies = [ [[package]] name = "illumos-sys-hdrs" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" [[package]] name = "illumos-utils" @@ -3997,7 +4058,7 @@ name = "installinator-api" version = "0.1.0" dependencies = [ "anyhow", - "dropshot", + "dropshot 0.10.2-dev", "hyper 0.14.30", "installinator-common", "omicron-common", @@ -4065,7 +4126,7 @@ dependencies = [ "chrono", "dns-server", "dns-service-client", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "futures", "hickory-resolver", @@ -4092,7 +4153,7 @@ version = "0.1.0" dependencies = [ "anyhow", "clap", - "dropshot", + "dropshot 0.10.2-dev", "hickory-resolver", "internal-dns", "omicron-common", @@ -4261,7 +4322,7 @@ dependencies = [ [[package]] name = "kstat-macro" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "quote", "syn 2.0.74", @@ -4735,6 +4796,7 @@ dependencies = [ "gateway-messages", "gateway-test-utils", "libc", + "omicron-gateway", "omicron-workspace-hack", "signal-hook-tokio", "tokio", @@ -4926,7 +4988,7 @@ dependencies = [ "base64 0.22.1", "chrono", "cookie 0.18.1", - "dropshot", + "dropshot 0.10.2-dev", "futures", "headers", "http 0.2.12", @@ -4983,7 +5045,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "libc", "omicron-common", @@ -5066,7 +5128,6 @@ dependencies = [ "assert_matches", "async-bb8-diesel", "async-trait", - "bb8", "camino", "camino-tempfile", "chrono", @@ -5074,7 +5135,7 @@ dependencies = [ "db-macros", "diesel", "diesel-dtrace", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "futures", "gateway-client", @@ -5111,6 +5172,7 @@ dependencies = [ "pq-sys", "predicates", "pretty_assertions", + "qorb", "rand", "rcgen", "ref-cast", @@ -5132,6 +5194,7 @@ dependencies = [ "term", "thiserror", "tokio", + "url", "usdt", "uuid", ] @@ -5153,7 +5216,7 @@ dependencies = [ name = "nexus-internal-api" version = "0.1.0" dependencies = [ - "dropshot", + "dropshot 0.10.2-dev", "nexus-types", "omicron-common", "omicron-uuid-kinds", @@ -5260,6 +5323,7 @@ dependencies = [ "httptest", "internal-dns", "ipnet", + "newtype-uuid", "nexus-config", "nexus-db-model", "nexus-db-queries", @@ -5408,7 +5472,7 @@ dependencies = [ "crucible-agent-client", "dns-server", 
"dns-service-client", - "dropshot", + "dropshot 0.10.2-dev", "futures", "gateway-messages", "gateway-test-utils", @@ -5466,7 +5530,7 @@ dependencies = [ "derive-where", "derive_more", "dns-service-client", - "dropshot", + "dropshot 0.10.2-dev", "futures", "gateway-client", "http 0.2.12", @@ -5792,7 +5856,7 @@ dependencies = [ "clap", "clickhouse-admin-api", "clickhouse-admin-types", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "http 0.2.12", "illumos-utils", @@ -5829,7 +5893,7 @@ dependencies = [ "cockroach-admin-api", "cockroach-admin-types", "csv", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "http 0.2.12", "illumos-utils", @@ -5871,7 +5935,7 @@ dependencies = [ "camino", "camino-tempfile", "chrono", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "futures", "hex", @@ -5939,7 +6003,7 @@ version = "0.1.0" dependencies = [ "anyhow", "clap", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "futures", "libc", @@ -5977,8 +6041,9 @@ dependencies = [ "anyhow", "base64 0.22.1", "camino", + "chrono", "clap", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "futures", "gateway-api", @@ -5995,6 +6060,9 @@ dependencies = [ "omicron-test-utils", "omicron-workspace-hack", "once_cell", + "oximeter", + "oximeter-instruments", + "oximeter-producer", "schemars", "serde", "serde_json", @@ -6039,7 +6107,7 @@ dependencies = [ "dns-server", "dns-service-client", "dpd-client", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "fatfs", "futures", @@ -6162,7 +6230,7 @@ dependencies = [ "crucible-agent-client", "csv", "diesel", - "dropshot", + "dropshot 0.10.2-dev", "dyn-clone", "expectorate", "futures", @@ -6325,7 +6393,7 @@ dependencies = [ "dns-server", "dns-service-client", "dpd-client", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "flate2", "flume", @@ -6410,7 +6478,7 @@ dependencies = [ "atomicwrites", "camino", "camino-tempfile", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "filetime", "gethostname", @@ -6511,7 +6579,7 @@ dependencies = [ "log", "managed", "memchr", - "mio 0.8.11", + "mio 1.0.2", "nix 0.28.0", "nom", "num-bigint-dig", @@ -6546,7 +6614,6 @@ dependencies = [ "similar", "slog", "smallvec 1.13.2", - "socket2 0.5.7", "spin 0.9.8", "string_cache", "subtle", @@ -6647,7 +6714,7 @@ dependencies = [ "clickhouse-admin-api", "cockroach-admin-api", "dns-server-api", - "dropshot", + "dropshot 0.10.2-dev", "fs-err", "gateway-api", "indent_write", @@ -6733,7 +6800,7 @@ dependencies = [ [[package]] name = "opte" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "cfg-if", "dyn-clone", @@ -6750,7 +6817,7 @@ dependencies = [ [[package]] name = "opte-api" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "illumos-sys-hdrs", "ipnetwork", @@ -6762,7 +6829,7 @@ dependencies = [ [[package]] name = "opte-ioctl" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = 
"git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "libc", "libnet 0.1.0 (git+https://github.com/oxidecomputer/netadm-sys)", @@ -6836,7 +6903,7 @@ dependencies = [ [[package]] name = "oxide-vpc" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d#3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" +source = "git+https://github.com/oxidecomputer/opte?rev=76878de67229ea113d70503c441eab47ac5dc653#76878de67229ea113d70503c441eab47ac5dc653" dependencies = [ "cfg-if", "illumos-sys-hdrs", @@ -6871,7 +6938,7 @@ name = "oximeter-api" version = "0.1.0" dependencies = [ "chrono", - "dropshot", + "dropshot 0.10.2-dev", "omicron-common", "omicron-workspace-hack", "schemars", @@ -6902,7 +6969,7 @@ dependencies = [ "camino", "chrono", "clap", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "futures", "hyper 0.14.30", @@ -6950,7 +7017,7 @@ dependencies = [ "clap", "clickward", "crossterm 0.28.1", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "futures", "highway", @@ -6992,7 +7059,7 @@ version = "0.1.0" dependencies = [ "cfg-if", "chrono", - "dropshot", + "dropshot 0.10.2-dev", "futures", "http 0.2.12", "hyper 0.14.30", @@ -7028,7 +7095,7 @@ dependencies = [ "anyhow", "chrono", "clap", - "dropshot", + "dropshot 0.10.2-dev", "internal-dns", "nexus-client", "omicron-common", @@ -8023,7 +8090,7 @@ dependencies = [ "atty", "base64 0.21.7", "clap", - "dropshot", + "dropshot 0.10.2-dev", "futures", "hyper 0.14.30", "progenitor", @@ -8110,6 +8177,29 @@ dependencies = [ "psl-types", ] +[[package]] +name = "qorb" +version = "0.0.1" +source = "git+https://github.com/oxidecomputer/qorb?branch=master#163a77838a3cfe8f7741d32e443f76d995b89df3" +dependencies = [ + "anyhow", + "async-trait", + "debug-ignore", + "derive-where", + "dropshot 0.10.1", + "futures", + "hickory-resolver", + "rand", + "schemars", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-stream", + "tokio-tungstenite 0.23.1", + "tracing", +] + [[package]] name = "quick-error" version = "1.2.3" @@ -8283,7 +8373,7 @@ dependencies = [ "camino-tempfile", "clap", "dns-service-client", - "dropshot", + "dropshot 0.10.2-dev", "expectorate", "humantime", "indexmap 2.4.0", @@ -9539,7 +9629,7 @@ name = "sled-agent-api" version = "0.1.0" dependencies = [ "camino", - "dropshot", + "dropshot 0.10.2-dev", "nexus-sled-agent-shared", "omicron-common", "omicron-uuid-kinds", @@ -9909,7 +9999,7 @@ dependencies = [ "anyhow", "async-trait", "clap", - "dropshot", + "dropshot 0.10.2-dev", "futures", "gateway-messages", "gateway-types", @@ -10633,28 +10723,27 @@ dependencies = [ [[package]] name = "tokio" -version = "1.38.1" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", "libc", - "mio 0.8.11", - "num_cpus", + "mio 1.0.2", "parking_lot 0.12.2", "pin-project-lite", "signal-hook-registry", "socket2 0.5.7", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ 
"proc-macro2", "quote", @@ -10727,6 +10816,7 @@ dependencies = [ "futures-core", "pin-project-lite", "tokio", + "tokio-util", ] [[package]] @@ -10753,6 +10843,18 @@ dependencies = [ "tungstenite 0.21.0", ] +[[package]] +name = "tokio-tungstenite" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6989540ced10490aaf14e6bad2e3d33728a2813310a0c71d1574304c49631cd" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.23.0", +] + [[package]] name = "tokio-util" version = "0.7.11" @@ -11104,6 +11206,24 @@ dependencies = [ "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e2ce1e47ed2994fd43b04c8f618008d4cabdd5ee34027cf14f9d918edd9c8" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand", + "sha1", + "thiserror", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -11305,7 +11425,7 @@ dependencies = [ "clap", "debug-ignore", "display-error-chain", - "dropshot", + "dropshot 0.10.2-dev", "futures", "hex", "hubtools", @@ -11772,7 +11892,7 @@ version = "0.1.0" dependencies = [ "anyhow", "dpd-client", - "dropshot", + "dropshot 0.10.2-dev", "gateway-client", "maplit", "omicron-common", @@ -11828,7 +11948,7 @@ dependencies = [ "debug-ignore", "display-error-chain", "dpd-client", - "dropshot", + "dropshot 0.10.2-dev", "either", "expectorate", "flate2", @@ -11895,7 +12015,7 @@ name = "wicketd-api" version = "0.1.0" dependencies = [ "bootstrap-agent-client", - "dropshot", + "dropshot 0.10.2-dev", "gateway-client", "omicron-common", "omicron-passwords", @@ -12351,7 +12471,7 @@ dependencies = [ "anyhow", "camino", "clap", - "dropshot", + "dropshot 0.10.2-dev", "illumos-utils", "omicron-common", "omicron-sled-agent", diff --git a/Cargo.toml b/Cargo.toml index fad02eb790..096ccf2382 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -283,13 +283,12 @@ api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" assert_cmd = "2.0.16" -async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } +async-bb8-diesel = "0.2" async-trait = "0.1.81" atomicwrites = "0.4.3" authz-macros = { path = "nexus/authz-macros" } backoff = { version = "0.4.0", features = [ "tokio" ] } base64 = "0.22.1" -bb8 = "0.8.5" bcs = "0.1.6" bincode = "1.3.3" bootstore = { path = "bootstore" } @@ -452,7 +451,7 @@ omicron-test-utils = { path = "test-utils" } omicron-workspace-hack = "0.1.0" omicron-zone-package = "0.11.0" oxide-client = { path = "clients/oxide-client" } -oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d", features = [ "api", "std" ] } +oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "76878de67229ea113d70503c441eab47ac5dc653", features = [ "api", "std" ] } oxlog = { path = "dev-tools/oxlog" } oxnet = { git = "https://github.com/oxidecomputer/oxnet" } once_cell = "1.19.0" @@ -462,7 +461,7 @@ openapiv3 = "2.0.0" # must match samael's crate! 
openssl = "0.10" openssl-sys = "0.9" -opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "3dc9a3dd8d3c623f0cf2c659c7119ce0c026a96d" } +opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "76878de67229ea113d70503c441eab47ac5dc653" } oso = "0.27" owo-colors = "4.0.0" oximeter = { path = "oximeter/oximeter" } @@ -499,6 +498,7 @@ bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "24a74d0c propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "24a74d0c76b6a63961ecef76acb1516b6e66c5c9" } propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "24a74d0c76b6a63961ecef76acb1516b6e66c5c9" } proptest = "1.5.0" +qorb = { git = "https://github.com/oxidecomputer/qorb", branch = "master" } quote = "1.0" rand = "0.8.5" rand_core = "0.6.4" @@ -582,7 +582,7 @@ textwrap = "0.16.1" test-strategy = "0.3.1" thiserror = "1.0" tofino = { git = "https://github.com/oxidecomputer/tofino", branch = "main" } -tokio = "1.38.1" +tokio = "1.39.3" tokio-postgres = { version = "0.7", features = [ "with-chrono-0_4", "with-uuid-1" ] } tokio-stream = "0.1.15" tokio-tungstenite = "0.20" diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 62366c45e1..97f6373e29 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -131,14 +131,11 @@ impl From } } -impl From - for types::SledInstanceState +impl From + for types::SledVmmState { - fn from( - s: omicron_common::api::internal::nexus::SledInstanceState, - ) -> Self { + fn from(s: omicron_common::api::internal::nexus::SledVmmState) -> Self { Self { - propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), migration_in: s.migration_in.map(Into::into), migration_out: s.migration_out.map(Into::into), @@ -213,6 +210,7 @@ impl From fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { use omicron_common::api::internal::nexus::ProducerKind; match kind { + ProducerKind::ManagementGateway => Self::ManagementGateway, ProducerKind::SledAgent => Self::SledAgent, ProducerKind::Service => Self::Service, ProducerKind::Instance => Self::Instance, @@ -390,6 +388,9 @@ impl From fn from(kind: types::ProducerKind) -> Self { use omicron_common::api::internal::nexus::ProducerKind; match kind { + types::ProducerKind::ManagementGateway => { + ProducerKind::ManagementGateway + } types::ProducerKind::SledAgent => ProducerKind::SledAgent, types::ProducerKind::Instance => ProducerKind::Instance, types::ProducerKind::Service => ProducerKind::Service, diff --git a/clients/oximeter-client/src/lib.rs b/clients/oximeter-client/src/lib.rs index 74fc6968e8..c23e5177a0 100644 --- a/clients/oximeter-client/src/lib.rs +++ b/clients/oximeter-client/src/lib.rs @@ -26,6 +26,7 @@ impl From fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { use omicron_common::api::internal::nexus; match kind { + nexus::ProducerKind::ManagementGateway => Self::ManagementGateway, nexus::ProducerKind::Service => Self::Service, nexus::ProducerKind::SledAgent => Self::SledAgent, nexus::ProducerKind::Instance => Self::Instance, diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index ed96d762dc..b14cf5a96f 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -5,6 +5,7 @@ //! 
Interface for making API requests to a Sled Agent use async_trait::async_trait; +use omicron_uuid_kinds::PropolisUuid; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; @@ -161,12 +162,11 @@ impl From } } -impl From - for omicron_common::api::internal::nexus::SledInstanceState +impl From + for omicron_common::api::internal::nexus::SledVmmState { - fn from(s: types::SledInstanceState) -> Self { + fn from(s: types::SledVmmState) -> Self { Self { - propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), migration_in: s.migration_in.map(Into::into), migration_out: s.migration_out.map(Into::into), @@ -448,11 +448,11 @@ impl From /// are bonus endpoints, not generated in the real client. #[async_trait] pub trait TestInterfaces { - async fn instance_single_step(&self, id: Uuid); - async fn instance_finish_transition(&self, id: Uuid); - async fn instance_simulate_migration_source( + async fn vmm_single_step(&self, id: PropolisUuid); + async fn vmm_finish_transition(&self, id: PropolisUuid); + async fn vmm_simulate_migration_source( &self, - id: Uuid, + id: PropolisUuid, params: SimulateMigrationSource, ); async fn disk_finish_transition(&self, id: Uuid); @@ -460,10 +460,10 @@ pub trait TestInterfaces { #[async_trait] impl TestInterfaces for Client { - async fn instance_single_step(&self, id: Uuid) { + async fn vmm_single_step(&self, id: PropolisUuid) { let baseurl = self.baseurl(); let client = self.client(); - let url = format!("{}/instances/{}/poke-single-step", baseurl, id); + let url = format!("{}/vmms/{}/poke-single-step", baseurl, id); client .post(url) .send() @@ -471,10 +471,10 @@ impl TestInterfaces for Client { .expect("instance_single_step() failed unexpectedly"); } - async fn instance_finish_transition(&self, id: Uuid) { + async fn vmm_finish_transition(&self, id: PropolisUuid) { let baseurl = self.baseurl(); let client = self.client(); - let url = format!("{}/instances/{}/poke", baseurl, id); + let url = format!("{}/vmms/{}/poke", baseurl, id); client .post(url) .send() @@ -493,14 +493,14 @@ impl TestInterfaces for Client { .expect("disk_finish_transition() failed unexpectedly"); } - async fn instance_simulate_migration_source( + async fn vmm_simulate_migration_source( &self, - id: Uuid, + id: PropolisUuid, params: SimulateMigrationSource, ) { let baseurl = self.baseurl(); let client = self.client(); - let url = format!("{baseurl}/instances/{id}/sim-migration-source"); + let url = format!("{baseurl}/vmms/{id}/sim-migration-source"); client .post(url) .json(¶ms) diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 07e4fd0b83..58cace3032 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -2371,7 +2371,7 @@ pub struct SwitchPortLinkConfig { /// The link-layer discovery protocol service configuration id for this /// link. - pub lldp_link_config_id: Uuid, + pub lldp_link_config_id: Option, /// The name of this link. pub link_name: String, diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 7f4eb358a4..996b000ddc 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -113,13 +113,9 @@ pub struct VmmRuntimeState { pub time_updated: DateTime, } -/// A wrapper type containing a sled's total knowledge of the state of a -/// specific VMM and the instance it incarnates. +/// A wrapper type containing a sled's total knowledge of the state of a VMM. 
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] -pub struct SledInstanceState { - /// The ID of the VMM whose state is being reported. - pub propolis_id: PropolisUuid, - +pub struct SledVmmState { /// The most recent state of the sled's VMM process. pub vmm_state: VmmRuntimeState, @@ -142,7 +138,7 @@ impl Migrations<'_> { } } -impl SledInstanceState { +impl SledVmmState { pub fn migrations(&self) -> Migrations<'_> { Migrations { migration_in: self.migration_in.as_ref(), @@ -223,6 +219,8 @@ pub enum ProducerKind { Service, /// The producer is a Propolis VMM managing a guest instance. Instance, + /// The producer is a management gateway service. + ManagementGateway, } /// Information announced by a metric server, used so that clients can contact it and collect diff --git a/common/src/lib.rs b/common/src/lib.rs index 6da32c56ba..b9d6dd3172 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -118,3 +118,27 @@ where async fn never_bail() -> Result { Ok(false) } + +/// A wrapper struct that does nothing other than elide the inner value from +/// [`std::fmt::Debug`] output. +/// +/// We define this within Omicron instead of using one of the many available +/// crates that do the same thing because it's trivial to do so, and we want the +/// flexibility to add traits to this type without needing to wait on upstream +/// to add an optional dependency. +/// +/// If you want to use this for secrets, consider that it might not do +/// everything you expect (it does not zeroize memory on drop, nor get in the +/// way of you removing the inner value from this wrapper struct). +#[derive( + Clone, Copy, serde::Deserialize, serde::Serialize, schemars::JsonSchema, +)] +#[repr(transparent)] +#[serde(transparent)] +pub struct NoDebug(pub T); + +impl std::fmt::Debug for NoDebug { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "..") + } +} diff --git a/dev-tools/mgs-dev/Cargo.toml b/dev-tools/mgs-dev/Cargo.toml index d5f61f4b96..70382c0469 100644 --- a/dev-tools/mgs-dev/Cargo.toml +++ b/dev-tools/mgs-dev/Cargo.toml @@ -14,6 +14,7 @@ futures.workspace = true gateway-messages.workspace = true gateway-test-utils.workspace = true libc.workspace = true +omicron-gateway.workspace = true omicron-workspace-hack.workspace = true signal-hook-tokio.workspace = true tokio.workspace = true diff --git a/dev-tools/mgs-dev/src/main.rs b/dev-tools/mgs-dev/src/main.rs index 85b1313d68..77947999d9 100644 --- a/dev-tools/mgs-dev/src/main.rs +++ b/dev-tools/mgs-dev/src/main.rs @@ -8,6 +8,7 @@ use clap::{Args, Parser, Subcommand}; use futures::StreamExt; use libc::SIGINT; use signal_hook_tokio::Signals; +use std::net::SocketAddr; #[tokio::main] async fn main() -> anyhow::Result<()> { @@ -36,7 +37,12 @@ enum MgsDevCmd { } #[derive(Clone, Debug, Args)] -struct MgsRunArgs {} +struct MgsRunArgs { + /// Override the address of the Nexus instance to use when registering the + /// Oximeter producer. + #[clap(long)] + nexus_address: Option, +} impl MgsRunArgs { async fn exec(&self) -> Result<(), anyhow::Error> { @@ -46,9 +52,23 @@ impl MgsRunArgs { let mut signal_stream = signals.fuse(); println!("mgs-dev: setting up MGS ... 
"); - let gwtestctx = gateway_test_utils::setup::test_setup( + let (mut mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); + if let Some(addr) = self.nexus_address { + mgs_config.metrics = + Some(gateway_test_utils::setup::MetricsConfig { + disabled: false, + dev_nexus_address: Some(addr), + dev_bind_loopback: true, + }); + } + + let gwtestctx = gateway_test_utils::setup::test_setup_with_config( "mgs-dev", gateway_messages::SpPort::One, + mgs_config, + &sp_sim_config, + None, ) .await; println!("mgs-dev: MGS is running."); diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 9ce4c66a80..48f5137698 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -246,7 +246,8 @@ impl DbUrlOptions { eprintln!("note: using database URL {}", &db_url); let db_config = db::Config { url: db_url.clone() }; - let pool = Arc::new(db::Pool::new(&log.clone(), &db_config)); + let pool = + Arc::new(db::Pool::new_single_host(&log.clone(), &db_config)); // Being a dev tool, we want to try this operation even if the schema // doesn't match what we expect. So we use `DataStore::new_unchecked()` @@ -4224,7 +4225,7 @@ async fn cmd_db_inventory( } async fn cmd_db_inventory_baseboard_ids( - conn: &DataStoreConnection<'_>, + conn: &DataStoreConnection, limit: NonZeroU32, ) -> Result<(), anyhow::Error> { #[derive(Tabled)] @@ -4261,7 +4262,7 @@ async fn cmd_db_inventory_baseboard_ids( } async fn cmd_db_inventory_cabooses( - conn: &DataStoreConnection<'_>, + conn: &DataStoreConnection, limit: NonZeroU32, ) -> Result<(), anyhow::Error> { #[derive(Tabled)] @@ -4302,7 +4303,7 @@ async fn cmd_db_inventory_cabooses( } async fn cmd_db_inventory_physical_disks( - conn: &DataStoreConnection<'_>, + conn: &DataStoreConnection, limit: NonZeroU32, args: InvPhysicalDisksArgs, ) -> Result<(), anyhow::Error> { @@ -4359,7 +4360,7 @@ async fn cmd_db_inventory_physical_disks( } async fn cmd_db_inventory_rot_pages( - conn: &DataStoreConnection<'_>, + conn: &DataStoreConnection, limit: NonZeroU32, ) -> Result<(), anyhow::Error> { #[derive(Tabled)] @@ -4394,7 +4395,7 @@ async fn cmd_db_inventory_rot_pages( } async fn cmd_db_inventory_collections_list( - conn: &DataStoreConnection<'_>, + conn: &DataStoreConnection, limit: NonZeroU32, ) -> Result<(), anyhow::Error> { #[derive(Tabled)] diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index ede2743404..5af75fac8f 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -36,6 +36,7 @@ use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementGarbageCollectStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; +use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::DemoSagaUuid; @@ -51,6 +52,7 @@ use std::collections::BTreeMap; use std::collections::BTreeSet; use std::str::FromStr; use tabled::Tabled; +use tokio::sync::OnceCell; use uuid::Uuid; /// Arguments to the "omdb nexus" subcommand @@ -244,6 +246,10 @@ struct BlueprintTargetSetArgs { blueprint_id: Uuid, /// whether this blueprint should be enabled enabled: BlueprintTargetSetEnabled, + /// if specified, diff against the current target and wait for confirmation + 
/// before proceeding + #[clap(long)] + diff: bool, } #[derive(Debug, Clone, Copy, ValueEnum)] @@ -1510,6 +1516,102 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } } } + } else if name == "region_snapshot_replacement_step" { + match serde_json::from_value::( + details.clone(), + ) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + + Ok(status) => { + println!( + " total step records created ok: {}", + status.step_records_created_ok.len(), + ); + for line in &status.step_records_created_ok { + println!(" > {line}"); + } + + println!( + " total step garbage collect saga invoked ok: {}", + status.step_garbage_collect_invoked_ok.len(), + ); + for line in &status.step_garbage_collect_invoked_ok { + println!(" > {line}"); + } + + println!( + " total step saga invoked ok: {}", + status.step_invoked_ok.len(), + ); + for line in &status.step_invoked_ok { + println!(" > {line}"); + } + + println!(" errors: {}", status.errors.len()); + for line in &status.errors { + println!(" > {line}"); + } + } + } + } else if name == "blueprint_loader" { + #[derive(Deserialize)] + struct BlueprintLoaderStatus { + target_id: Uuid, + time_created: DateTime, + status: String, + enabled: bool, + } + + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(status) => { + println!(" target blueprint: {}", status.target_id); + println!( + " execution: {}", + if status.enabled { "enabled" } else { "disabled" } + ); + println!( + " created at: {}", + humantime::format_rfc3339_millis( + status.time_created.into() + ) + ); + println!(" status: {}", status.status); + } + } + } else if name == "blueprint_executor" { + #[derive(Deserialize)] + struct BlueprintExecutorStatus { + target_id: Uuid, + enabled: bool, + errors: Option>, + } + + match serde_json::from_value::(details.clone()) + { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(status) => { + println!(" target blueprint: {}", status.target_id); + println!( + " execution: {}", + if status.enabled { "enabled" } else { "disabled" } + ); + let errors = status.errors.as_deref().unwrap_or(&[]); + println!(" errors: {}", errors.len()); + for (i, e) in errors.iter().enumerate() { + println!(" error {}: {}", i, e); + } + } + } } else { println!( "warning: unknown background task: {:?} \ @@ -1722,6 +1824,38 @@ async fn cmd_nexus_blueprints_target_set( args: &BlueprintTargetSetArgs, _destruction_token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { + // Helper to only fetch the current target once. We may need it immediately + // if `args.diff` is true, or later if `args.enabled` is "inherit" (or + // both). + let current_target = OnceCell::new(); + let get_current_target = || async { + current_target + .get_or_try_init(|| client.blueprint_target_view()) + .await + .context("failed to fetch current target blueprint") + }; + + if args.diff { + let current_target = get_current_target().await?; + let blueprint1 = client + .blueprint_view(¤t_target.target_id) + .await + .context("failed to fetch target blueprint")? 
+ .into_inner(); + let blueprint2 = + client.blueprint_view(&args.blueprint_id).await.with_context( + || format!("fetching blueprint {}", args.blueprint_id), + )?; + let diff = blueprint2.diff_since_blueprint(&blueprint1); + println!("{}", diff.display()); + println!( + "\nDo you want to make {} the target blueprint?", + args.blueprint_id + ); + let mut prompt = ConfirmationPrompt::new(); + prompt.read_and_validate("y/N", "y")?; + } + let enabled = match args.enabled { BlueprintTargetSetEnabled::Enabled => true, BlueprintTargetSetEnabled::Disabled => false, @@ -1734,12 +1868,11 @@ async fn cmd_nexus_blueprints_target_set( // operator. (In the case of the current target blueprint being changed // entirely, that will result in a failure to set the current target // below, because its parent will no longer be the current target.) - BlueprintTargetSetEnabled::Inherit => client - .blueprint_target_view() - .await - .map(|current| current.into_inner().enabled) - .context("failed to fetch current target blueprint")?, + BlueprintTargetSetEnabled::Inherit => { + get_current_target().await?.enabled + } }; + client .blueprint_target_set(&nexus_client::types::BlueprintTargetSet { target_id: args.blueprint_id, @@ -1966,7 +2099,7 @@ impl ConfirmationPrompt { { Ok(input) } else { - bail!("expungement aborted") + bail!("operation aborted") } } diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index ec407cd123..2774a5d734 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -135,6 +135,11 @@ task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process +task: "region_snapshot_replacement_step" + detect what volumes were affected by a region snapshot replacement, and run + the step saga for them + + task: "saga_recovery" recovers sagas assigned to this Nexus @@ -292,6 +297,11 @@ task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process +task: "region_snapshot_replacement_step" + detect what volumes were affected by a region snapshot replacement, and run + the step saga for them + + task: "saga_recovery" recovers sagas assigned to this Nexus @@ -436,6 +446,11 @@ task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process +task: "region_snapshot_replacement_step" + detect what volumes were affected by a region snapshot replacement, and run + the step saga for them + + task: "saga_recovery" recovers sagas assigned to this Nexus diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 2a9c9c8051..757b4e8888 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -141,9 +141,16 @@ SP DETAILS: type "Sled" slot 0 COMPONENTS - NAME DESCRIPTION DEVICE PRESENCE SERIAL - sp3-host-cpu FAKE host cpu sp3-host-cpu Present None - dev-0 FAKE temperature sensor fake-tmp-sensor Failed None + NAME DESCRIPTION DEVICE PRESENCE SERIAL + sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + dev-0 FAKE temperature sensor fake-tmp-sensor Failed None + dev-1 FAKE temperature sensor tmp117 Present None + dev-2 FAKE Southeast temperature sensor tmp117 Present None + dev-6 FAKE U.2 Sharkfin A VPD at24csw080 Present None + dev-7 FAKE U.2 Sharkfin A hot swap controller max5970 Present None + dev-8 FAKE U.2 A NVMe Basic Management Command nvme_bmc Present None + dev-39 FAKE T6 temperature sensor tmp451 Present None + dev-53 FAKE Fan controller max31790 Present None CABOOSES: none 
found @@ -167,8 +174,16 @@ SP DETAILS: type "Sled" slot 1 COMPONENTS - NAME DESCRIPTION DEVICE PRESENCE SERIAL - sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + NAME DESCRIPTION DEVICE PRESENCE SERIAL + sp3-host-cpu FAKE host cpu sp3-host-cpu Present None + dev-0 FAKE temperature sensor tmp117 Present None + dev-1 FAKE temperature sensor tmp117 Present None + dev-2 FAKE Southeast temperature sensor tmp117 Present None + dev-6 FAKE U.2 Sharkfin A VPD at24csw080 Present None + dev-7 FAKE U.2 Sharkfin A hot swap controller max5970 Present None + dev-8 FAKE U.2 A NVMe Basic Management Command nvme_bmc Present None + dev-39 FAKE T6 temperature sensor tmp451 Present None + dev-53 FAKE Fan controller max31790 Present None CABOOSES: none found @@ -336,6 +351,11 @@ task: "region_snapshot_replacement_start" detect if region snapshots need replacement and begin the process +task: "region_snapshot_replacement_step" + detect what volumes were affected by a region snapshot replacement, and run + the step saga for them + + task: "saga_recovery" recovers sagas assigned to this Nexus @@ -591,6 +611,16 @@ task: "region_snapshot_replacement_start" total start saga invoked ok: 0 errors: 0 +task: "region_snapshot_replacement_step" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total step records created ok: 0 + total step garbage collect saga invoked ok: 0 + total step saga invoked ok: 0 + errors: 0 + task: "saga_recovery" configured period: every 10m currently executing: no @@ -999,6 +1029,16 @@ task: "region_snapshot_replacement_start" total start saga invoked ok: 0 errors: 0 +task: "region_snapshot_replacement_step" + configured period: every s + currently executing: no + last completed activation: , triggered by a periodic timer firing + started at (s ago) and ran for ms + total step records created ok: 0 + total step garbage collect saga invoked ok: 0 + total step saga invoked ok: 0 + errors: 0 + task: "saga_recovery" configured period: every 10m currently executing: no diff --git a/gateway-test-utils/configs/config.test.toml b/gateway-test-utils/configs/config.test.toml index 79975f4611..4e3e9c6e6e 100644 --- a/gateway-test-utils/configs/config.test.toml +++ b/gateway-test-utils/configs/config.test.toml @@ -88,6 +88,15 @@ addr = "[::1]:0" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Allow the Oximeter metrics endpoint to bind on the loopback IP. This is +# useful in local testing and development, when the gateway service is not +# given a "real" underlay network IP. +dev_bind_loopback = true + # # NOTE: for the test suite, if mode = "file", the file path MUST be the sentinel # string "UNUSED". 
The actual path will be generated by the test suite for each diff --git a/gateway-test-utils/configs/sp_sim_config.test.toml b/gateway-test-utils/configs/sp_sim_config.test.toml index cc08eec30b..4f370a167c 100644 --- a/gateway-test-utils/configs/sp_sim_config.test.toml +++ b/gateway-test-utils/configs/sp_sim_config.test.toml @@ -20,6 +20,9 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor 1" capabilities = 0x2 presence = "Present" +sensors = [ + {name = "Southwest", kind = "Temperature", last_data.value = 41.7890625, last_data.timestamp = 1234 }, +] [[simulated_sps.sidecar.components]] id = "dev-1" @@ -27,6 +30,9 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor 2" capabilities = 0x2 presence = "Failed" +sensors = [ + { name = "South", kind = "Temperature", last_error.value = "DeviceError", last_error.timestamp = 1234 }, +] [[simulated_sps.sidecar]] multicast_addr = "::1" @@ -56,6 +62,82 @@ device = "fake-tmp-sensor" description = "FAKE temperature sensor" capabilities = 0x2 presence = "Failed" +sensors = [ + { name = "Southwest", kind = "Temperature", last_error.value = "DeviceError", last_error.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-1" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "South", kind = "Temperature", last_data.value = 42.5625, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-2" +device = "tmp117" +description = "FAKE Southeast temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Temperature", last_data.value = 41.570313, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-6" +device = "at24csw080" +description = "FAKE U.2 Sharkfin A VPD" +capabilities = 0x0 +presence = "Present" + +[[simulated_sps.gimlet.components]] +id = "dev-7" +device = "max5970" +description = "FAKE U.2 Sharkfin A hot swap controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "V12_U2A_A0", kind = "Current", last_data.value = 0.45898438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", last_data.value = 0.024414063, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Voltage", last_data.value = 12.03125, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", last_data.value = 3.328125, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-8" +device = "nvme_bmc" +description = "FAKE U.2 A NVMe Basic Management Command" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "U2_N0", kind = "Temperature", last_data.value = 56.0, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-39" +device = "tmp451" +description = "FAKE T6 temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "t6", kind = "Temperature", last_data.value = 70.625, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-53" +device = "max31790" +description = "FAKE Fan controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Speed", last_data.value = 2607.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", last_data.value = 2476.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", last_data.value = 2553.0, last_data.timestamp = 1234 }, + { name = "North", kind = "Speed", last_data.value = 2265.0, 
last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", last_data.value = 2649.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", last_data.value = 2275.0, last_data.timestamp = 1234 }, +] + [[simulated_sps.gimlet]] multicast_addr = "::1" @@ -72,6 +154,90 @@ capabilities = 0 presence = "Present" serial_console = "[::1]:0" + +[[simulated_sps.gimlet.components]] +id = "dev-0" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southwest", kind = "Temperature", last_data.value = 41.3629, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-1" +device = "tmp117" +description = "FAKE temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "South", kind = "Temperature", last_data.value = 42.5625, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-2" +device = "tmp117" +description = "FAKE Southeast temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Temperature", last_data.value = 41.570313, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-6" +device = "at24csw080" +description = "FAKE U.2 Sharkfin A VPD" +capabilities = 0x0 +presence = "Present" + +[[simulated_sps.gimlet.components]] +id = "dev-7" +device = "max5970" +description = "FAKE U.2 Sharkfin A hot swap controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "V12_U2A_A0", kind = "Current", last_data.value = 0.41893438, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Current", last_data.value = 0.025614603, last_data.timestamp = 1234 }, + { name = "V12_U2A_A0", kind = "Voltage", last_data.value = 12.02914, last_data.timestamp = 1234 }, + { name = "V3P3_U2A_A0", kind = "Voltage", last_data.value = 3.2618, last_data.timestamp = 1234 }, +] + +[[simulated_sps.gimlet.components]] +id = "dev-8" +device = "nvme_bmc" +description = "FAKE U.2 A NVMe Basic Management Command" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "U2_N0", kind = "Temperature", last_data.value = 56.0, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-39" +device = "tmp451" +description = "FAKE T6 temperature sensor" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "t6", kind = "Temperature", last_data.value = 70.625, last_data.timestamp = 1234 }, +] +[[simulated_sps.gimlet.components]] +id = "dev-53" +device = "max31790" +description = "FAKE Fan controller" +capabilities = 0x2 +presence = "Present" +sensors = [ + { name = "Southeast", kind = "Speed", last_data.value = 2510.0, last_data.timestamp = 1234 }, + { name = "Northeast", kind = "Speed", last_data.value = 2390.0, last_data.timestamp = 1234 }, + { name = "South", kind = "Speed", last_data.value = 2467.0, last_data.timestamp = 1234 }, + { name = "North", kind = "Speed", last_data.value = 2195.0, last_data.timestamp = 1234 }, + { name = "Southwest", kind = "Speed", last_data.value = 2680.0, last_data.timestamp = 1234 }, + { name = "Northwest", kind = "Speed", last_data.value = 2212.0, last_data.timestamp = 1234 }, +] + + # # NOTE: for the test suite, the [log] section is ignored; sp-sim logs are rolled # into the gateway logfile. 
diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index 46bc55805a..056bb451f7 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -8,6 +8,7 @@ use camino::Utf8Path; use dropshot::test_util::ClientTestContext; use dropshot::test_util::LogContext; use gateway_messages::SpPort; +pub use omicron_gateway::metrics::MetricsConfig; use omicron_gateway::MgsArguments; use omicron_gateway::SpType; use omicron_gateway::SwitchPortConfig; @@ -33,6 +34,7 @@ pub struct GatewayTestContext { pub server: omicron_gateway::Server, pub simrack: SimRack, pub logctx: LogContext, + pub gateway_id: Uuid, } impl GatewayTestContext { @@ -48,13 +50,18 @@ pub fn load_test_config() -> (omicron_gateway::Config, sp_sim::Config) { let manifest_dir = Utf8Path::new(env!("CARGO_MANIFEST_DIR")); let server_config_file_path = manifest_dir.join("configs/config.test.toml"); let server_config = - omicron_gateway::Config::from_file(&server_config_file_path) - .expect("failed to load config.test.toml"); + match omicron_gateway::Config::from_file(&server_config_file_path) { + Ok(config) => config, + Err(e) => panic!("failed to load MGS config: {e}"), + }; let sp_sim_config_file_path = manifest_dir.join("configs/sp_sim_config.test.toml"); - let sp_sim_config = sp_sim::Config::from_file(&sp_sim_config_file_path) - .expect("failed to load sp_sim_config.test.toml"); + let sp_sim_config = + match sp_sim::Config::from_file(&sp_sim_config_file_path) { + Ok(config) => config, + Err(e) => panic!("failed to load SP simulator config: {e}"), + }; (server_config, sp_sim_config) } @@ -143,8 +150,8 @@ pub async fn test_setup_with_config( // Start gateway server let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap()); - - let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id }; + let gateway_id = Uuid::new_v4(); + let args = MgsArguments { id: gateway_id, addresses, rack_id }; let server = omicron_gateway::Server::start( server_config.clone(), args, @@ -206,5 +213,5 @@ pub async fn test_setup_with_config( log.new(o!("component" => "client test context")), ); - GatewayTestContext { client, server, simrack, logctx } + GatewayTestContext { client, server, simrack, logctx, gateway_id } } diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index 3cfd1d447b..bdf4a911af 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -11,6 +11,7 @@ workspace = true anyhow.workspace = true base64.workspace = true camino.workspace = true +chrono.workspace = true clap.workspace = true dropshot.workspace = true futures.workspace = true @@ -39,6 +40,9 @@ tokio-tungstenite.workspace = true toml.workspace = true uuid.workspace = true omicron-workspace-hack.workspace = true +oximeter.workspace = true +oximeter-producer.workspace = true +oximeter-instruments = { workspace = true, features = ["http-instruments"] } [dev-dependencies] expectorate.workspace = true diff --git a/gateway/examples/config.toml b/gateway/examples/config.toml index d29d9508b9..a76edcd7b5 100644 --- a/gateway/examples/config.toml +++ b/gateway/examples/config.toml @@ -71,6 +71,15 @@ addr = "[::1]:33320" ignition-target = 3 location = { switch0 = ["sled", 1], switch1 = ["sled", 1] } +# +# Configuration for SP sensor metrics polling +# +[metrics] +# Allow the Oximeter metrics endpoint to bind on the loopback IP. This is +# useful in local testing and development, when the gateway service is not +# given a "real" underlay network IP. 
+dev_bind_loopback = true + [log] # Show log messages of this level and more severe level = "debug" diff --git a/gateway/src/config.rs b/gateway/src/config.rs index afdb046881..edf895ef59 100644 --- a/gateway/src/config.rs +++ b/gateway/src/config.rs @@ -6,6 +6,7 @@ //! configuration use crate::management_switch::SwitchConfig; +use crate::metrics::MetricsConfig; use camino::Utf8Path; use camino::Utf8PathBuf; use dropshot::ConfigLogging; @@ -25,6 +26,8 @@ pub struct Config { pub switch: SwitchConfig, /// Server-wide logging configuration. pub log: ConfigLogging, + /// Configuration for SP sensor metrics. + pub metrics: Option, } impl Config { @@ -47,13 +50,13 @@ pub struct PartialDropshotConfig { #[derive(Debug, Error, SlogInlineError)] pub enum LoadError { - #[error("error reading \"{path}\"")] + #[error("error reading \"{path}\": {err}")] Io { path: Utf8PathBuf, #[source] err: std::io::Error, }, - #[error("error parsing \"{path}\"")] + #[error("error parsing \"{path}\": {err}")] Parse { path: Utf8PathBuf, #[source] diff --git a/gateway/src/context.rs b/gateway/src/context.rs index 939bb9b6b9..15592145cf 100644 --- a/gateway/src/context.rs +++ b/gateway/src/context.rs @@ -16,11 +16,13 @@ pub struct ServerContext { pub mgmt_switch: ManagementSwitch, pub host_phase2_provider: Arc, pub rack_id: OnceLock, + pub latencies: oximeter_instruments::http::LatencyTracker, pub log: Logger, } impl ServerContext { pub async fn new( + id: Uuid, host_phase2_provider: Arc, switch_config: SwitchConfig, rack_id_config: Option, @@ -37,7 +39,21 @@ impl ServerContext { OnceLock::new() }; + const START_LATENCY_DECADE: i16 = -6; + const END_LATENCY_DECADE: i16 = 3; + let latencies = + oximeter_instruments::http::LatencyTracker::with_latency_decades( + oximeter_instruments::http::HttpService { + name: "management-gateway-service".into(), + id, + }, + START_LATENCY_DECADE, + END_LATENCY_DECADE, + ) + .expect("start and end decades are hardcoded and should be valid"); + Ok(Arc::new(ServerContext { + latencies, mgmt_switch, host_phase2_provider, rack_id, diff --git a/gateway/src/error.rs b/gateway/src/error.rs index 5933daa340..ee148e0c98 100644 --- a/gateway/src/error.rs +++ b/gateway/src/error.rs @@ -26,12 +26,8 @@ pub enum StartupError { #[derive(Debug, Error, SlogInlineError)] pub enum SpCommsError { - #[error("discovery process not yet complete")] - DiscoveryNotYetComplete, - #[error("location discovery failed: {reason}")] - DiscoveryFailed { reason: String }, - #[error("nonexistent SP {0:?}")] - SpDoesNotExist(SpIdentifier), + #[error(transparent)] + Discovery(#[from] SpLookupError), #[error("unknown socket address for SP {0:?}")] SpAddressUnknown(SpIdentifier), #[error( @@ -52,13 +48,22 @@ pub enum SpCommsError { }, } +/// Errors returned by attempts to look up a SP in the management switch's +/// discovery map. 
+#[derive(Debug, Error, SlogInlineError)] +pub enum SpLookupError { + #[error("discovery process not yet complete")] + DiscoveryNotYetComplete, + #[error("location discovery failed: {reason}")] + DiscoveryFailed { reason: String }, + #[error("nonexistent SP {0:?}")] + SpDoesNotExist(SpIdentifier), +} + impl From for HttpError { fn from(error: SpCommsError) -> Self { match error { - SpCommsError::SpDoesNotExist(_) => HttpError::for_bad_request( - Some("InvalidSp".to_string()), - InlineErrorChain::new(&error).to_string(), - ), + SpCommsError::Discovery(err) => HttpError::from(err), SpCommsError::SpCommunicationFailed { err: CommunicationError::SpError( @@ -124,21 +129,11 @@ impl From for HttpError { "UpdateInProgress", InlineErrorChain::new(&error).to_string(), ), - SpCommsError::DiscoveryNotYetComplete => http_err_with_message( - http::StatusCode::SERVICE_UNAVAILABLE, - "DiscoveryNotYetComplete", - InlineErrorChain::new(&error).to_string(), - ), SpCommsError::SpAddressUnknown(_) => http_err_with_message( http::StatusCode::SERVICE_UNAVAILABLE, "SpAddressUnknown", InlineErrorChain::new(&error).to_string(), ), - SpCommsError::DiscoveryFailed { .. } => http_err_with_message( - http::StatusCode::SERVICE_UNAVAILABLE, - "DiscoveryFailed ", - InlineErrorChain::new(&error).to_string(), - ), SpCommsError::Timeout { .. } => http_err_with_message( http::StatusCode::SERVICE_UNAVAILABLE, "Timeout ", @@ -160,6 +155,27 @@ impl From for HttpError { } } +impl From for HttpError { + fn from(error: SpLookupError) -> Self { + match error { + SpLookupError::SpDoesNotExist(_) => HttpError::for_bad_request( + Some("InvalidSp".to_string()), + InlineErrorChain::new(&error).to_string(), + ), + SpLookupError::DiscoveryNotYetComplete => http_err_with_message( + http::StatusCode::SERVICE_UNAVAILABLE, + "DiscoveryNotYetComplete", + InlineErrorChain::new(&error).to_string(), + ), + SpLookupError::DiscoveryFailed { .. } => http_err_with_message( + http::StatusCode::SERVICE_UNAVAILABLE, + "DiscoveryFailed ", + InlineErrorChain::new(&error).to_string(), + ), + } + } +} + // Helper function to return an `HttpError` with the same internal and external // message. 
MGS is an "internal" service - even when we return a 500-level // status code, we want to give our caller some information about what is going diff --git a/gateway/src/http_entrypoints.rs b/gateway/src/http_entrypoints.rs index 332f50ed8a..c10e71ad61 100644 --- a/gateway/src/http_entrypoints.rs +++ b/gateway/src/http_entrypoints.rs @@ -81,18 +81,22 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let state = sp.state().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let state = sp.state().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + let rot_state = sp + .rot_state(gateway_messages::RotBootInfo::HIGHEST_KNOWN_VERSION) + .await; - let rot_state = sp - .rot_state(gateway_messages::RotBootInfo::HIGHEST_KNOWN_VERSION) - .await; + let final_state = sp_state_from_comms(state, rot_state); - let final_state = sp_state_from_comms(state, rot_state); - Ok(HttpResponseOk(final_state)) + Ok(HttpResponseOk(final_state)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_startup_options_get( @@ -100,15 +104,18 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); - let mgmt_switch = &apictx.mgmt_switch; - let sp_id = path.into_inner().sp.into(); - let sp = mgmt_switch.sp(sp_id)?; + let handler = async { + let mgmt_switch = &apictx.mgmt_switch; + let sp_id = path.into_inner().sp.into(); + let sp = mgmt_switch.sp(sp_id)?; - let options = sp.get_startup_options().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let options = sp.get_startup_options().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(options.into())) + Ok(HttpResponseOk(options.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_startup_options_set( @@ -119,13 +126,16 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let mgmt_switch = &apictx.mgmt_switch; let sp_id = path.into_inner().sp.into(); - let sp = mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = mgmt_switch.sp(sp_id)?; - sp.set_startup_options(body.into_inner().into()).await.map_err( - |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, - )?; + sp.set_startup_options(body.into_inner().into()).await.map_err( + |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, + )?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_sensor_read_value( @@ -135,12 +145,17 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpSensorId { sp, sensor_id } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let value = sp.read_sensor_value(sensor_id).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let value = + sp.read_sensor_value(sensor_id).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk(value.into())) + }; - Ok(HttpResponseOk(value.into())) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn 
sp_component_list( @@ -149,12 +164,15 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let inventory = sp.inventory().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let inventory = sp.inventory().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(sp_component_list_from_comms(inventory))) + Ok(HttpResponseOk(sp_component_list_from_comms(inventory))) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_get( @@ -164,16 +182,21 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - - let details = sp.component_details(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + let details = + sp.component_details(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseOk( + details.entries.into_iter().map(Into::into).collect(), + )) + }; - Ok(HttpResponseOk( - details.entries.into_iter().map(Into::into).collect(), - )) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } // Implementation notes: @@ -198,66 +221,79 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let ComponentCabooseSlot { firmware_slot } = query_params.into_inner(); - let component = component_from_str(&component)?; - let from_utf8 = |key: &[u8], bytes| { - // This helper closure is only called with the ascii-printable [u8; 4] - // key constants we define above, so we can unwrap this conversion. - let key = str::from_utf8(key).unwrap(); - String::from_utf8(bytes).map_err(|_| { - http_err_with_message( - http::StatusCode::SERVICE_UNAVAILABLE, - "InvalidCaboose", - format!("non-utf8 data returned for caboose key {key}"), + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let ComponentCabooseSlot { firmware_slot } = + query_params.into_inner(); + let component = component_from_str(&component)?; + + let from_utf8 = |key: &[u8], bytes| { + // This helper closure is only called with the ascii-printable [u8; 4] + // key constants we define above, so we can unwrap this conversion. 
+ let key = str::from_utf8(key).unwrap(); + String::from_utf8(bytes).map_err(|_| { + http_err_with_message( + http::StatusCode::SERVICE_UNAVAILABLE, + "InvalidCaboose", + format!("non-utf8 data returned for caboose key {key}"), + ) + }) + }; + + let git_commit = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_GIT_COMMIT, ) - }) - }; + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let board = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_BOARD, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let name = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_NAME, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + let version = + sp.read_component_caboose( + component, + firmware_slot, + CABOOSE_KEY_VERSION, + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let git_commit = - sp.read_component_caboose( - component, - firmware_slot, - CABOOSE_KEY_GIT_COMMIT, - ) - .await - .map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - let board = sp - .read_component_caboose(component, firmware_slot, CABOOSE_KEY_BOARD) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - let name = sp - .read_component_caboose(component, firmware_slot, CABOOSE_KEY_NAME) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - let version = - sp.read_component_caboose( - component, - firmware_slot, - CABOOSE_KEY_VERSION, - ) - .await - .map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; + let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; + let name = from_utf8(&CABOOSE_KEY_NAME, name)?; + let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; - let git_commit = from_utf8(&CABOOSE_KEY_GIT_COMMIT, git_commit)?; - let board = from_utf8(&CABOOSE_KEY_BOARD, board)?; - let name = from_utf8(&CABOOSE_KEY_NAME, name)?; - let version = from_utf8(&CABOOSE_KEY_VERSION, version)?; + let caboose = + SpComponentCaboose { git_commit, board, name, version }; - let caboose = SpComponentCaboose { git_commit, board, name, version }; + Ok(HttpResponseOk(caboose)) + }; - Ok(HttpResponseOk(caboose)) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_clear_status( @@ -267,14 +303,18 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - sp.component_clear_status(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + sp.component_clear_status(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; - Ok(HttpResponseUpdatedNoContent {}) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_active_slot_get( @@ -284,15 +324,18 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp 
= apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let slot = - sp.component_active_slot(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let slot = + sp.component_active_slot(component).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(SpComponentFirmwareSlot { slot })) + Ok(HttpResponseOk(SpComponentFirmwareSlot { slot })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_active_slot_set( @@ -304,16 +347,22 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - let slot = body.into_inner().slot; - let persist = query_params.into_inner().persist; - - sp.set_component_active_slot(component, slot, persist).await.map_err( - |err| SpCommsError::SpCommunicationFailed { sp: sp_id, err }, - )?; - - Ok(HttpResponseUpdatedNoContent {}) + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + let slot = body.into_inner().slot; + let persist = query_params.into_inner().persist; + + sp.set_component_active_slot(component, slot, persist) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_serial_console_attach( @@ -321,6 +370,10 @@ impl GatewayApi for GatewayImpl { path: Path, websocket: WebsocketUpgrade, ) -> WebsocketEndpointResult { + // TODO(eliza): I'm not sure whether there's a way to make + // `oximeter_instruments`'s HTTP latency tracker work with websockets + // requests? It would be nice to get the latency and any error returned + // prior to actually returning the websocket stream... let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); @@ -356,13 +409,15 @@ impl GatewayApi for GatewayImpl { // we don't use it at all to detach. 
let PathSpComponent { sp, component: _ } = path.into_inner(); let sp_id = sp.into(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + sp.serial_console_detach().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - sp.serial_console_detach().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_reset( @@ -372,20 +427,23 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - - sp.reset_component_prepare(component) - // We always want to run with the watchdog when resetting as - // disabling the watchdog should be considered a debug only feature - .and_then(|()| sp.reset_component_trigger(component, false)) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - - Ok(HttpResponseUpdatedNoContent {}) + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + + sp.reset_component_prepare(component) + // We always want to run with the watchdog when resetting as + // disabling the watchdog should be considered a debug only feature + .and_then(|()| sp.reset_component_trigger(component, false)) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update( @@ -398,19 +456,22 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; - let ComponentUpdateIdSlot { id, firmware_slot } = - query_params.into_inner(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; + let ComponentUpdateIdSlot { id, firmware_slot } = + query_params.into_inner(); - // TODO-performance: this makes a full copy of the uploaded data - let image = body.as_bytes().to_vec(); + // TODO-performance: this makes a full copy of the uploaded data + let image = body.as_bytes().to_vec(); - sp.start_update(component, id, firmware_slot, image) - .await - .map_err(|err| SpCommsError::UpdateFailed { sp: sp_id, err })?; + sp.start_update(component, id, firmware_slot, image) + .await + .map_err(|err| SpCommsError::UpdateFailed { sp: sp_id, err })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update_status( @@ -421,14 +482,17 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let status = sp.update_status(component).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let status = sp.update_status(component).await.map_err(|err| { + 
SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(status.into())) + Ok(HttpResponseOk(status.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_component_update_abort( @@ -440,15 +504,18 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let component = component_from_str(&component)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let component = component_from_str(&component)?; - let UpdateAbortBody { id } = body.into_inner(); - sp.update_abort(component, id).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let UpdateAbortBody { id } = body.into_inner(); + sp.update_abort(component, id).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_cmpa_get( @@ -459,24 +526,26 @@ impl GatewayApi for GatewayImpl { let PathSpComponent { sp, component } = path.into_inner(); let sp_id = sp.into(); + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the RoT has a CFPA".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let data = sp.read_rot_cmpa().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the RoT has a CFPA".into(), - )); - } - - let sp = apictx.mgmt_switch.sp(sp_id)?; - let data = sp.read_rot_cmpa().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; - - let base64_data = - base64::engine::general_purpose::STANDARD.encode(data); + let base64_data = + base64::engine::general_purpose::STANDARD.encode(data); - Ok(HttpResponseOk(RotCmpa { base64_data })) + Ok(HttpResponseOk(RotCmpa { base64_data })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_cfpa_get( @@ -490,29 +559,32 @@ impl GatewayApi for GatewayImpl { let GetCfpaParams { slot } = params.into_inner(); let sp_id = sp.into(); - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "Only the RoT has a CFPA".into(), - )); - } + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? 
!= SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "Only the RoT has a CFPA".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let data = match slot { + RotCfpaSlot::Active => sp.read_rot_active_cfpa().await, + RotCfpaSlot::Inactive => sp.read_rot_inactive_cfpa().await, + RotCfpaSlot::Scratch => sp.read_rot_scratch_cfpa().await, + } + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - let data = match slot { - RotCfpaSlot::Active => sp.read_rot_active_cfpa().await, - RotCfpaSlot::Inactive => sp.read_rot_inactive_cfpa().await, - RotCfpaSlot::Scratch => sp.read_rot_scratch_cfpa().await, - } - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + let base64_data = + base64::engine::general_purpose::STANDARD.encode(data); - let base64_data = - base64::engine::general_purpose::STANDARD.encode(data); + Ok(HttpResponseOk(RotCfpa { base64_data, slot })) + }; - Ok(HttpResponseOk(RotCfpa { base64_data, slot })) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_rot_boot_info( @@ -526,20 +598,24 @@ impl GatewayApi for GatewayImpl { let GetRotBootInfoParams { version } = params.into_inner(); let sp_id = sp.into(); - // Ensure the caller knows they're asking for the RoT - if component_from_str(&component)? != SpComponent::ROT { - return Err(HttpError::for_bad_request( - Some("RequestUnsupportedForComponent".to_string()), - "rot_boot_info only makes sent for a RoT".into(), - )); - } + let handler = async { + // Ensure the caller knows they're asking for the RoT + if component_from_str(&component)? != SpComponent::ROT { + return Err(HttpError::for_bad_request( + Some("RequestUnsupportedForComponent".to_string()), + "rot_boot_info only makes sent for a RoT".into(), + )); + } + + let sp = apictx.mgmt_switch.sp(sp_id)?; + let state = sp.rot_state(version).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - let sp = apictx.mgmt_switch.sp(sp_id)?; - let state = sp.rot_state(version).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + Ok(HttpResponseOk(state.into())) + }; - Ok(HttpResponseOk(state.into())) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_list( @@ -547,17 +623,19 @@ impl GatewayApi for GatewayImpl { ) -> Result>, HttpError> { let apictx = rqctx.context(); let mgmt_switch = &apictx.mgmt_switch; - - let out = mgmt_switch - .bulk_ignition_state() - .await? - .map(|(id, state)| SpIgnitionInfo { - id: id.into(), - details: state.into(), - }) - .collect(); - - Ok(HttpResponseOk(out)) + let handler = async { + let out = mgmt_switch + .bulk_ignition_state() + .await? 
+ .map(|(id, state)| SpIgnitionInfo { + id: id.into(), + details: state.into(), + }) + .collect(); + + Ok(HttpResponseOk(out)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_get( @@ -568,19 +646,23 @@ impl GatewayApi for GatewayImpl { let mgmt_switch = &apictx.mgmt_switch; let sp_id = path.into_inner().sp.into(); - let ignition_target = mgmt_switch.ignition_target(sp_id)?; - - let state = mgmt_switch - .ignition_controller() - .ignition_state(ignition_target) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; - - let info = SpIgnitionInfo { id: sp_id.into(), details: state.into() }; - Ok(HttpResponseOk(info)) + let handler = async { + let ignition_target = mgmt_switch.ignition_target(sp_id)?; + + let state = mgmt_switch + .ignition_controller() + .ignition_state(ignition_target) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + let info = + SpIgnitionInfo { id: sp_id.into(), details: state.into() }; + Ok(HttpResponseOk(info)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn ignition_command( @@ -591,18 +673,22 @@ impl GatewayApi for GatewayImpl { let mgmt_switch = &apictx.mgmt_switch; let PathSpIgnitionCommand { sp, command } = path.into_inner(); let sp_id = sp.into(); - let ignition_target = mgmt_switch.ignition_target(sp_id)?; - mgmt_switch - .ignition_controller() - .ignition_command(ignition_target, command.into()) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + let handler = async { + let ignition_target = mgmt_switch.ignition_target(sp_id)?; - Ok(HttpResponseUpdatedNoContent {}) + mgmt_switch + .ignition_controller() + .ignition_command(ignition_target, command.into()) + .await + .map_err(|err| SpCommsError::SpCommunicationFailed { + sp: sp_id, + err, + })?; + + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_power_state_get( @@ -611,13 +697,16 @@ impl GatewayApi for GatewayImpl { ) -> Result, HttpError> { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let power_state = sp.power_state().await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + let power_state = sp.power_state().await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseOk(power_state.into())) + Ok(HttpResponseOk(power_state.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_power_state_set( @@ -627,14 +716,17 @@ impl GatewayApi for GatewayImpl { ) -> Result { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; - let power_state = body.into_inner(); + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; + let power_state = body.into_inner(); - sp.set_power_state(power_state.into()).await.map_err(|err| { - SpCommsError::SpCommunicationFailed { sp: sp_id, err } - })?; + sp.set_power_state(power_state.into()).await.map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_installinator_image_id_set( @@ -646,21 +738,23 @@ impl 
GatewayApi for GatewayImpl { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - let image_id = ipcc::InstallinatorImageId::from(body.into_inner()); + let image_id = ipcc::InstallinatorImageId::from(body.into_inner()); - sp.set_ipcc_key_lookup_value( - Key::InstallinatorImageId as u8, - image_id.serialize(), - ) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + sp.set_ipcc_key_lookup_value( + Key::InstallinatorImageId as u8, + image_id.serialize(), + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_installinator_image_id_delete( @@ -671,20 +765,22 @@ impl GatewayApi for GatewayImpl { let apictx = rqctx.context(); let sp_id = path.into_inner().sp.into(); - let sp = apictx.mgmt_switch.sp(sp_id)?; + let handler = async { + let sp = apictx.mgmt_switch.sp(sp_id)?; - // We clear the image ID by setting it to a 0-length vec. - sp.set_ipcc_key_lookup_value( - Key::InstallinatorImageId as u8, - Vec::new(), - ) - .await - .map_err(|err| SpCommsError::SpCommunicationFailed { - sp: sp_id, - err, - })?; + // We clear the image ID by setting it to a 0-length vec. + sp.set_ipcc_key_lookup_value( + Key::InstallinatorImageId as u8, + Vec::new(), + ) + .await + .map_err(|err| { + SpCommsError::SpCommunicationFailed { sp: sp_id, err } + })?; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_host_phase2_progress_get( @@ -692,37 +788,41 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result, HttpError> { let apictx = rqctx.context(); - let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; - - let Some(progress) = sp.most_recent_host_phase2_request().await else { - return Ok(HttpResponseOk(HostPhase2Progress::None)); - }; - - // Our `host_phase2_provider` is using an in-memory cache, so the only way - // we can fail to get the total size is if we no longer have the image that - // this SP most recently requested. We'll treat that as "no progress - // information", since it almost certainly means our progress info on this - // SP is very stale. - let Ok(total_size) = - apictx.host_phase2_provider.total_size(progress.hash).await - else { - return Ok(HttpResponseOk(HostPhase2Progress::None)); - }; - - let image_id = HostPhase2RecoveryImageId { - sha256_hash: ArtifactHash(progress.hash), + let handler = async { + let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; + + let Some(progress) = sp.most_recent_host_phase2_request().await + else { + return Ok(HttpResponseOk(HostPhase2Progress::None)); + }; + + // Our `host_phase2_provider` is using an in-memory cache, so the only way + // we can fail to get the total size is if we no longer have the image that + // this SP most recently requested. We'll treat that as "no progress + // information", since it almost certainly means our progress info on this + // SP is very stale. 
+ let Ok(total_size) = + apictx.host_phase2_provider.total_size(progress.hash).await + else { + return Ok(HttpResponseOk(HostPhase2Progress::None)); + }; + + let image_id = HostPhase2RecoveryImageId { + sha256_hash: ArtifactHash(progress.hash), + }; + + // `progress` tells us the offset the SP requested and the amount of data we + // sent starting at that offset; report the end of that chunk to our caller. + let offset = progress.offset.saturating_add(progress.data_sent); + + Ok(HttpResponseOk(HostPhase2Progress::Available { + image_id, + offset, + total_size, + age: progress.received.elapsed(), + })) }; - - // `progress` tells us the offset the SP requested and the amount of data we - // sent starting at that offset; report the end of that chunk to our caller. - let offset = progress.offset.saturating_add(progress.data_sent); - - Ok(HttpResponseOk(HostPhase2Progress::Available { - image_id, - offset, - total_size, - age: progress.received.elapsed(), - })) + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_host_phase2_progress_delete( @@ -730,11 +830,14 @@ impl GatewayApi for GatewayImpl { path: Path, ) -> Result { let apictx = rqctx.context(); - let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; + let handler = async { + let sp = apictx.mgmt_switch.sp(path.into_inner().sp.into())?; - sp.clear_most_recent_host_phase2_request().await; + sp.clear_most_recent_host_phase2_request().await; - Ok(HttpResponseUpdatedNoContent {}) + Ok(HttpResponseUpdatedNoContent {}) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn recovery_host_phase2_upload( @@ -742,44 +845,55 @@ impl GatewayApi for GatewayImpl { body: UntypedBody, ) -> Result, HttpError> { let apictx = rqctx.context(); - - // TODO: this makes a full copy of the host image, potentially unnecessarily - // if it's malformed. - let image = body.as_bytes().to_vec(); - - let sha256_hash = - apictx.host_phase2_provider.insert(image).await.map_err(|err| { - // Any cache-insertion failure indicates a malformed image; map them - // to bad requests. - HttpError::for_bad_request( - Some("BadHostPhase2Image".to_string()), - err.to_string(), - ) - })?; - let sha256_hash = ArtifactHash(sha256_hash); - - Ok(HttpResponseOk(HostPhase2RecoveryImageId { sha256_hash })) + let handler = async { + // TODO: this makes a full copy of the host image, potentially unnecessarily + // if it's malformed. + let image = body.as_bytes().to_vec(); + + let sha256_hash = + apictx.host_phase2_provider.insert(image).await.map_err( + |err| { + // Any cache-insertion failure indicates a malformed image; map them + // to bad requests. 
+ HttpError::for_bad_request( + Some("BadHostPhase2Image".to_string()), + err.to_string(), + ) + }, + )?; + let sha256_hash = ArtifactHash(sha256_hash); + + Ok(HttpResponseOk(HostPhase2RecoveryImageId { sha256_hash })) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_local_switch_id( rqctx: RequestContext, ) -> Result, HttpError> { let apictx = rqctx.context(); + let handler = async { + let id = apictx.mgmt_switch.local_switch()?; - let id = apictx.mgmt_switch.local_switch()?; - - Ok(HttpResponseOk(id.into())) + Ok(HttpResponseOk(id.into())) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } async fn sp_all_ids( rqctx: RequestContext, ) -> Result>, HttpError> { let apictx = rqctx.context(); - - let all_ids = - apictx.mgmt_switch.all_sps()?.map(|(id, _)| id.into()).collect(); - - Ok(HttpResponseOk(all_ids)) + let handler = async { + let all_ids = apictx + .mgmt_switch + .all_sps()? + .map(|(id, _)| id.into()) + .collect(); + + Ok(HttpResponseOk(all_ids)) + }; + apictx.latencies.instrument_dropshot_handler(&rqctx, handler).await } } diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs index e1eed05334..e07df0cfb9 100644 --- a/gateway/src/lib.rs +++ b/gateway/src/lib.rs @@ -6,6 +6,7 @@ mod config; mod context; mod error; mod management_switch; +pub mod metrics; mod serial_console; pub mod http_entrypoints; // TODO pub only for testing - is this right? @@ -62,6 +63,8 @@ pub struct Server { /// `http_servers` all_servers_shutdown: FuturesUnordered, request_body_max_bytes: usize, + /// handle to the SP sensor metrics subsystem + metrics: metrics::Metrics, log: Logger, } @@ -140,6 +143,7 @@ impl Server { config.host_phase2_recovery_image_cache_max_images, )); let apictx = ServerContext::new( + args.id, host_phase2_provider, config.switch, args.rack_id, @@ -151,6 +155,9 @@ impl Server { let mut http_servers = HashMap::with_capacity(args.addresses.len()); let all_servers_shutdown = FuturesUnordered::new(); + let metrics = + metrics::Metrics::new(&log, &args, config.metrics, apictx.clone()); + for addr in args.addresses { start_dropshot_server( &apictx, @@ -167,6 +174,7 @@ impl Server { http_servers, all_servers_shutdown, request_body_max_bytes: config.dropshot.request_body_max_bytes, + metrics, log, }) } @@ -275,12 +283,14 @@ impl Server { server.close().await?; } + self.metrics.update_server_addrs(addresses).await; + Ok(()) } /// The rack_id will be set on a refresh of the SMF property when the sled /// agent starts. - pub fn set_rack_id(&self, rack_id: Option) { + pub fn set_rack_id(&mut self, rack_id: Option) { if let Some(rack_id) = rack_id { let val = self.apictx.rack_id.get_or_init(|| rack_id); if *val != rack_id { @@ -291,20 +301,12 @@ impl Server { "ignored_new_rack_id" => %rack_id); } else { info!(self.apictx.log, "Set rack_id"; "rack_id" => %rack_id); + self.metrics.set_rack_id(rack_id); } } else { warn!(self.apictx.log, "SMF refresh called without a rack id"); } } - - // TODO does MGS register itself with oximeter? - // Register the Nexus server as a metric producer with `oximeter. - // pub async fn register_as_producer(&self) { - // self.apictx - // .nexus - // .register_as_producer(self.http_server_internal.local_addr()) - // .await; - // } } /// Start an instance of the [Server]. 
@@ -327,6 +329,5 @@ pub async fn start_server( debug!(log, "registered DTrace probes"); } let server = Server::start(config, args, log).await?; - // server.register_as_producer().await; Ok(server) } diff --git a/gateway/src/management_switch.rs b/gateway/src/management_switch.rs index a93c44d62c..23dfbe01a8 100644 --- a/gateway/src/management_switch.rs +++ b/gateway/src/management_switch.rs @@ -20,6 +20,7 @@ pub use self::location_map::SwitchPortConfig; pub use self::location_map::SwitchPortDescription; use self::location_map::ValidatedLocationConfig; use crate::error::SpCommsError; +use crate::error::SpLookupError; use crate::error::StartupError; use gateway_messages::IgnitionState; use gateway_sp_comms::default_discovery_addr; @@ -316,18 +317,18 @@ impl ManagementSwitch { self.location_map.get().is_some() } - fn location_map(&self) -> Result<&LocationMap, SpCommsError> { + fn location_map(&self) -> Result<&LocationMap, SpLookupError> { let discovery_result = self .location_map .get() - .ok_or(SpCommsError::DiscoveryNotYetComplete)?; + .ok_or(SpLookupError::DiscoveryNotYetComplete)?; discovery_result .as_ref() - .map_err(|s| SpCommsError::DiscoveryFailed { reason: s.clone() }) + .map_err(|s| SpLookupError::DiscoveryFailed { reason: s.clone() }) } /// Get the identifier of our local switch. - pub fn local_switch(&self) -> Result { + pub fn local_switch(&self) -> Result { let location_map = self.location_map()?; Ok(location_map.port_to_id(self.local_ignition_controller_port)) } @@ -347,11 +348,11 @@ impl ManagementSwitch { /// This method will fail if discovery is not yet complete (i.e., we don't /// know the logical identifiers of any SP yet!) or if `id` specifies an SP /// that doesn't exist in our discovered location map. - fn get_port(&self, id: SpIdentifier) -> Result { + fn get_port(&self, id: SpIdentifier) -> Result { let location_map = self.location_map()?; let port = location_map .id_to_port(id) - .ok_or(SpCommsError::SpDoesNotExist(id))?; + .ok_or(SpLookupError::SpDoesNotExist(id))?; Ok(port) } @@ -362,7 +363,7 @@ impl ManagementSwitch { /// This method will fail if discovery is not yet complete (i.e., we don't /// know the logical identifiers of any SP yet!) or if `id` specifies an SP /// that doesn't exist in our discovered location map. - pub fn sp(&self, id: SpIdentifier) -> Result<&SingleSp, SpCommsError> { + pub fn sp(&self, id: SpIdentifier) -> Result<&SingleSp, SpLookupError> { let port = self.get_port(id)?; Ok(self.port_to_sp(port)) } @@ -377,7 +378,7 @@ impl ManagementSwitch { pub fn ignition_target( &self, id: SpIdentifier, - ) -> Result { + ) -> Result { let port = self.get_port(id)?; Ok(self.port_to_ignition_target[port.0]) } @@ -389,7 +390,7 @@ impl ManagementSwitch { /// therefore can't map our switch ports to SP identities). pub(crate) fn all_sps( &self, - ) -> Result, SpCommsError> + ) -> Result, SpLookupError> { let location_map = self.location_map()?; Ok(location_map diff --git a/gateway/src/metrics.rs b/gateway/src/metrics.rs new file mode 100644 index 0000000000..7c133f5d97 --- /dev/null +++ b/gateway/src/metrics.rs @@ -0,0 +1,1169 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
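The http_entrypoints.rs changes above all follow one shape: the handler body is moved into an async block, and that block is handed to the per-request latency instrumentation before the result is returned, so early returns via `?` are still observed. The sketch below shows that shape in isolation; `instrument` and `sp_get_example` are hypothetical stand-ins (they are not the real `oximeter_instruments` latency-tracker API and do not appear in this patch).

use std::time::Instant;

// Hypothetical stand-in for the latency tracker used in this patch; it only
// demonstrates the pattern: await the handler, observe elapsed time and
// success/failure, then pass the result through unchanged.
async fn instrument<T, E, F>(op: &str, handler: F) -> Result<T, E>
where
    F: std::future::Future<Output = Result<T, E>>,
{
    let start = Instant::now();
    let result = handler.await;
    println!("{op}: {:?}, ok = {}", start.elapsed(), result.is_ok());
    result
}

// A handler written in the wrapped style: errors produced inside the async
// block (including ones raised with `?`) still flow through the instrumenter.
async fn sp_get_example(sp_slot: u32) -> Result<String, String> {
    let handler = async {
        if sp_slot > 31 {
            return Err(format!("nonexistent SP {sp_slot}"));
        }
        Ok(format!("state of SP {sp_slot}"))
    };
    instrument("sp_get", handler).await
}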
+use crate::error::CommunicationError; +use crate::management_switch::SpIdentifier; +use crate::management_switch::SpType; +use crate::MgsArguments; +use crate::ServerContext; +use anyhow::Context; +use gateway_messages::measurement::MeasurementError; +use gateway_messages::measurement::MeasurementKind; +use gateway_messages::ComponentDetails; +use gateway_messages::DeviceCapabilities; +use gateway_sp_comms::SingleSp; +use gateway_sp_comms::SpComponent; +use gateway_sp_comms::VersionedSpState; +use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; +use omicron_common::backoff; +use oximeter::types::Cumulative; +use oximeter::types::ProducerRegistry; +use oximeter::types::Sample; +use oximeter::MetricsError; +use std::borrow::Cow; +use std::collections::hash_map; +use std::collections::hash_map::HashMap; +use std::net::IpAddr; +use std::net::SocketAddr; +use std::net::SocketAddrV6; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::broadcast; +use tokio::sync::oneshot; +use tokio::sync::watch; +use tokio::task::JoinHandle; +use uuid::Uuid; + +oximeter::use_timeseries!("hardware-component.toml"); +use hardware_component as metric; + +/// Handle to the metrics tasks. +pub struct Metrics { + /// If the metrics subsystem is disabled, this is `None`. + inner: Option, +} + +struct Handles { + addrs_tx: watch::Sender>, + rack_id_tx: Option>, + server: JoinHandle>, +} + +/// Configuration for metrics. +/// +/// In order to reduce the risk of a bad config file taking down the whole +/// management network, we try to keep the metrics-specific portion of the +/// config file as minimal as possible. At present, it only includes development +/// configurations that shouldn't be present in production configs. +#[derive( + Clone, Debug, Default, PartialEq, Eq, serde::Deserialize, serde::Serialize, +)] +#[serde(deny_unknown_fields)] +pub struct MetricsConfig { + /// Completely disable the metrics subsystem. + /// + /// If `disabled = true`, sensor data metrics will not be collected, and the + /// metrics polling tasks will not be started. + #[serde(default)] + pub disabled: bool, + + /// Override the Nexus address used to register the SP metrics Oximeter + /// producer. This is intended for use in development and testing. + /// + /// If this argument is not present, Nexus is discovered through DNS. + #[serde(default)] + pub dev_nexus_address: Option, + + /// Allow the metrics producer endpoint to bind on loopback. + /// + /// This should be disabled in production, as Nexus will not be able to + /// reach the loopback interface, but is necessary for local development and + /// test purposes. + #[serde(default)] + pub dev_bind_loopback: bool, +} + +/// Polls sensor readings from an individual SP. +struct SpPoller { + spid: SpIdentifier, + known_state: Option, + components: HashMap, + log: slog::Logger, + rack_id: Uuid, + mgs_id: Uuid, + sample_tx: broadcast::Sender>, +} + +struct ComponentMetrics { + target: metric::HardwareComponent, + /// Counts of errors reported by sensors on this component. + sensor_errors: HashMap>, + /// Counts of errors that occurred whilst polling the SP for measurements + /// from this component. + poll_errors: HashMap<&'static str, Cumulative>, +} + +#[derive(Eq, PartialEq, Hash)] +struct SensorErrorKey { + name: Cow<'static, str>, + kind: &'static str, + error: &'static str, +} + +/// Manages a metrics server and stuff. 
+struct ServerManager { + log: slog::Logger, + addrs: watch::Receiver>, + registry: ProducerRegistry, +} + +#[derive(Debug)] +struct Producer { + /// Receiver for samples produced by SP pollers. + sample_rx: broadcast::Receiver>, + /// Logging context. + /// + /// We stick this on the producer because we would like to be able to log + /// when stale samples are dropped. + log: slog::Logger, +} + +/// The maximum Dropshot request size for the metrics server. +const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024; + +/// Poll interval for requesting sensor readings from SPs. +/// +/// Bryan wants to try polling at 1Hz, so let's do that for now. +const SP_POLL_INTERVAL: Duration = Duration::from_secs(1); + +///The interval at which we will ask Oximeter to collect our metric samples. +/// +/// Every ten seconds seems good. +const OXIMETER_COLLECTION_INTERVAL: Duration = Duration::from_secs(10); + +/// The expected number of SPs in a fully-loaded rack. +/// +/// N.B. that there *might* be more than this; we shouldn't ever panic or +/// otherwise misbehave if we see more than this number. This is just intended +/// for sizing buffers/map allocations and so forth; we can always realloc if we +/// see a bonus SP or two. That's why it's called "normal number of SPs" and not +/// "MAX_SPS" or similar. +/// +/// Additionally, note that we always determine the channel capacity based on +/// the assumption that *someday*, the rack might be fully loaded with compute +/// sleds, even if it isn't *right now*. A rack with 16 sleds could always grow +/// another 16 later! +const NORMAL_NUMBER_OF_SPS: usize = + 32 // 32 compute sleds + + 2 // two switches + + 2 // two power shelves, someday. + ; + +/// What size should we make the +const MAX_BUFFERED_SAMPLE_CHUNKS: usize = { + // Roughly how many times will we poll SPs for each metrics collection + // interval? + let polls_per_metrics_interval = { + let collection_interval_secs: usize = + OXIMETER_COLLECTION_INTERVAL.as_secs() as usize; + let poll_interval_secs: usize = SP_POLL_INTERVAL.as_secs() as usize; + + collection_interval_secs / poll_interval_secs + }; + + // How many sample collection intervals do we want to allow to elapse before + // we start putting stuff on the floor? + // + // Let's say 16. Chosen totally arbitrarily but seems reasonable-ish. + let sloppiness = 16; + let capacity = + NORMAL_NUMBER_OF_SPS * polls_per_metrics_interval * sloppiness; + // Finally, the buffer capacity will probably be allocated in a power of two + // anyway, so let's make sure our thing is a power of two so we don't waste + // the allocation we're gonna get anyway. + capacity.next_power_of_two() +}; + +impl Metrics { + pub fn new( + log: &slog::Logger, + args: &MgsArguments, + cfg: Option, + apictx: Arc, + ) -> Self { + let &MgsArguments { id, rack_id, ref addresses } = args; + + if cfg.as_ref().map(|c| c.disabled).unwrap_or(false) { + slog::warn!(&log, "metrics subsystem disabled by config"); + return Self { inner: None }; + } + + // Create a channel for the SP poller tasks to send samples to the + // Oximeter producer endpoint. + // + // A broadcast channel is used here, not because we are actually + // multi-consumer (`Producer::produce` is never called concurrently), + // but because the broadcast channel has properly ring-buffer-like + // behavior, where earlier messages are discarded, rather than exerting + // backpressure on senders (as Tokio's MPSC channel does). 
This + // is what we want, as we would prefer a full buffer to result in + // clobbering the oldest measurements, rather than leaving the newest + // ones on the floor. + let (sample_tx, sample_rx) = + broadcast::channel(MAX_BUFFERED_SAMPLE_CHUNKS); + + // Using a channel for this is, admittedly, a bit of an end-run around + // the `OnceLock` on the `ServerContext` that *also* stores the rack ID, + // but it has the nice benefit of allowing the `PollerManager` task to _await_ + // the rack ID being set...we might want to change other code to use a + // similar approach in the future. + let (rack_id_tx, rack_id_rx) = oneshot::channel(); + let rack_id_tx = if let Some(rack_id) = rack_id { + rack_id_tx.send(rack_id).expect( + "we just created the channel; it therefore will not be \ + closed", + ); + None + } else { + Some(rack_id_tx) + }; + + tokio::spawn(start_pollers( + log.new(slog::o!("component" => "sensor-poller")), + apictx.clone(), + rack_id_rx, + id, + sample_tx, + )); + + let (addrs_tx, addrs_rx) = + tokio::sync::watch::channel(addresses.clone()); + let server = { + let log = log.new(slog::o!("component" => "producer-server")); + let registry = ProducerRegistry::with_id(id); + // Register the producer for SP sensor metrics. + registry + .register_producer(Producer { sample_rx, log: log.clone() }) + // TODO(ben): when you change `register_producer` to not return + // a `Result`, delete this `expect`. thanks in advance! :) + .expect( + "`ProducerRegistry::register_producer()` will never \ + actually return an `Err`, so this shouldn't ever \ + happen...", + ); + // Also, register the producer for the HTTP API metrics. + registry + .register_producer(apictx.latencies.clone()) + // TODO(ben): do this one too pls + .expect( + "`ProducerRegistry::register_producer()` will never \ + actually return an `Err`, so this shouldn't ever \ + happen...", + ); + + tokio::spawn( + ServerManager { log, addrs: addrs_rx, registry }.run(cfg), + ) + }; + Self { inner: Some(Handles { addrs_tx, rack_id_tx, server }) } + } + + pub fn set_rack_id(&mut self, rack_id: Uuid) { + let tx = self.inner.as_mut().and_then(|i| i.rack_id_tx.take()); + if let Some(tx) = tx { + // If the task that starts sensor pollers has gone away already, + // we're probably shutting down, and shouldn't panic. + let _ = tx.send(rack_id); + } + // Ignoring duplicate attempt to set the rack ID... + } + + pub async fn update_server_addrs(&self, new_addrs: &[SocketAddrV6]) { + if let Some(ref inner) = self.inner { + inner.addrs_tx.send_if_modified(|current_addrs| { + if current_addrs.len() == new_addrs.len() + // N.B. that we could make this "faster" with a `HashSet`, + // but...the size of this Vec of addresses is probably going to + // two or three items, max, so the linear scan actually probably + // outperforms it... + && current_addrs.iter().all(|addr| new_addrs.contains(addr)) + { + return false; + } + + // Reuse existing `Vec` capacity if possible.This is almost + // certainly not performance-critical, but it makes me feel happy. + current_addrs.clear(); + current_addrs.extend_from_slice(new_addrs); + true + }); + } + } +} + +impl Drop for Metrics { + fn drop(&mut self) { + // Clean up our children on drop. + if let Some(ref mut inner) = self.inner { + inner.server.abort(); + } + } +} + +impl oximeter::Producer for Producer { + fn produce( + &mut self, + ) -> Result>, MetricsError> { + // Drain all samples currently in the queue into a `Vec`. + // + // N.B. 
it may be tempting to pursue an alternative design where we + // implement `Iterator` for a `broadcast::Receiver>` and + // just return that using `Receiver::resubscribe`...DON'T DO THAT! The + // `resubscribe` function creates a receiver at the current *tail* of + // the ringbuffer, so it won't see any samples produced *before* now. + // Which is the opposite of what we want! + let mut samples = Vec::with_capacity(self.sample_rx.len()); + // Because we receive the individual samples in a `Vec` of all samples + // produced by a poller, let's also sum the length of each of those + // `Vec`s here, so we can log it later. + let mut total_samples = 0; + // Also, track whether any sample chunks were dropped off the end of the + // ring buffer. + let mut dropped_chunks = 0; + + use broadcast::error::TryRecvError; + loop { + match self.sample_rx.try_recv() { + Ok(sample_chunk) => { + total_samples += sample_chunk.len(); + samples.push(sample_chunk) + } + // This error indicates that an old ringbuffer entry was + // overwritten. That's fine, just get the next one. + Err(TryRecvError::Lagged(dropped)) => { + dropped_chunks += dropped; + } + // We've drained all currently available samples! We're done here! + Err(TryRecvError::Empty) => break, + // This should only happen when shutting down. + Err(TryRecvError::Closed) => { + slog::debug!(&self.log, "sample producer channel closed"); + break; + } + } + } + + if dropped_chunks > 0 { + slog::info!( + &self.log, + "produced metric samples. some old sample chunks were dropped!"; + "samples" => total_samples, + "sample_chunks" => samples.len(), + "dropped_chunks" => dropped_chunks, + ); + } else { + slog::debug!( + &self.log, + "produced metric samples"; + "samples" => total_samples, + "sample_chunks" => samples.len(), + ); + } + + // There you go, that's all I've got. + Ok(Box::new(samples.into_iter().flatten())) + } +} + +async fn start_pollers( + log: slog::Logger, + apictx: Arc, + rack_id: oneshot::Receiver, + mgs_id: Uuid, + sample_tx: broadcast::Sender>, +) -> anyhow::Result<()> { + let switch = &apictx.mgmt_switch; + + // First, wait until we know what the rack ID is known... + let rack_id = rack_id + .await + .context("rack ID sender has gone away...we must be shutting down")?; + + // Wait for SP discovery to complete, if it hasn't already. + // TODO(eliza): presently, we busy-poll here. 
It would be nicer to + // replace the `OnceLock` in `ManagementSwitch` + // with a `tokio::sync::watch` + let sps = backoff::retry_notify_ext( + backoff::retry_policy_local(), + || async { switch.all_sps().map_err(backoff::BackoffError::transient) }, + |err, _, elapsed| { + let secs = elapsed.as_secs(); + if secs < 30 { + slog::debug!( + &log, + "waiting for SP discovery to complete..."; + "elapsed" => ?elapsed, + "error" => err, + ); + } else if secs < 180 { + slog::info!( + &log, + "still waiting for SP discovery to complete..."; + "elapsed" => ?elapsed, + "error" => err, + ) + } else { + slog::warn!( + &log, + "we have been waiting for SP discovery to complete \ + for a pretty long time!"; + "elapsed" => ?elapsed, + "error" => err, + ) + } + }, + ) + .await + .context("we should never return a fatal error here")?; + + slog::info!( + &log, + "starting to poll SP sensor data every {SP_POLL_INTERVAL:?}" + ); + + for (spid, _) in sps { + slog::info!( + &log, + "found a new little friend!"; + "sp_slot" => ?spid.slot, + "chassis_type" => ?spid.typ, + ); + + let poller = SpPoller { + spid, + rack_id, + mgs_id, + log: log.new(slog::o!( + "sp_slot" => spid.slot, + "chassis_type" => format!("{:?}", spid.typ), + )), + components: HashMap::new(), + known_state: None, + sample_tx: sample_tx.clone(), + }; + tokio::spawn(poller.run(apictx.clone())); + } + + Ok(()) +} + +impl SpPoller { + async fn run(mut self, apictx: Arc) { + let mut interval = tokio::time::interval(SP_POLL_INTERVAL); + let switch = &apictx.mgmt_switch; + let sp = match switch.sp(self.spid) { + Ok(sp) => sp, + Err(e) => { + // This should never happen, but it's not worth taking down the + // entire management network over that... + const MSG: &'static str = + "the `SpPoller::run` function is only called after \ + discovery completes successfully, and the `SpIdentifier` \ + used was returned by the management switch, \ + so it should be valid."; + if cfg!(debug_assertions) { + unreachable!( + "{MSG} nonetheless, we saw a {e:?} error when looking \ + up {:?}", + self.spid + ); + } else { + slog::error!( + &self.log, + "THIS SHOULDN'T HAPPEN: {MSG}"; + "error" => e, + "sp" => ?self.spid, + ); + return; + } + } + }; + loop { + interval.tick().await; + slog::trace!(&self.log, "interval elapsed, polling SP..."); + + match self.poll(sp).await { + // No sense cluttering the ringbuffer with empty vecs... + Ok(samples) if samples.is_empty() => { + slog::trace!( + &self.log, + "polled SP, no samples returned"; + "num_samples" => 0usize + ); + } + Ok(samples) => { + slog::trace!( + &self.log, + "polled SP successfully"; + "num_samples" => samples.len(), + ); + + if let Err(_) = self.sample_tx.send(samples) { + slog::debug!( + &self.log, + "all sample receiver handles have been dropped! \ + presumably we are shutting down..."; + ); + return; + } + } + // No SP is currently present for this ID. This may change in + // the future: a cubby that is not populated at present may have + // a sled added to it in the future. So, let's wait until it + // changes. + Err(CommunicationError::NoSpDiscovered) => { + slog::info!( + &self.log, + "no SP is present for this slot. waiting for a \ + little buddy to appear..."; + ); + let mut watch = sp.sp_addr_watch().clone(); + loop { + if let Some((addr, port)) = *watch.borrow_and_update() { + // Ladies and gentlemen...we got him! + slog::info!( + &self.log, + "found a SP, resuming polling."; + "sp_addr" => ?addr, + "sp_port" => ?port, + ); + break; + } + + // Wait for an address to be discovered. 
+ slog::debug!(&self.log, "waiting for a SP to appear."); + if watch.changed().await.is_err() { + slog::debug!( + &self.log, + "SP address watch has been closed, presumably \ + we are shutting down"; + ); + return; + } + } + } + Err(error) => { + slog::warn!( + &self.log, + "failed to poll SP, will try again momentarily..."; + "error" => %error, + ); + // TODO(eliza): we should probably have a metric for failed + // SP polls. + } + } + } + } + + async fn poll( + &mut self, + sp: &SingleSp, + ) -> Result, CommunicationError> { + let mut current_state = SpUnderstanding::from(sp.state().await?); + let mut samples = Vec::new(); + // If the SP's state changes dramatically *during* a poll, it may be + // necessary to re-do the metrics scrape, thus the loop. Normally, we + // will only loop a single time, but may retry if necessary. + loop { + // Check if the SP's state has changed. If it has, we need to make sure + // we still know what all of its sensors are. + if Some(¤t_state) != self.known_state.as_ref() { + // The SP's state appears to have changed. Time to make sure our + // understanding of its devices and identity is up to date! + + let chassis_kind = match self.spid.typ { + SpType::Sled => "sled", + SpType::Switch => "switch", + SpType::Power => "power", + }; + let model = stringify_byte_string(¤t_state.model[..]); + let serial = + stringify_byte_string(¤t_state.serial_number[..]); + let hubris_archive_id = + hex::encode(¤t_state.hubris_archive_id); + + slog::debug!( + &self.log, + "our little friend seems to have changed in some kind of way"; + "current_state" => ?current_state, + "known_state" => ?self.known_state, + "new_model" => %model, + "new_serial" => %serial, + "new_hubris_archive_id" => %hubris_archive_id, + ); + + let inv_devices = sp.inventory().await?.devices; + + // Clear out any previously-known devices, and preallocate capacity + // for all the new ones. + self.components.clear(); + self.components.reserve(inv_devices.len()); + + for dev in inv_devices { + // Skip devices which have nothing interesting for us. + if !dev + .capabilities + .contains(DeviceCapabilities::HAS_MEASUREMENT_CHANNELS) + { + continue; + } + let component_id = match dev.component.as_str() { + Some(c) => Cow::Owned(c.to_string()), + None => { + // These are supposed to always be strings. But, if we + // see one that's not a string, fall back to the hex + // representation rather than panicking. + let hex = hex::encode(dev.component.id); + slog::warn!( + &self.log, + "a SP component ID was not a string! this isn't \ + supposed to happen!"; + "component" => %hex, + "device" => ?dev, + ); + Cow::Owned(hex) + } + }; + + // TODO(eliza): i hate having to clone all these strings for + // every device on the SP...it would be cool if Oximeter let us + // reference count them... + let target = metric::HardwareComponent { + rack_id: self.rack_id, + gateway_id: self.mgs_id, + chassis_model: Cow::Owned(model.clone()), + chassis_revision: current_state.revision, + chassis_kind: Cow::Borrowed(chassis_kind), + chassis_serial: Cow::Owned(serial.clone()), + hubris_archive_id: Cow::Owned( + hubris_archive_id.clone(), + ), + slot: self.spid.slot as u32, + component_kind: Cow::Owned(dev.device), + component_id, + description: Cow::Owned(dev.description), + }; + match self.components.entry(dev.component) { + // Found a new device! 
+ hash_map::Entry::Vacant(entry) => { + slog::debug!( + &self.log, + "discovered a new component!"; + "component_id" => %target.component_id, + "component_kind" => %target.component_kind, + "description" => %target.component_id, + ); + entry.insert(ComponentMetrics { + target, + sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), + }); + } + // We previously had a known device for this thing, but + // the metrics target has changed, so we should reset + // its cumulative metrics. + hash_map::Entry::Occupied(mut entry) + if entry.get().target != target => + { + slog::trace!( + &self.log, + "target has changed, resetting cumulative metrics \ + for component"; + "component" => ?dev.component, + ); + entry.insert(ComponentMetrics { + target, + sensor_errors: HashMap::new(), + poll_errors: HashMap::new(), + }); + } + + // The target for this device hasn't changed, don't reset it. + hash_map::Entry::Occupied(_) => {} + } + } + + self.known_state = Some(current_state); + } + + // We will need capacity for *at least* the number of components on the + // SP --- it will probably be more, as several components have multiple + // measurement channels which will produce independent samples (e.g. a + // power rail will likely have both voltage and current measurements, + // and a device may have multiple rails...) but, this way, we can avoid + // *some* amount of reallocating... + samples.reserve(self.components.len()); + for (c, metrics) in &mut self.components { + // Metrics samples *should* always be well-formed. If we ever emit a + // messed up one, this is a programmer error, and therefore should + // fail in test, but should probably *not* take down the whole + // management gateway in a real-life rack, especially because it's + // probably going to happen again if we were to get restarted. + const BAD_SAMPLE: &str = + "we emitted a bad metrics sample! this should never happen"; + macro_rules! try_sample { + ($sample:expr) => { + match $sample { + Ok(sample) => samples.push(sample), + + Err(err) => { + slog::error!( + &self.log, + "{BAD_SAMPLE}!"; + "error" => %err, + ); + #[cfg(debug_assertions)] + unreachable!("{BAD_SAMPLE}: {err}"); + } + } + } + } + let details = match sp.component_details(*c).await { + Ok(deets) => deets, + // SP seems gone! + Err(CommunicationError::NoSpDiscovered) => { + return Err(CommunicationError::NoSpDiscovered) + } + Err(error) => { + slog::warn!( + &self.log, + "failed to read details on SP component"; + "sp_component" => %c, + "error" => %error, + ); + try_sample!(metrics.poll_error(comms_error_str(error))); + continue; + } + }; + if details.entries.is_empty() { + slog::warn!( + &self.log, + "a component which claimed to have measurement channels \ + had empty details. this seems weird..."; + "sp_component" => %c, + ); + try_sample!(metrics.poll_error("no_measurement_channels")); + continue; + } + + let ComponentMetrics { sensor_errors, target, .. } = metrics; + for d in details.entries { + let ComponentDetails::Measurement(m) = d else { + // If the component details are switch port details rather + // than measurement channels, ignore it for now. + continue; + }; + let sensor: Cow<'static, str> = Cow::Owned(m.name); + + // First, if there's a measurement error, increment the + // error count metric. We will synthesize a missing sample + // for the sensor's metric as well, after we produce the + // measurement error sample. 
+ // + // We do this first so that we only have to clone the + // sensor's name if there's an error, rather than always + // cloning it in *case* there's an error. + if let Err(error) = m.value { + let kind = match m.kind { + MeasurementKind::Temperature => "temperature", + MeasurementKind::Current => "current", + MeasurementKind::Voltage => "voltage", + MeasurementKind::Power => "power", + MeasurementKind::InputCurrent => "input_current", + MeasurementKind::InputVoltage => "input_voltage", + MeasurementKind::Speed => "fan_speed", + }; + let error = match error { + MeasurementError::InvalidSensor => "invalid_sensor", + MeasurementError::NoReading => "no_reading", + MeasurementError::NotPresent => "not_present", + MeasurementError::DeviceError => "device_error", + MeasurementError::DeviceUnavailable => { + "device_unavailable" + } + MeasurementError::DeviceTimeout => "device_timeout", + MeasurementError::DeviceOff => "device_off", + }; + let datum = sensor_errors + .entry(SensorErrorKey { + name: sensor.clone(), + kind, + error, + }) + .or_insert(Cumulative::new(0)); + // TODO(eliza): perhaps we should treat this as + // "level-triggered" and only increment the counter + // when the sensor has *changed* to an errored + // state after we have seen at least one good + // measurement from it since the last time the error + // was observed? + datum.increment(); + try_sample!(Sample::new( + target, + &metric::SensorErrorCount { + error: Cow::Borrowed(error), + sensor: sensor.clone(), + datum: *datum, + sensor_kind: Cow::Borrowed(kind), + }, + )); + } + + // I don't love this massive `match`, but because the + // `Sample::new_missing` constructor is a different function + // from `Sample::new`, we need separate branches for the + // error and not-error cases, rather than just doing + // something to produce a datum from both the `Ok` and + // `Error` cases... 
+ let sample = match (m.value, m.kind) { + (Ok(datum), MeasurementKind::Temperature) => { + Sample::new( + target, + &metric::Temperature { sensor, datum }, + ) + } + (Err(_), MeasurementKind::Temperature) => { + Sample::new_missing( + target, + &metric::Temperature { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Current) => Sample::new( + target, + &metric::Current { sensor, datum }, + ), + (Err(_), MeasurementKind::Current) => { + Sample::new_missing( + target, + &metric::Current { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Voltage) => Sample::new( + target, + &metric::Voltage { sensor, datum }, + ), + + (Err(_), MeasurementKind::Voltage) => { + Sample::new_missing( + target, + &metric::Voltage { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Power) => Sample::new( + target, + &metric::Power { sensor, datum }, + ), + (Err(_), MeasurementKind::Power) => { + Sample::new_missing( + target, + &metric::Power { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::InputCurrent) => { + Sample::new( + target, + &metric::InputCurrent { sensor, datum }, + ) + } + (Err(_), MeasurementKind::InputCurrent) => { + Sample::new_missing( + target, + &metric::InputCurrent { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::InputVoltage) => { + Sample::new( + target, + &metric::InputVoltage { sensor, datum }, + ) + } + (Err(_), MeasurementKind::InputVoltage) => { + Sample::new_missing( + target, + &metric::InputVoltage { sensor, datum: 0.0 }, + ) + } + (Ok(datum), MeasurementKind::Speed) => Sample::new( + target, + &metric::FanSpeed { sensor, datum }, + ), + (Err(_), MeasurementKind::Speed) => { + Sample::new_missing( + target, + &metric::FanSpeed { sensor, datum: 0.0 }, + ) + } + }; + try_sample!(sample); + } + } + + // Now, fetch the SP's state *again*. It is possible that, while we + // were scraping the SP's samples, the SP's identity changed in some + // way: perhaps its version was updated during the poll, or it + // was removed from the rack and replaced with an entirely different + // chassis! If that's the case, some of the samples we collected may + // have a metrics target describing the wrong thing (e.g. they could + // still have the previous firmware's `hubris_archive_id`, if the SP + // was updated). In that case, we need to throw away the samples we + // collected and try again, potentially rebuilding our understanding + // of the SP's inventory. + let state = SpUnderstanding::from(sp.state().await?); + if state == current_state { + // All good, the SP is still who we thought it was! We can + // "commit" this batch of samples + return Ok(samples); + } + + slog::info!( + &self.log, + "SP's state changed mid-poll! discarding current samples and \ + starting over!"; + "new_state" => ?state, + "current_state" => ?current_state, + ); + // Let's reuse the buffer we already have for the next batch of + // samples. + samples.clear(); + //...and try again with the new state. + current_state = state; + } + } +} + +/// The fields of the `gateway_messages` `VersionedSpState` and +/// `SpStateV1`/`SpStateV2`/`SpStateV3` that we actually care about for purposes +/// of determining whether our understanding of the SP's components are still +/// valid. +/// +/// In particular, we throw out the RoT state and the SP's power state, because +/// those changing won't actually invalidate our understanding of the SP's +/// components. 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+struct SpUnderstanding {
+    hubris_archive_id: [u8; 8],
+    serial_number: [u8; 32],
+    model: [u8; 32],
+    revision: u32,
+}
+
+impl From<VersionedSpState> for SpUnderstanding {
+    fn from(v: VersionedSpState) -> Self {
+        match v {
+            VersionedSpState::V1(gateway_messages::SpStateV1 {
+                hubris_archive_id,
+                serial_number,
+                model,
+                revision,
+                ..
+            }) => Self { hubris_archive_id, serial_number, model, revision },
+            VersionedSpState::V2(gateway_messages::SpStateV2 {
+                hubris_archive_id,
+                serial_number,
+                model,
+                revision,
+                ..
+            }) => Self { hubris_archive_id, serial_number, model, revision },
+            VersionedSpState::V3(gateway_messages::SpStateV3 {
+                hubris_archive_id,
+                serial_number,
+                model,
+                revision,
+                ..
+            }) => Self { hubris_archive_id, serial_number, model, revision },
+        }
+    }
+}
+
+// Reimplement this ourselves because we don't really care about
+// reading the RoT state at present. This is unfortunately copied
+// from `gateway_messages`.
+fn stringify_byte_string(bytes: &[u8]) -> String {
+    // We expect serial and model numbers to be ASCII and 0-padded: find the first 0
+    // byte and convert to a string. If that fails, hexlify the entire slice.
+    let first_zero = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
+
+    std::str::from_utf8(&bytes[..first_zero])
+        .map(|s| s.to_string())
+        .unwrap_or_else(|_err| hex::encode(bytes))
+}
+
+impl ServerManager {
+    async fn run(mut self, cfg: Option<MetricsConfig>) -> anyhow::Result<()> {
+        let (registration_address, bind_loopback) =
+            if let Some(MetricsConfig {
+                dev_bind_loopback,
+                dev_nexus_address,
+                ..
+            }) = cfg
+            {
+                if dev_bind_loopback || dev_nexus_address.is_some() {
+                    slog::warn!(
+                        &self.log,
+                        "using development metrics configuration overrides!";
+                        "nexus_address" => ?dev_nexus_address,
+                        "bind_loopback" => dev_bind_loopback,
+                    );
+                }
+                (dev_nexus_address, dev_bind_loopback)
+            } else {
+                (None, false)
+            };
+        let id = self.registry.producer_id();
+
+        let mut current_server: Option<oximeter_producer::Server> = None;
+        loop {
+            let current_ip = current_server.as_ref().map(|s| s.address().ip());
+            let mut new_ip = None;
+            for addr in self.addrs.borrow_and_update().iter() {
+                let &ip = addr.ip();
+                // Don't bind the metrics endpoint on ::1
+                if ip.is_loopback() && !bind_loopback {
+                    continue;
+                }
+                // If our current address is contained in the new addresses,
+                // no need to rebind.
+                if current_ip == Some(IpAddr::V6(ip)) {
+                    new_ip = None;
+                    break;
+                } else {
+                    new_ip = Some(ip);
+                }
+            }
+
+            if let Some(ip) = new_ip {
+                slog::debug!(
+                    &self.log,
+                    "rebinding producer server on new IP";
+                    "new_ip" => ?ip,
+                    "current_ip" => ?current_ip,
+                    "collection_interval" => ?OXIMETER_COLLECTION_INTERVAL,
+                    "producer_id" => ?id,
+                );
+                let server = {
+                    // Listen on any available socket, using the provided underlay IP.
+                    let address = SocketAddr::new(ip.into(), 0);
+
+                    let server_info = ProducerEndpoint {
+                        id,
+                        kind: ProducerKind::ManagementGateway,
+                        address,
+                        interval: OXIMETER_COLLECTION_INTERVAL,
+                    };
+                    let config = oximeter_producer::Config {
+                        server_info,
+                        registration_address,
+                        request_body_max_bytes: METRIC_REQUEST_MAX_SIZE,
+                        log: oximeter_producer::LogConfig::Logger(
+                            self.log.clone(),
+                        ),
+                    };
+                    oximeter_producer::Server::with_registry(
+                        self.registry.clone(),
+                        &config,
+                    )
+                    .context("failed to start producer server")?
+ }; + + slog::info!( + &self.log, + "bound metrics producer server"; + "collection_interval" => ?OXIMETER_COLLECTION_INTERVAL, + "producer_id" => ?id, + "address" => %server.address(), + ); + + if let Some(old_server) = current_server.replace(server) { + let old_addr = old_server.address(); + if let Err(error) = old_server.close().await { + slog::error!( + &self.log, + "failed to close old metrics producer server"; + "address" => %old_addr, + "error" => %error, + ); + } else { + slog::debug!( + &self.log, + "old metrics producer server shut down"; + "address" => %old_addr, + ) + } + } + } + + // Wait for a subsequent address change. + self.addrs.changed().await?; + } + } +} + +impl ComponentMetrics { + fn poll_error( + &mut self, + error_str: &'static str, + ) -> Result { + let datum = self + .poll_errors + .entry(error_str) + .or_insert_with(|| Cumulative::new(0)); + datum.increment(); + Sample::new( + &self.target, + &metric::PollErrorCount { + error: Cow::Borrowed(error_str), + datum: *datum, + }, + ) + } +} + +fn comms_error_str(error: CommunicationError) -> &'static str { + // TODO(eliza): a bunch of these probably can't be returned by the specific + // operations we try to do. It could be good to make the methods this code + // calls return a smaller enum of just the errors it might actually + // encounter? Figure this out later. + match error { + CommunicationError::NoSpDiscovered => "no_sp_discovered", + CommunicationError::InterfaceError(_) => "interface", + CommunicationError::ScopeIdChangingFrequently { .. } => { + "scope_id_changing_frequently" + } + CommunicationError::JoinMulticast { .. } => "join_multicast", + CommunicationError::UdpSendTo { .. } => "udp_send_to", + CommunicationError::UdpRecv(_) => "udp_recv", + CommunicationError::Deserialize { .. } => "deserialize", + CommunicationError::ExhaustedNumAttempts(_) => "exhausted_num_attempts", + CommunicationError::BadResponseType { .. } => "bad_response_type", + CommunicationError::SpError { .. } => "sp_error", + CommunicationError::BogusSerialConsoleState { .. } => { + "bogus_serial_console_state" + } + CommunicationError::VersionMismatch { .. } => { + "protocol_version_mismatch" + } + CommunicationError::TlvDeserialize { .. } => "tlv_deserialize", + CommunicationError::TlvDecode(_) => "tlv_decode", + CommunicationError::TlvPagination { .. } => "tlv_pagination", + CommunicationError::IpccKeyLookupValueTooLarge => { + "ipcc_key_lookup_value_too_large" + } + CommunicationError::UnexpectedTrailingData(_) => { + "unexpected_trailing_data" + } + CommunicationError::BadTrailingDataSize { .. 
} => { + "bad_trailing_data_size" + } + } +} diff --git a/gateway/tests/integration_tests/component_list.rs b/gateway/tests/integration_tests/component_list.rs index ec876c0783..993dcc9e93 100644 --- a/gateway/tests/integration_tests/component_list.rs +++ b/gateway/tests/integration_tests/component_list.rs @@ -57,7 +57,71 @@ async fn component_list() { capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS .bits(), presence: SpComponentPresence::Failed, - } + }, + SpComponentInfo { + component: "dev-1".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-2".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE Southeast temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-6".to_string(), + device: "at24csw080".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A VPD".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-7".to_string(), + device: "max5970".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A hot swap controller" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-8".to_string(), + device: "nvme_bmc".to_string(), + serial_number: None, + description: "FAKE U.2 A NVMe Basic Management Command" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-39".to_string(), + device: "tmp451".to_string(), + serial_number: None, + description: "FAKE T6 temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-53".to_string(), + device: "max31790".to_string(), + serial_number: None, + description: "FAKE Fan controller".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, ] ); @@ -67,14 +131,89 @@ async fn component_list() { assert_eq!( resp.components, - &[SpComponentInfo { - component: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), - device: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), - serial_number: None, - description: "FAKE host cpu".to_string(), - capabilities: 0, - presence: SpComponentPresence::Present, - },] + &[ + SpComponentInfo { + component: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), + device: SpComponent::SP3_HOST_CPU.const_as_str().to_string(), + serial_number: None, + description: "FAKE host cpu".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-0".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-1".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE temperature sensor".to_string(), 
+ capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-2".to_string(), + device: "tmp117".to_string(), + serial_number: None, + description: "FAKE Southeast temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-6".to_string(), + device: "at24csw080".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A VPD".to_string(), + capabilities: 0, + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-7".to_string(), + device: "max5970".to_string(), + serial_number: None, + description: "FAKE U.2 Sharkfin A hot swap controller" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-8".to_string(), + device: "nvme_bmc".to_string(), + serial_number: None, + description: "FAKE U.2 A NVMe Basic Management Command" + .to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-39".to_string(), + device: "tmp451".to_string(), + serial_number: None, + description: "FAKE T6 temperature sensor".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + SpComponentInfo { + component: "dev-53".to_string(), + device: "max31790".to_string(), + serial_number: None, + description: "FAKE Fan controller".to_string(), + capabilities: DeviceCapabilities::HAS_MEASUREMENT_CHANNELS + .bits(), + presence: SpComponentPresence::Present, + }, + ] ); // Get the component list for switch 0. 
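Aside, not part of the patch: a minimal standalone sketch of how the `stringify_byte_string` helper added to `gateway/src/metrics.rs` above renders the SP's zero-padded serial/model byte arrays, falling back to hex for non-UTF-8 input. It assumes only the same `hex` crate the patch already uses; the sample values are invented.

```rust
fn stringify_byte_string(bytes: &[u8]) -> String {
    // Same logic as the helper in gateway/src/metrics.rs above.
    let first_zero = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
    std::str::from_utf8(&bytes[..first_zero])
        .map(|s| s.to_string())
        .unwrap_or_else(|_err| hex::encode(bytes))
}

fn main() {
    // A zero-padded ASCII model number comes back trimmed to the string...
    let mut model = [0u8; 32];
    model[..9].copy_from_slice(b"913-00001");
    assert_eq!(stringify_byte_string(&model), "913-00001");

    // ...while bytes that are not valid UTF-8 are hex-encoded wholesale.
    assert_eq!(stringify_byte_string(&[0xff, 0xfe]), "fffe");
}
```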
diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index f6e60bb558..b3d189691c 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -396,6 +396,8 @@ pub struct BackgroundTaskConfig { /// configuration for region snapshot replacement garbage collection pub region_snapshot_replacement_garbage_collection: RegionSnapshotReplacementGarbageCollectionConfig, + /// configuration for region snapshot replacement step task + pub region_snapshot_replacement_step: RegionSnapshotReplacementStepConfig, } #[serde_as] @@ -648,6 +650,14 @@ pub struct RegionSnapshotReplacementGarbageCollectionConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct RegionSnapshotReplacementStepConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -897,6 +907,7 @@ mod test { lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 + region_snapshot_replacement_step.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 @@ -1067,6 +1078,10 @@ mod test { RegionSnapshotReplacementGarbageCollectionConfig { period_secs: Duration::from_secs(30), }, + region_snapshot_replacement_step: + RegionSnapshotReplacementStepConfig { + period_secs: Duration::from_secs(30), + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -1145,6 +1160,7 @@ mod test { lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 + region_snapshot_replacement_step.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus-config/src/postgres_config.rs b/nexus-config/src/postgres_config.rs index 2509ae4fca..0c72d2ba9e 100644 --- a/nexus-config/src/postgres_config.rs +++ b/nexus-config/src/postgres_config.rs @@ -5,6 +5,7 @@ //! Common objects used for configuration use std::fmt; +use std::net::SocketAddr; use std::ops::Deref; use std::str::FromStr; @@ -32,6 +33,29 @@ impl PostgresConfigWithUrl { pub fn url(&self) -> String { self.url_raw.clone() } + + /// Accesses the first ip / port pair within the URL. + /// + /// # Panics + /// + /// This method makes the assumption that the hostname has at least one + /// "host IP / port" pair which can be extracted. If the supplied URL + /// does not have such a pair, this function will panic. + // Yes, panicking in the above scenario sucks. But this type is already + // pretty ubiquitous within Omicron, and integration with the qorb + // connection pooling library requires access to database by SocketAddr. 
+ pub fn address(&self) -> SocketAddr { + let tokio_postgres::config::Host::Tcp(host) = + &self.config.get_hosts()[0] + else { + panic!("Non-TCP hostname"); + }; + let ip: std::net::IpAddr = + host.parse().expect("Failed to parse host as IP address"); + + let port = self.config.get_ports()[0]; + SocketAddr::new(ip, port) + } } impl FromStr for PostgresConfigWithUrl { diff --git a/nexus/db-model/src/producer_endpoint.rs b/nexus/db-model/src/producer_endpoint.rs index 74a7356adb..c2fab2de5a 100644 --- a/nexus/db-model/src/producer_endpoint.rs +++ b/nexus/db-model/src/producer_endpoint.rs @@ -22,6 +22,7 @@ impl_enum_type!( #[diesel(sql_type = ProducerKindEnum)] pub enum ProducerKind; + ManagementGateway => b"management_gateway" SledAgent => b"sled_agent" Service => b"service" Instance => b"instance" @@ -30,6 +31,9 @@ impl_enum_type!( impl From for ProducerKind { fn from(kind: internal::nexus::ProducerKind) -> Self { match kind { + internal::nexus::ProducerKind::ManagementGateway => { + ProducerKind::ManagementGateway + } internal::nexus::ProducerKind::SledAgent => ProducerKind::SledAgent, internal::nexus::ProducerKind::Service => ProducerKind::Service, internal::nexus::ProducerKind::Instance => ProducerKind::Instance, @@ -40,6 +44,9 @@ impl From for ProducerKind { impl From for internal::nexus::ProducerKind { fn from(kind: ProducerKind) -> Self { match kind { + ProducerKind::ManagementGateway => { + internal::nexus::ProducerKind::ManagementGateway + } ProducerKind::SledAgent => internal::nexus::ProducerKind::SledAgent, ProducerKind::Service => internal::nexus::ProducerKind::Service, ProducerKind::Instance => internal::nexus::ProducerKind::Instance, diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index f630bbbeac..f01f33c39d 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -144,7 +144,7 @@ table! { fec -> crate::SwitchLinkFecEnum, speed -> crate::SwitchLinkSpeedEnum, autoneg -> Bool, - lldp_link_config_id -> Uuid, + lldp_link_config_id -> Nullable, } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index d0542874fb..eaed2990c5 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(90, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(92, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,8 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(92, "lldp-link-config-nullable"), + KnownVersion::new(91, "add-management-gateway-producer-kind"), KnownVersion::new(90, "lookup-bgp-config-by-asn"), KnownVersion::new(89, "collapse_lldp_settings"), KnownVersion::new(88, "route-local-pref"), diff --git a/nexus/db-model/src/switch_port.rs b/nexus/db-model/src/switch_port.rs index 09f1327be2..bbcbb0748a 100644 --- a/nexus/db-model/src/switch_port.rs +++ b/nexus/db-model/src/switch_port.rs @@ -381,7 +381,7 @@ impl Into for SwitchPortConfig { #[diesel(table_name = switch_port_settings_link_config)] pub struct SwitchPortLinkConfig { pub port_settings_id: Uuid, - pub lldp_link_config_id: Uuid, + pub lldp_link_config_id: Option, pub link_name: String, pub mtu: SqlU16, pub fec: SwitchLinkFec, @@ -401,7 +401,7 @@ impl SwitchPortLinkConfig { ) -> Self { Self { port_settings_id, - lldp_link_config_id, + lldp_link_config_id: Some(lldp_link_config_id), link_name, fec, speed, diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 5192528944..c6c5caab6a 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -14,7 +14,6 @@ omicron-rpaths.workspace = true anyhow.workspace = true async-bb8-diesel.workspace = true async-trait.workspace = true -bb8.workspace = true camino.workspace = true chrono.workspace = true const_format.workspace = true @@ -22,6 +21,7 @@ diesel.workspace = true diesel-dtrace.workspace = true dropshot.workspace = true futures.workspace = true +internal-dns.workspace = true ipnetwork.workspace = true macaddr.workspace = true once_cell.workspace = true @@ -29,6 +29,7 @@ oxnet.workspace = true paste.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" +qorb = { workspace = true, features = [ "qtop" ] } rand.workspace = true ref-cast.workspace = true schemars.workspace = true @@ -45,8 +46,9 @@ strum.workspace = true swrite.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["full"] } -uuid.workspace = true +url.workspace = true usdt.workspace = true +uuid.workspace = true db-macros.workspace = true nexus-auth.workspace = true diff --git a/nexus/db-queries/src/db/collection_attach.rs b/nexus/db-queries/src/db/collection_attach.rs index 95e6afeb4b..c009d60483 100644 --- a/nexus/db-queries/src/db/collection_attach.rs +++ b/nexus/db-queries/src/db/collection_attach.rs @@ -578,9 +578,7 @@ where mod test { use super::*; use crate::db::{self, identity::Resource as IdentityResource}; - use async_bb8_diesel::{ - AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; use db_macros::Resource; use diesel::expression_methods::ExpressionMethods; @@ -617,8 +615,8 @@ mod test { async fn setup_db( pool: &crate::db::Pool, - ) -> bb8::PooledConnection> { - let connection = pool.pool().get().await.unwrap(); + ) -> crate::db::datastore::DataStoreConnection { + let connection = pool.claim().await.unwrap(); (*connection) .batch_execute_async( "CREATE SCHEMA IF NOT EXISTS test_schema; \ @@ -873,7 +871,7 @@ mod test { dev::test_setup_log("test_attach_missing_collection_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -902,7 +900,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_missing_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -939,7 +937,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -987,7 +985,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_once_synchronous"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1036,7 +1034,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_multiple_times"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1092,7 +1090,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_beyond_capacity_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1156,7 +1154,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_while_already_attached"); let mut db = 
test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1263,7 +1261,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1318,7 +1316,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_deleted_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1363,7 +1361,7 @@ mod test { let logctx = dev::test_setup_log("test_attach_without_update_filter"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/collection_detach.rs b/nexus/db-queries/src/db/collection_detach.rs index 03e09d41ca..bc547d5127 100644 --- a/nexus/db-queries/src/db/collection_detach.rs +++ b/nexus/db-queries/src/db/collection_detach.rs @@ -482,9 +482,7 @@ mod test { use super::*; use crate::db::collection_attach::DatastoreAttachTarget; use crate::db::{self, identity::Resource as IdentityResource}; - use async_bb8_diesel::{ - AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; use db_macros::Resource; use diesel::expression_methods::ExpressionMethods; @@ -521,8 +519,8 @@ mod test { async fn setup_db( pool: &crate::db::Pool, - ) -> bb8::PooledConnection> { - let connection = pool.pool().get().await.unwrap(); + ) -> crate::db::datastore::DataStoreConnection { + let connection = pool.claim().await.unwrap(); (*connection) .batch_execute_async( "CREATE SCHEMA IF NOT EXISTS test_schema; \ @@ -786,7 +784,7 @@ mod test { dev::test_setup_log("test_detach_missing_collection_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -814,7 +812,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_missing_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -850,7 +848,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -890,7 +888,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_while_already_detached"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let 
pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -954,7 +952,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_deleted_resource_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -998,7 +996,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_without_update_filter"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/collection_detach_many.rs b/nexus/db-queries/src/db/collection_detach_many.rs index 986cfb70b7..36755599d4 100644 --- a/nexus/db-queries/src/db/collection_detach_many.rs +++ b/nexus/db-queries/src/db/collection_detach_many.rs @@ -480,9 +480,7 @@ mod test { use super::*; use crate::db::collection_attach::DatastoreAttachTarget; use crate::db::{self, identity::Resource as IdentityResource}; - use async_bb8_diesel::{ - AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; use db_macros::Resource; use diesel::expression_methods::ExpressionMethods; @@ -519,8 +517,8 @@ mod test { async fn setup_db( pool: &crate::db::Pool, - ) -> bb8::PooledConnection> { - let connection = pool.pool().get().await.unwrap(); + ) -> crate::db::datastore::DataStoreConnection { + let connection = pool.claim().await.unwrap(); (*connection) .batch_execute_async( "CREATE SCHEMA IF NOT EXISTS test_schema; \ @@ -778,7 +776,7 @@ mod test { dev::test_setup_log("test_detach_missing_collection_fails"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -808,7 +806,7 @@ mod test { dev::test_setup_log("test_detach_missing_resource_succeeds"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -849,7 +847,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_once"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -892,7 +890,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_once_synchronous"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -937,7 +935,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_while_already_detached"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -993,7 +991,7 @@ mod test { let logctx = 
dev::test_setup_log("test_detach_filter_collection"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1044,7 +1042,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_deleted_resource"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -1102,7 +1100,7 @@ mod test { let logctx = dev::test_setup_log("test_detach_many"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/collection_insert.rs b/nexus/db-queries/src/db/collection_insert.rs index 69906e6498..3aaea6aeb1 100644 --- a/nexus/db-queries/src/db/collection_insert.rs +++ b/nexus/db-queries/src/db/collection_insert.rs @@ -406,9 +406,7 @@ where mod test { use super::*; use crate::db::{self, identity::Resource as IdentityResource}; - use async_bb8_diesel::{ - AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::{DateTime, Utc}; use db_macros::Resource; use diesel::expression_methods::ExpressionMethods; @@ -443,8 +441,8 @@ mod test { async fn setup_db( pool: &crate::db::Pool, - ) -> bb8::PooledConnection> { - let connection = pool.pool().get().await.unwrap(); + ) -> crate::db::datastore::DataStoreConnection { + let connection = pool.claim().await.unwrap(); (*connection) .batch_execute_async( "CREATE SCHEMA IF NOT EXISTS test_schema; \ @@ -560,7 +558,7 @@ mod test { let logctx = dev::test_setup_log("test_collection_not_present"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; @@ -590,7 +588,7 @@ mod test { let logctx = dev::test_setup_log("test_collection_present"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let conn = setup_db(&pool).await; diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 4169cc06bd..b997bf384f 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -511,7 +511,7 @@ mod test { let mut crdb = test_db::test_setup_database(&logctx.log).await; let cfg = db::Config { url: crdb.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); + let pool = Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); let datastore = Arc::new(DataStore::new(&logctx.log, pool, None).await.unwrap()); @@ -559,8 +559,8 @@ mod test { let mut crdb = test_db::test_setup_database(&logctx.log).await; let cfg = db::Config { url: crdb.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); - let conn = pool.pool().get().await.unwrap(); + let pool = 
Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); + let conn = pool.claim().await.unwrap(); // Mimic the layout of "schema/crdb". let config_dir = Utf8TempDir::new().unwrap(); @@ -671,8 +671,8 @@ mod test { let mut crdb = test_db::test_setup_database(&logctx.log).await; let cfg = db::Config { url: crdb.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); - let conn = pool.pool().get().await.unwrap(); + let pool = Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); + let conn = pool.claim().await.unwrap(); // Mimic the layout of "schema/crdb". let config_dir = Utf8TempDir::new().unwrap(); diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 1774a25c48..8888f2caaa 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -2164,7 +2164,7 @@ mod test { } impl CollectionCounts { - async fn new(conn: &DataStoreConnection<'_>) -> anyhow::Result { + async fn new(conn: &DataStoreConnection) -> anyhow::Result { conn.transaction_async(|conn| async move { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) .await diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 2cd21754f8..d424e08b61 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -27,7 +27,7 @@ use crate::db::{ error::{public_error_from_diesel, ErrorHandler}, }; use ::oximeter::types::ProducerRegistry; -use async_bb8_diesel::{AsyncRunQueryDsl, ConnectionManager}; +use async_bb8_diesel::AsyncRunQueryDsl; use diesel::pg::Pg; use diesel::prelude::*; use diesel::query_builder::{QueryFragment, QueryId}; @@ -174,8 +174,8 @@ impl RunnableQuery for T where { } -pub type DataStoreConnection<'a> = - bb8::PooledConnection<'a, ConnectionManager>; +pub type DataStoreConnection = + qorb::claim::Handle>; pub struct DataStore { log: Logger, @@ -279,8 +279,7 @@ impl DataStore { opctx: &OpContext, ) -> Result { opctx.authorize(authz::Action::Query, &authz::DATABASE).await?; - let pool = self.pool.pool(); - let connection = pool.get().await.map_err(|err| { + let connection = self.pool.claim().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; Ok(connection) @@ -294,7 +293,7 @@ impl DataStore { pub(super) async fn pool_connection_unauthorized( &self, ) -> Result { - let connection = self.pool.pool().get().await.map_err(|err| { + let connection = self.pool.claim().await.map_err(|err| { Error::unavail(&format!("Failed to access DB connection: {err}")) })?; Ok(connection) @@ -1587,7 +1586,7 @@ mod test { dev::test_setup_log("test_queries_do_not_require_full_table_scan"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); let datastore = DataStore::new(&logctx.log, Arc::new(pool), None).await.unwrap(); let conn = datastore.pool_connection_for_tests().await.unwrap(); @@ -1632,7 +1631,7 @@ mod test { let logctx = dev::test_setup_log("test_sled_ipv6_address_allocation"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); + let pool = Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); let datastore = Arc::new(DataStore::new(&logctx.log, pool, None).await.unwrap()); let opctx = OpContext::for_tests( 
diff --git a/nexus/db-queries/src/db/datastore/probe.rs b/nexus/db-queries/src/db/datastore/probe.rs index f3e0614552..434bf25760 100644 --- a/nexus/db-queries/src/db/datastore/probe.rs +++ b/nexus/db-queries/src/db/datastore/probe.rs @@ -62,7 +62,7 @@ impl super::DataStore { use db::schema::probe::dsl; use db::schema::vpc_subnet::dsl as vpc_subnet_dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let probes = match pagparams { PaginatedBy::Id(pagparams) => { @@ -77,7 +77,7 @@ impl super::DataStore { .filter(dsl::project_id.eq(authz_project.id())) .filter(dsl::time_deleted.is_null()) .select(Probe::as_select()) - .load_async(&*pool) + .load_async(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -99,7 +99,7 @@ impl super::DataStore { let db_subnet = vpc_subnet_dsl::vpc_subnet .filter(vpc_subnet_dsl::id.eq(interface.subnet_id)) .select(VpcSubnet::as_select()) - .first_async(&*pool) + .first_async(&*conn) .await .map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) @@ -126,7 +126,7 @@ impl super::DataStore { &self, opctx: &OpContext, probe: &Probe, - pool: &DataStoreConnection<'_>, + conn: &DataStoreConnection, ) -> LookupResult { use db::schema::vpc_subnet::dsl as vpc_subnet_dsl; @@ -143,7 +143,7 @@ impl super::DataStore { let db_subnet = vpc_subnet_dsl::vpc_subnet .filter(vpc_subnet_dsl::id.eq(interface.subnet_id)) .select(VpcSubnet::as_select()) - .first_async(&**pool) + .first_async(&**conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -172,20 +172,20 @@ impl super::DataStore { ) -> ListResultVec { use db::schema::probe::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let probes = paginated(dsl::probe, dsl::id, pagparams) .filter(dsl::time_deleted.is_null()) .filter(dsl::sled.eq(sled)) .select(Probe::as_select()) - .load_async(&*pool) + .load_async(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; let mut result = Vec::with_capacity(probes.len()); for probe in probes.into_iter() { - result.push(self.resolve_probe_info(opctx, &probe, &pool).await?); + result.push(self.resolve_probe_info(opctx, &probe, &conn).await?); } Ok(result) @@ -200,7 +200,7 @@ impl super::DataStore { ) -> LookupResult { use db::schema::probe; use db::schema::probe::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let name_or_id = name_or_id.clone(); @@ -211,7 +211,7 @@ impl super::DataStore { .filter(probe::project_id.eq(authz_project.id())) .select(Probe::as_select()) .limit(1) - .first_async::(&*pool) + .first_async::(&*conn) .await .map_err(|e| { public_error_from_diesel( @@ -227,7 +227,7 @@ impl super::DataStore { .filter(probe::project_id.eq(authz_project.id())) .select(Probe::as_select()) .limit(1) - .first_async::(&*pool) + .first_async::(&*conn) .await .map_err(|e| { public_error_from_diesel( @@ -240,7 +240,7 @@ impl super::DataStore { }), }?; - self.resolve_probe_info(opctx, &probe, &pool).await + self.resolve_probe_info(opctx, &probe, &conn).await } /// Add a probe to the data store. 
@@ -253,7 +253,7 @@ impl super::DataStore { ) -> CreateResult { //TODO in transaction use db::schema::probe::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let _eip = self .allocate_probe_ephemeral_ip( @@ -306,7 +306,7 @@ impl super::DataStore { let result = diesel::insert_into(dsl::probe) .values(probe.clone()) .returning(Probe::as_returning()) - .get_result_async(&*pool) + .get_result_async(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -322,7 +322,7 @@ impl super::DataStore { ) -> DeleteResult { use db::schema::probe; use db::schema::probe::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let name_or_id = name_or_id.clone(); @@ -334,7 +334,7 @@ impl super::DataStore { .filter(probe::project_id.eq(authz_project.id())) .select(probe::id) .limit(1) - .first_async::(&*pool) + .first_async::(&*conn) .await .map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) @@ -350,7 +350,7 @@ impl super::DataStore { .filter(dsl::id.eq(id)) .filter(dsl::project_id.eq(authz_project.id())) .set(dsl::time_deleted.eq(Utc::now())) - .execute_async(&*pool) + .execute_async(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; diff --git a/nexus/db-queries/src/db/datastore/pub_test_utils.rs b/nexus/db-queries/src/db/datastore/pub_test_utils.rs index 93a172bd15..bcf6a6c80f 100644 --- a/nexus/db-queries/src/db/datastore/pub_test_utils.rs +++ b/nexus/db-queries/src/db/datastore/pub_test_utils.rs @@ -29,7 +29,7 @@ pub async fn datastore_test( use crate::authn; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); + let pool = Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); let datastore = Arc::new(DataStore::new(&logctx.log, pool, None).await.unwrap()); diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index 2e09c1ac13..59748aa4db 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -455,7 +455,7 @@ impl DataStore { let lldp_link_ids: Vec = result .links .iter() - .map(|link| link.lldp_link_config_id) + .filter_map(|link| link.lldp_link_config_id) .collect(); use db::schema::lldp_link_config; @@ -1511,7 +1511,7 @@ async fn do_switch_port_settings_delete( // delete lldp configs use db::schema::lldp_link_config; let lldp_link_ids: Vec = - links.iter().map(|link| link.lldp_link_config_id).collect(); + links.iter().filter_map(|link| link.lldp_link_config_id).collect(); diesel::delete(lldp_link_config::dsl::lldp_link_config) .filter(lldp_link_config::id.eq_any(lldp_link_ids)) .execute_async(conn) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 14c3405a70..089a2914be 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -5,7 +5,6 @@ //! [`DataStore`] helpers for working with VMM records. use super::DataStore; -use crate::authz; use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; @@ -40,8 +39,13 @@ use uuid::Uuid; /// The result of an [`DataStore::vmm_and_migration_update_runtime`] call, /// indicating which records were updated. 
-#[derive(Copy, Clone, Debug)] +#[derive(Clone, Debug)] pub struct VmmStateUpdateResult { + /// The VMM record that the update query found and possibly updated. + /// + /// NOTE: This is the record prior to the update! + pub found_vmm: Vmm, + /// `true` if the VMM record was updated, `false` otherwise. pub vmm_updated: bool, @@ -108,14 +112,10 @@ impl DataStore { pub async fn vmm_fetch( &self, opctx: &OpContext, - authz_instance: &authz::Instance, vmm_id: &PropolisUuid, ) -> LookupResult { - opctx.authorize(authz::Action::Read, authz_instance).await?; - let vmm = dsl::vmm .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) - .filter(dsl::instance_id.eq(authz_instance.id())) .filter(dsl::time_deleted.is_null()) .select(Vmm::as_select()) .get_result_async(&*self.pool_connection_authorized(opctx).await?) @@ -233,13 +233,21 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); async move { - let vmm_updated = self + let vmm_update_result = self .vmm_update_runtime_on_connection( &conn, &vmm_id, new_runtime, ) - .await.map(|r| match r.status { UpdateStatus::Updated => true, UpdateStatus::NotUpdatedButExists => false })?; + .await?; + + + let found_vmm = vmm_update_result.found; + let vmm_updated = match vmm_update_result.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false + }; + let migration_out_updated = match migration_out { Some(migration) => { let r = self.migration_update_source_on_connection( @@ -287,6 +295,7 @@ impl DataStore { None => false, }; Ok(VmmStateUpdateResult { + found_vmm, vmm_updated, migration_in_updated, migration_out_updated, diff --git a/nexus/db-queries/src/db/explain.rs b/nexus/db-queries/src/db/explain.rs index 24fd993040..52844c204f 100644 --- a/nexus/db-queries/src/db/explain.rs +++ b/nexus/db-queries/src/db/explain.rs @@ -124,8 +124,7 @@ mod test { } async fn create_schema(pool: &db::Pool) { - pool.pool() - .get() + pool.claim() .await .unwrap() .batch_execute_async( @@ -145,8 +144,8 @@ mod test { let logctx = dev::test_setup_log("test_explain_async"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); - let conn = pool.pool().get().await.unwrap(); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); + let conn = pool.claim().await.unwrap(); create_schema(&pool).await; @@ -170,8 +169,8 @@ mod test { let logctx = dev::test_setup_log("test_explain_full_table_scan"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); - let conn = pool.pool().get().await.unwrap(); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); + let conn = pool.claim().await.unwrap(); create_schema(&pool).await; diff --git a/nexus/db-queries/src/db/pagination.rs b/nexus/db-queries/src/db/pagination.rs index 4fc1cf5966..9920440ade 100644 --- a/nexus/db-queries/src/db/pagination.rs +++ b/nexus/db-queries/src/db/pagination.rs @@ -354,7 +354,7 @@ mod test { async fn populate_users(pool: &db::Pool, values: &Vec<(i64, i64)>) { use schema::test_users::dsl; - let conn = pool.pool().get().await.unwrap(); + let conn = pool.claim().await.unwrap(); // The indexes here work around the check that prevents full table // scans. 
@@ -392,7 +392,7 @@ mod test { pool: &db::Pool, query: BoxedQuery, ) -> Vec { - let conn = pool.pool().get().await.unwrap(); + let conn = pool.claim().await.unwrap(); query.select(User::as_select()).load_async(&*conn).await.unwrap() } @@ -402,7 +402,7 @@ mod test { dev::test_setup_log("test_paginated_single_column_ascending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); use schema::test_users::dsl; @@ -437,7 +437,7 @@ mod test { dev::test_setup_log("test_paginated_single_column_descending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); use schema::test_users::dsl; @@ -472,7 +472,7 @@ mod test { dev::test_setup_log("test_paginated_multicolumn_ascending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); use schema::test_users::dsl; @@ -526,7 +526,7 @@ mod test { dev::test_setup_log("test_paginated_multicolumn_descending"); let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = db::Pool::new(&logctx.log, &cfg); + let pool = db::Pool::new_single_host(&logctx.log, &cfg); use schema::test_users::dsl; diff --git a/nexus/db-queries/src/db/pool.rs b/nexus/db-queries/src/db/pool.rs index 497c8d97c5..dccee6fa3f 100644 --- a/nexus/db-queries/src/db/pool.rs +++ b/nexus/db-queries/src/db/pool.rs @@ -3,108 +3,155 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Database connection pooling -// This whole thing is a placeholder for prototyping. -// -// TODO-robustness TODO-resilience We will want to carefully think about the -// connection pool that we use and its parameters. It's not clear from the -// survey so far whether an existing module is suitable for our purposes. See -// the Cueball Internals document for details on the sorts of behaviors we'd -// like here. Even if by luck we stick with bb8, we definitely want to think -// through the various parameters. -// -// Notes about bb8's behavior: -// * When the database is completely offline, and somebody wants a connection, -// it still waits for the connection timeout before giving up. That seems -// like not what we want. (To be clear, this is a failure mode where we know -// the database is offline, not one where it's partitioned and we can't tell.) -// * Although the `build_unchecked()` builder allows the pool to start up with -// no connections established (good), it also _seems_ to not establish any -// connections even when it could, resulting in a latency bubble for the first -// operation after startup. That's not what we're looking for. -// // TODO-design Need TLS support (the types below hardcode NoTls). 
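Not part of the patch: the test churn above (every `db::Pool::new` becoming `new_single_host`, and `pool.pool().get()` becoming `pool.claim()`) follows directly from the new pool API in the rewritten `pool.rs` below. A minimal sketch of the updated call-site shape, assuming the crate is consumed as `nexus_db_queries::db` and a test-style `db::Config` is already in hand; names here are illustrative.

```rust
use nexus_db_queries::db;

// Illustrative only: mirrors how the updated tests obtain a connection.
async fn example(log: &slog::Logger, cfg: &db::Config) -> anyhow::Result<()> {
    // Single-host, qorb-backed pool (no DNS resolution), as the tests use.
    let pool = db::Pool::new_single_host(log, cfg);
    // A claim replaces the old `pool.pool().get()` bb8 checkout.
    let conn = pool.claim().await?;
    // Queries continue to run against `&*conn`, same as before.
    drop(conn);
    Ok(())
}
```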
use super::Config as DbConfig; -use async_bb8_diesel::ConnectionError; -use async_bb8_diesel::ConnectionManager; +use crate::db::pool_connection::{DieselPgConnector, DieselPgConnectorArgs}; + +use qorb::backend; +use qorb::policy::Policy; +use qorb::resolver::{AllBackends, Resolver}; +use qorb::resolvers::dns::{DnsResolver, DnsResolverConfig}; +use qorb::service; +use slog::Logger; +use std::collections::BTreeMap; +use std::net::SocketAddr; +use std::sync::Arc; +use tokio::sync::watch; pub use super::pool_connection::DbConnection; +type QorbConnection = async_bb8_diesel::Connection; +type QorbPool = qorb::pool::Pool; + /// Wrapper around a database connection pool. /// /// Expected to be used as the primary interface to the database. pub struct Pool { - pool: bb8::Pool>, + inner: QorbPool, } -impl Pool { - pub fn new(log: &slog::Logger, db_config: &DbConfig) -> Self { - // Make sure diesel-dtrace's USDT probes are enabled. - usdt::register_probes().expect("Failed to register USDT DTrace probes"); - Self::new_builder(log, db_config, bb8::Builder::new()) - } +// Provides an alternative to the DNS resolver for cases where we want to +// contact the database without performing resolution. +struct SingleHostResolver { + tx: watch::Sender, +} - pub fn new_failfast_for_tests( - log: &slog::Logger, - db_config: &DbConfig, - ) -> Self { - Self::new_builder( - log, - db_config, - bb8::Builder::new() - .connection_timeout(std::time::Duration::from_millis(1)), - ) +impl SingleHostResolver { + fn new(config: &DbConfig) -> Self { + let backends = Arc::new(BTreeMap::from([( + backend::Name::new("singleton"), + backend::Backend { address: config.url.address() }, + )])); + let (tx, _rx) = watch::channel(backends.clone()); + Self { tx } } +} - fn new_builder( - log: &slog::Logger, - db_config: &DbConfig, - builder: bb8::Builder>, - ) -> Self { - let url = db_config.url.url(); - let log = log.new(o!( - "database_url" => url.clone(), - "component" => "db::Pool" - )); - info!(&log, "database connection pool"); - let error_sink = LoggingErrorSink::new(log); - let manager = ConnectionManager::::new(url); - let pool = builder - .connection_customizer(Box::new( - super::pool_connection::ConnectionCustomizer::new(), - )) - .error_sink(Box::new(error_sink)) - .build_unchecked(manager); - Pool { pool } +impl Resolver for SingleHostResolver { + fn monitor(&mut self) -> watch::Receiver { + self.tx.subscribe() } +} - /// Returns a reference to the underlying pool. - pub fn pool(&self) -> &bb8::Pool> { - &self.pool - } +fn make_dns_resolver( + bootstrap_dns: Vec, +) -> qorb::resolver::BoxedResolver { + Box::new(DnsResolver::new( + service::Name(internal_dns::ServiceName::Cockroach.srv_name()), + bootstrap_dns, + DnsResolverConfig { + hardcoded_ttl: Some(tokio::time::Duration::MAX), + ..Default::default() + }, + )) } -#[derive(Clone, Debug)] -struct LoggingErrorSink { - log: slog::Logger, +fn make_single_host_resolver( + config: &DbConfig, +) -> qorb::resolver::BoxedResolver { + Box::new(SingleHostResolver::new(config)) } -impl LoggingErrorSink { - fn new(log: slog::Logger) -> LoggingErrorSink { - LoggingErrorSink { log } - } +fn make_postgres_connector( + log: &Logger, +) -> qorb::backend::SharedConnector { + // Create postgres connections. + // + // We're currently relying on the DieselPgConnector doing the following: + // - Disallowing full table scans in its implementation of "on_acquire" + // - Creating async_bb8_diesel connections that also wrap DTraceConnections. 
+ let user = "root"; + let db = "omicron"; + let args = vec![("sslmode", "disable")]; + Arc::new(DieselPgConnector::new( + log, + DieselPgConnectorArgs { user, db, args }, + )) } -impl bb8::ErrorSink for LoggingErrorSink { - fn sink(&self, error: ConnectionError) { - error!( - &self.log, - "database connection error"; - "error_message" => #%error - ); +impl Pool { + /// Creates a new qorb-backed connection pool to the database. + /// + /// Creating this pool does not necessarily wait for connections to become + /// available, as backends may shift over time. + pub fn new(log: &Logger, bootstrap_dns: Vec) -> Self { + // Make sure diesel-dtrace's USDT probes are enabled. + usdt::register_probes().expect("Failed to register USDT DTrace probes"); + + let resolver = make_dns_resolver(bootstrap_dns); + let connector = make_postgres_connector(log); + + let policy = Policy::default(); + Pool { inner: qorb::pool::Pool::new(resolver, connector, policy) } + } + + /// Creates a new qorb-backed connection pool to a single instance of the + /// database. + /// + /// This is intended for tests that want to skip DNS resolution, relying + /// on a single instance of the database. + /// + /// In production, [Self::new] should be preferred. + pub fn new_single_host(log: &Logger, db_config: &DbConfig) -> Self { + // Make sure diesel-dtrace's USDT probes are enabled. + usdt::register_probes().expect("Failed to register USDT DTrace probes"); + + let resolver = make_single_host_resolver(db_config); + let connector = make_postgres_connector(log); + + let policy = Policy::default(); + Pool { inner: qorb::pool::Pool::new(resolver, connector, policy) } + } + + /// Creates a new qorb-backed connection pool which returns an error + /// if claims are not available within one millisecond. + /// + /// This is intended for test-only usage, in particular for tests where + /// claim requests should rapidly return errors when a backend has been + /// intentionally disabled. + #[cfg(any(test, feature = "testing"))] + pub fn new_single_host_failfast( + log: &Logger, + db_config: &DbConfig, + ) -> Self { + // Make sure diesel-dtrace's USDT probes are enabled. + usdt::register_probes().expect("Failed to register USDT DTrace probes"); + + let resolver = make_single_host_resolver(db_config); + let connector = make_postgres_connector(log); + + let policy = Policy { + claim_timeout: tokio::time::Duration::from_millis(1), + ..Default::default() + }; + Pool { inner: qorb::pool::Pool::new(resolver, connector, policy) } } - fn boxed_clone(&self) -> Box> { - Box::new(self.clone()) + /// Returns a connection from the pool + pub async fn claim( + &self, + ) -> anyhow::Result> { + Ok(self.inner.claim().await?) } } diff --git a/nexus/db-queries/src/db/pool_connection.rs b/nexus/db-queries/src/db/pool_connection.rs index dae6a0ee51..9a33370a5a 100644 --- a/nexus/db-queries/src/db/pool_connection.rs +++ b/nexus/db-queries/src/db/pool_connection.rs @@ -4,46 +4,139 @@ //! Customization that happens on each connection as they're acquired. 
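Illustrative sketch (not part of the patch): the pool.rs hunks above replace the bb8-backed `db::Pool` with a qorb-backed one. Production callers construct it from internal-DNS bootstrap addresses via `Pool::new`, tests point it at a single CockroachDB instance via `Pool::new_single_host` (or `Pool::new_single_host_failfast`), and connections are obtained through `Pool::claim()` instead of `pool.pool().get()`. The sketch below shows the resulting call pattern under some assumptions: the bootstrap DNS address is made up, error handling is simplified, and the element type of `bootstrap_dns` (elided in this rendering of the diff) is taken to be `std::net::SocketAddr`.

use std::net::SocketAddr;

// A minimal usage sketch under the assumptions stated above; not code from
// this patch.
async fn pool_usage_sketch(
    log: &slog::Logger,
    db_config: &nexus_db_queries::db::Config,
) -> anyhow::Result<()> {
    // Production: CockroachDB backends are discovered through internal DNS,
    // so the pool can follow nodes as they come and go. The DNS server
    // address here is hypothetical.
    let bootstrap_dns: Vec<SocketAddr> =
        vec!["[fd00:1122:3344:1::1]:53".parse()?];
    let pool = nexus_db_queries::db::Pool::new(log, bootstrap_dns);

    // Tests: skip DNS resolution and talk to one known database instance.
    let test_pool =
        nexus_db_queries::db::Pool::new_single_host(log, db_config);

    // Either way, a connection is now a qorb claim rather than a handle
    // fetched from an inner bb8 pool.
    let conn = pool.claim().await?;
    let test_conn = test_pool.claim().await?;
    drop((conn, test_conn));
    Ok(())
}

Because qorb resolves backends continuously, `Pool::new` does not wait for a connection at construction time; the single-host variants exist so tests can bypass DNS entirely, as the doc comments in the hunk above describe.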
+use anyhow::anyhow; +use async_bb8_diesel::AsyncR2D2Connection; use async_bb8_diesel::AsyncSimpleConnection; -use async_bb8_diesel::Connection; -use async_bb8_diesel::ConnectionError; use async_trait::async_trait; -use bb8::CustomizeConnection; +use diesel::Connection; use diesel::PgConnection; use diesel_dtrace::DTraceConnection; +use qorb::backend::{self, Backend, Error}; +use slog::Logger; +use url::Url; pub type DbConnection = DTraceConnection; pub const DISALLOW_FULL_TABLE_SCAN_SQL: &str = "set disallow_full_table_scans = on; set large_full_scan_rows = 0;"; -/// A customizer for all new connections made to CockroachDB, from Diesel. -#[derive(Debug)] -pub(crate) struct ConnectionCustomizer {} +/// A [backend::Connector] which provides access to [PgConnection]. +pub(crate) struct DieselPgConnector { + log: Logger, + user: String, + db: String, + args: Vec<(String, String)>, +} + +pub(crate) struct DieselPgConnectorArgs<'a> { + pub(crate) user: &'a str, + pub(crate) db: &'a str, + pub(crate) args: Vec<(&'a str, &'a str)>, +} -impl ConnectionCustomizer { - pub(crate) fn new() -> Self { - Self {} +impl DieselPgConnector { + /// Creates a new "connector" to a database, which + /// swaps out the IP address at runtime depending on the selected backend. + /// + /// Format of the url is: + /// + /// - postgresql://{user}@{address}/{db} + /// + /// Or, if arguments are supplied: + /// + /// - postgresql://{user}@{address}/{db}?{args} + pub(crate) fn new(log: &Logger, args: DieselPgConnectorArgs<'_>) -> Self { + let DieselPgConnectorArgs { user, db, args } = args; + Self { + log: log.clone(), + user: user.to_string(), + db: db.to_string(), + args: args + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + } } - async fn disallow_full_table_scans( + fn to_url( &self, - conn: &mut Connection, - ) -> Result<(), ConnectionError> { - conn.batch_execute_async(DISALLOW_FULL_TABLE_SCAN_SQL).await?; - Ok(()) + address: std::net::SocketAddr, + ) -> Result { + let user = &self.user; + let db = &self.db; + let mut url = + Url::parse(&format!("postgresql://{user}@{address}/{db}"))?; + + for (k, v) in &self.args { + url.query_pairs_mut().append_pair(k, v); + } + + Ok(url.as_str().to_string()) } } #[async_trait] -impl CustomizeConnection, ConnectionError> - for ConnectionCustomizer -{ +impl backend::Connector for DieselPgConnector { + type Connection = async_bb8_diesel::Connection; + + async fn connect( + &self, + backend: &Backend, + ) -> Result { + let url = self.to_url(backend.address).map_err(Error::Other)?; + + let conn = tokio::task::spawn_blocking(move || { + let pg_conn = DbConnection::establish(&url) + .map_err(|e| Error::Other(anyhow!(e)))?; + Ok::<_, Error>(async_bb8_diesel::Connection::new(pg_conn)) + }) + .await + .expect("Task panicked establishing connection") + .map_err(|e| { + warn!( + self.log, + "Failed to make connection"; + "error" => e.to_string(), + "backend" => backend.address, + ); + e + })?; + Ok(conn) + } + async fn on_acquire( &self, - conn: &mut Connection, - ) -> Result<(), ConnectionError> { - self.disallow_full_table_scans(conn).await?; + conn: &mut Self::Connection, + ) -> Result<(), Error> { + conn.batch_execute_async(DISALLOW_FULL_TABLE_SCAN_SQL).await.map_err( + |e| { + warn!( + self.log, + "Failed on_acquire execution"; + "error" => e.to_string() + ); + Error::Other(anyhow!(e)) + }, + )?; Ok(()) } + + async fn is_valid(&self, conn: &mut Self::Connection) -> Result<(), Error> { + let is_broken = conn.is_broken_async().await; + if is_broken { + warn!( 
+ self.log, + "Failed is_valid check; connection known to be broken" + ); + return Err(Error::Other(anyhow!("Connection broken"))); + } + conn.ping_async().await.map_err(|e| { + warn!( + self.log, + "Failed is_valid check; connection failed ping"; + "error" => e.to_string() + ); + Error::Other(anyhow!(e)) + }) + } } diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index 7ea44b33fb..4d752d451b 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -918,7 +918,8 @@ mod tests { crate::db::datastore::test_utils::datastore_test(&logctx, &db) .await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); + let pool = + Arc::new(crate::db::Pool::new_single_host(&logctx.log, &cfg)); let db_datastore = Arc::new( crate::db::DataStore::new(&logctx.log, Arc::clone(&pool), None) .await diff --git a/nexus/db-queries/src/db/queries/next_item.rs b/nexus/db-queries/src/db/queries/next_item.rs index 769c891349..658d151a5b 100644 --- a/nexus/db-queries/src/db/queries/next_item.rs +++ b/nexus/db-queries/src/db/queries/next_item.rs @@ -616,7 +616,7 @@ mod tests { } async fn setup_test_schema(pool: &db::Pool) { - let connection = pool.pool().get().await.unwrap(); + let connection = pool.claim().await.unwrap(); (*connection) .batch_execute_async( "CREATE SCHEMA IF NOT EXISTS test_schema; \ @@ -708,8 +708,9 @@ mod tests { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); - let conn = pool.pool().get().await.unwrap(); + let pool = + Arc::new(crate::db::Pool::new_single_host(&logctx.log, &cfg)); + let conn = pool.claim().await.unwrap(); // We're going to operate on a separate table, for simplicity. setup_test_schema(&pool).await; @@ -770,8 +771,9 @@ mod tests { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); - let conn = pool.pool().get().await.unwrap(); + let pool = + Arc::new(crate::db::Pool::new_single_host(&logctx.log, &cfg)); + let conn = pool.claim().await.unwrap(); // We're going to operate on a separate table, for simplicity. 
setup_test_schema(&pool).await; diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 7cf378d53b..dbf37fda2e 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -507,8 +507,8 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = crate::db::Pool::new(&logctx.log, &cfg); - let conn = pool.pool().get().await.unwrap(); + let pool = crate::db::Pool::new_single_host(&logctx.log, &cfg); + let conn = pool.claim().await.unwrap(); let volume_id = Uuid::new_v4(); let params = RegionParameters { diff --git a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs index 902d955a79..9d2ed04c85 100644 --- a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs +++ b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs @@ -568,8 +568,8 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = crate::db::Pool::new(&logctx.log, &cfg); - let conn = pool.pool().get().await.unwrap(); + let pool = crate::db::Pool::new_single_host(&logctx.log, &cfg); + let conn = pool.claim().await.unwrap(); let id = Uuid::nil(); let project_id = Uuid::nil(); @@ -597,8 +597,8 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = crate::db::Pool::new(&logctx.log, &cfg); - let conn = pool.pool().get().await.unwrap(); + let pool = crate::db::Pool::new_single_host(&logctx.log, &cfg); + let conn = pool.claim().await.unwrap(); let id = Uuid::nil(); let project_id = Uuid::nil(); @@ -624,8 +624,8 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = crate::db::Pool::new(&logctx.log, &cfg); - let conn = pool.pool().get().await.unwrap(); + let pool = crate::db::Pool::new_single_host(&logctx.log, &cfg); + let conn = pool.claim().await.unwrap(); let id = InstanceUuid::nil(); let project_id = Uuid::nil(); @@ -650,8 +650,8 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = crate::db::Pool::new(&logctx.log, &cfg); - let conn = pool.pool().get().await.unwrap(); + let pool = crate::db::Pool::new_single_host(&logctx.log, &cfg); + let conn = pool.claim().await.unwrap(); let id = InstanceUuid::nil(); let project_id = Uuid::nil(); diff --git a/nexus/db-queries/src/db/queries/vpc_subnet.rs b/nexus/db-queries/src/db/queries/vpc_subnet.rs index 8cbf4495ca..85c771c050 100644 --- a/nexus/db-queries/src/db/queries/vpc_subnet.rs +++ b/nexus/db-queries/src/db/queries/vpc_subnet.rs @@ -313,8 +313,9 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); - let conn = pool.pool().get().await.unwrap(); + let pool = + Arc::new(crate::db::Pool::new_single_host(&logctx.log, &cfg)); + let conn = pool.claim().await.unwrap(); let explain = query.explain_async(&conn).await.unwrap(); 
println!("{explain}"); db.cleanup().await.unwrap(); @@ -352,7 +353,8 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); + let pool = + Arc::new(crate::db::Pool::new_single_host(&logctx.log, &cfg)); let db_datastore = Arc::new( crate::db::DataStore::new(&log, Arc::clone(&pool), None) .await @@ -544,7 +546,8 @@ mod test { let log = logctx.log.new(o!()); let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); + let pool = + Arc::new(crate::db::Pool::new_single_host(&logctx.log, &cfg)); let db_datastore = Arc::new( crate::db::DataStore::new(&log, Arc::clone(&pool), None) .await diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index c87e1255b5..e63b155fc6 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -141,6 +141,7 @@ saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 +region_snapshot_replacement_step.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index f844adccbe..bca3f7f2c4 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -127,6 +127,7 @@ saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 +region_snapshot_replacement_step.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/internal-api/src/lib.rs b/nexus/internal-api/src/lib.rs index 7ac3e42f57..12e99ba23b 100644 --- a/nexus/internal-api/src/lib.rs +++ b/nexus/internal-api/src/lib.rs @@ -33,14 +33,14 @@ use omicron_common::{ DiskRuntimeState, DownstairsClientStopRequest, DownstairsClientStopped, ProducerEndpoint, ProducerRegistrationResponse, RepairFinishInfo, RepairProgress, - RepairStartInfo, SledInstanceState, + RepairStartInfo, SledVmmState, }, }, update::ArtifactId, }; use omicron_uuid_kinds::{ - DemoSagaUuid, DownstairsKind, SledUuid, TypedUuid, UpstairsKind, - UpstairsRepairKind, + DemoSagaUuid, DownstairsKind, PropolisUuid, SledUuid, TypedUuid, + UpstairsKind, UpstairsRepairKind, }; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -108,15 +108,15 @@ pub trait NexusInternalApi { body: TypedBody, ) -> Result, HttpError>; - /// Report updated state for an instance. + /// Report updated state for a VMM. 
#[endpoint { method = PUT, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] async fn cpapi_instances_put( rqctx: RequestContext, - path_params: Path, - new_runtime_state: TypedBody, + path_params: Path, + new_runtime_state: TypedBody, ) -> Result; #[endpoint { @@ -568,6 +568,12 @@ pub struct InstancePathParam { pub instance_id: Uuid, } +/// Path parameters for VMM requests (internal API) +#[derive(Deserialize, JsonSchema)] +pub struct VmmPathParam { + pub propolis_id: PropolisUuid, +} + #[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)] pub struct CollectorIdPathParams { /// The ID of the oximeter collector. diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index a531b66df4..1c62e553a8 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -16,6 +16,7 @@ dns-service-client.workspace = true chrono.workspace = true futures.workspace = true internal-dns.workspace = true +newtype-uuid.workspace = true nexus-config.workspace = true nexus-db-model.workspace = true nexus-db-queries.workspace = true diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 9ca14f8e24..1c878a9ada 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -467,6 +467,7 @@ mod test { use internal_dns::resolver::Resolver; use internal_dns::ServiceName; use internal_dns::DNS_ZONE; + use newtype_uuid::GenericUuid; use nexus_db_model::DnsGroup; use nexus_db_model::Silo; use nexus_db_queries::authn; @@ -478,6 +479,8 @@ mod test { use nexus_reconfigurator_planning::blueprint_builder::EnsureMultiple; use nexus_reconfigurator_planning::example::example; use nexus_reconfigurator_preparation::PlanningInputFromDb; + use nexus_sled_agent_shared::inventory::OmicronZoneConfig; + use nexus_sled_agent_shared::inventory::OmicronZoneType; use nexus_sled_agent_shared::inventory::ZoneKind; use nexus_test_utils::resource_helpers::create_silo; use nexus_test_utils::resource_helpers::DiskTestBuilder; @@ -490,6 +493,9 @@ mod test { use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::CockroachDbSettings; + pub use nexus_types::deployment::OmicronZoneExternalFloatingAddr; + pub use nexus_types::deployment::OmicronZoneExternalFloatingIp; + pub use nexus_types::deployment::OmicronZoneExternalSnatIp; use nexus_types::deployment::SledFilter; use nexus_types::external_api::params; use nexus_types::external_api::shared; @@ -539,6 +545,212 @@ mod test { } } + /// ********************************************************************** + /// DEPRECATION WARNING: + /// + /// Remove when `deprecated_omicron_zone_config_to_blueprint_zone_config` + /// is deleted. + /// ********************************************************************** + /// + /// Errors from converting an [`OmicronZoneType`] into a [`BlueprintZoneType`]. + #[derive(Debug, Clone)] + pub enum InvalidOmicronZoneType { + #[allow(unused)] + ExternalIpIdRequired { kind: ZoneKind }, + } + + /// ********************************************************************** + /// DEPRECATION WARNING: Do not call this function in new code !!! + /// ********************************************************************** + /// + /// Convert an [`OmicronZoneConfig`] to a [`BlueprintZoneConfig`]. 
+ /// + /// A `BlueprintZoneConfig` is a superset of `OmicronZoneConfig` and + /// contains auxiliary information not present in an `OmicronZoneConfig`. + /// Therefore, the only valid direction for a real system to take is a + /// lossy conversion from `BlueprintZoneConfig` to `OmicronZoneConfig`. + /// This function, however, does the opposite. We therefore have to inject + /// fake information to fill in the unknown fields in the generated + /// `OmicronZoneConfig`. + /// + /// This is bad, and we should generally feel bad for doing it :). At + /// the time this was done we were backporting the blueprint system into + /// RSS while trying not to change too much code. This was a judicious + /// shortcut used right before a release for stability reasons. As the + /// number of zones managed by the reconfigurator has grown, the use + /// of this function has become more egregious, and so it was removed + /// from the production code path and moved into this test module. This move + /// itself is a judicious shortcut. We have a test in this module, + /// `test_blueprint_internal_dns_basic`, that is the last caller of this + /// function, and so we have moved this function into this module. + /// + /// Ideally, we would get rid of this function altogether and use another + /// method for generating `BlueprintZoneConfig` structures. Unfortunately, + /// there are still a few remaining zones that need to be implemented in the + /// `BlueprintBuilder`, and some of them require custom code. Until that is + /// done, we don't have a good way of generating a test representation of + /// the real system that would properly serve this test. We could generate + /// a `BlueprintZoneConfig` by hand for each zone type in this test, on + /// top of the more modern `SystemDescription` setup, but that isn't much + /// different than what we do in this test. We'd also eventually remove it + /// for better test setup when our `BlueprintBuilder` is capable of properly + /// constructing all zone types. Instead, we do the simple thing, and reuse + /// what we already have. + /// + /// # Errors + /// + /// If `config.zone_type` is a zone that has an external IP address (Nexus, + /// boundary NTP, external DNS), `external_ip_id` must be `Some(_)` or this + /// method will return an error.
+ pub fn deprecated_omicron_zone_config_to_blueprint_zone_config( + config: OmicronZoneConfig, + disposition: BlueprintZoneDisposition, + external_ip_id: Option, + ) -> Result { + let kind = config.zone_type.kind(); + let zone_type = match config.zone_type { + OmicronZoneType::BoundaryNtp { + address, + dns_servers, + domain, + nic, + ntp_servers, + snat_cfg, + } => { + let external_ip_id = external_ip_id.ok_or( + InvalidOmicronZoneType::ExternalIpIdRequired { kind }, + )?; + BlueprintZoneType::BoundaryNtp( + blueprint_zone_type::BoundaryNtp { + address, + ntp_servers, + dns_servers, + domain, + nic, + external_ip: OmicronZoneExternalSnatIp { + id: external_ip_id, + snat_cfg, + }, + }, + ) + } + OmicronZoneType::Clickhouse { address, dataset } => { + BlueprintZoneType::Clickhouse(blueprint_zone_type::Clickhouse { + address, + dataset, + }) + } + OmicronZoneType::ClickhouseKeeper { address, dataset } => { + BlueprintZoneType::ClickhouseKeeper( + blueprint_zone_type::ClickhouseKeeper { address, dataset }, + ) + } + OmicronZoneType::ClickhouseServer { address, dataset } => { + BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { address, dataset }, + ) + } + OmicronZoneType::CockroachDb { address, dataset } => { + BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { address, dataset }, + ) + } + OmicronZoneType::Crucible { address, dataset } => { + BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { + address, + dataset, + }) + } + OmicronZoneType::CruciblePantry { address } => { + BlueprintZoneType::CruciblePantry( + blueprint_zone_type::CruciblePantry { address }, + ) + } + OmicronZoneType::ExternalDns { + dataset, + dns_address, + http_address, + nic, + } => { + let external_ip_id = external_ip_id.ok_or( + InvalidOmicronZoneType::ExternalIpIdRequired { kind }, + )?; + BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { + dataset, + http_address, + dns_address: OmicronZoneExternalFloatingAddr { + id: external_ip_id, + addr: dns_address, + }, + nic, + }, + ) + } + OmicronZoneType::InternalDns { + dataset, + dns_address, + gz_address, + gz_address_index, + http_address, + } => BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + dataset, + http_address, + dns_address, + gz_address, + gz_address_index, + }, + ), + OmicronZoneType::InternalNtp { + address, + dns_servers, + domain, + ntp_servers, + } => BlueprintZoneType::InternalNtp( + blueprint_zone_type::InternalNtp { + address, + ntp_servers, + dns_servers, + domain, + }, + ), + OmicronZoneType::Nexus { + external_dns_servers, + external_ip, + external_tls, + internal_address, + nic, + } => { + let external_ip_id = external_ip_id.ok_or( + InvalidOmicronZoneType::ExternalIpIdRequired { kind }, + )?; + BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + internal_address, + external_ip: OmicronZoneExternalFloatingIp { + id: external_ip_id, + ip: external_ip, + }, + nic, + external_tls, + external_dns_servers, + }) + } + OmicronZoneType::Oximeter { address } => { + BlueprintZoneType::Oximeter(blueprint_zone_type::Oximeter { + address, + }) + } + }; + Ok(BlueprintZoneConfig { + disposition, + id: OmicronZoneUuid::from_untyped_uuid(config.id), + underlay_address: config.underlay_address, + filesystem_pool: config.filesystem_pool, + zone_type, + }) + } + /// test blueprint_internal_dns_config(): trivial case of an empty blueprint #[test] fn test_blueprint_internal_dns_empty() { @@ -589,7 +801,7 @@ mod test { .zones .into_iter() .map(|config| -> 
BlueprintZoneConfig { - BlueprintZoneConfig::from_omicron_zone_config( + deprecated_omicron_zone_config_to_blueprint_zone_config( config, BlueprintZoneDisposition::InService, // We don't get external IP IDs in inventory diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 37c276fa07..ae4309d8f9 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -110,6 +110,7 @@ use super::tasks::region_replacement; use super::tasks::region_replacement_driver; use super::tasks::region_snapshot_replacement_garbage_collect::*; use super::tasks::region_snapshot_replacement_start::*; +use super::tasks::region_snapshot_replacement_step::*; use super::tasks::saga_recovery; use super::tasks::service_firewall_rules; use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; @@ -165,6 +166,7 @@ pub struct BackgroundTasks { pub task_lookup_region_port: Activator, pub task_region_snapshot_replacement_start: Activator, pub task_region_snapshot_replacement_garbage_collection: Activator, + pub task_region_snapshot_replacement_step: Activator, // Handles to activate background tasks that do not get used by Nexus // at-large. These background tasks are implementation details as far as @@ -249,6 +251,7 @@ impl BackgroundTasksInitializer { task_region_snapshot_replacement_start: Activator::new(), task_region_snapshot_replacement_garbage_collection: Activator::new( ), + task_region_snapshot_replacement_step: Activator::new(), task_internal_dns_propagation: Activator::new(), task_external_dns_propagation: Activator::new(), @@ -312,6 +315,7 @@ impl BackgroundTasksInitializer { task_lookup_region_port, task_region_snapshot_replacement_start, task_region_snapshot_replacement_garbage_collection, + task_region_snapshot_replacement_step, // Add new background tasks here. Be sure to use this binding in a // call to `Driver::register()` below. That's what actually wires // up the Activator to the corresponding background task. @@ -761,7 +765,7 @@ impl BackgroundTasksInitializer { .region_snapshot_replacement_garbage_collection .period_secs, task_impl: Box::new(RegionSnapshotReplacementGarbageCollect::new( - datastore, + datastore.clone(), sagas.clone(), )), opctx: opctx.child(BTreeMap::new()), @@ -769,6 +773,21 @@ impl BackgroundTasksInitializer { activator: task_region_snapshot_replacement_garbage_collection, }); + driver.register(TaskDefinition { + name: "region_snapshot_replacement_step", + description: + "detect what volumes were affected by a region snapshot \ + replacement, and run the step saga for them", + period: config.region_snapshot_replacement_step.period_secs, + task_impl: Box::new(RegionSnapshotReplacementFindAffected::new( + datastore, + sagas.clone(), + )), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_region_snapshot_replacement_step, + }); + driver } } diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index a81080ec75..ca6e7e4271 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -28,8 +28,8 @@ //! remains alive and continues to own its virtual provisioning resources. //! //! Cleanup of instance resources when an instance's *active* VMM is destroyed -//! is handled elsewhere, by `notify_instance_updated` and (eventually) the -//! `instance-update` saga. +//! is handled elsewhere, by `process_vmm_update` and the `instance-update` +//! saga. 
use crate::app::background::BackgroundTask; use anyhow::Context; diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index dbbfcb3b14..2b1e3eedca 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -83,7 +83,7 @@ impl BlueprintExecutor { "target_id" => %blueprint.id); return json!({ "target_id": blueprint.id.to_string(), - "error": "blueprint disabled" + "enabled": false, }); } @@ -111,6 +111,7 @@ impl BlueprintExecutor { json!({ "target_id": blueprint.id.to_string(), + "enabled": true, "needs_saga_recovery": needs_saga_recovery, }) } @@ -119,6 +120,7 @@ impl BlueprintExecutor { errors.into_iter().map(|e| format!("{:#}", e)).collect(); json!({ "target_id": blueprint.id.to_string(), + "enabled": true, "errors": errors }) } @@ -316,6 +318,7 @@ mod test { value, json!({ "target_id": blueprint_id, + "enabled": true, "needs_saga_recovery": false, }) ); @@ -410,6 +413,7 @@ mod test { value, json!({ "target_id": blueprint.1.id.to_string(), + "enabled": true, "needs_saga_recovery": false, }) ); @@ -427,7 +431,7 @@ mod test { assert_eq!( value, json!({ - "error": "blueprint disabled", + "enabled": false, "target_id": blueprint.1.id.to_string() }) ); diff --git a/nexus/src/app/background/tasks/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs index 31bc00441d..70fcf713bc 100644 --- a/nexus/src/app/background/tasks/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -78,6 +78,7 @@ impl BackgroundTask for TargetBlueprintLoader { }; // Decide what to do with the new blueprint + let enabled = new_bp_target.enabled; let Some((old_bp_target, old_blueprint)) = self.last.as_deref() else { // We've found a target blueprint for the first time. 
@@ -97,6 +98,7 @@ impl BackgroundTask for TargetBlueprintLoader { "time_created": time_created, "time_found": chrono::Utc::now(), "status": "first target blueprint", + "enabled": enabled, }); }; @@ -116,7 +118,8 @@ impl BackgroundTask for TargetBlueprintLoader { "target_id": target_id, "time_created": time_created, "time_found": chrono::Utc::now(), - "status": "target blueprint updated" + "status": "target blueprint updated", + "enabled": enabled, }) } else { // The new target id matches the old target id @@ -159,6 +162,7 @@ impl BackgroundTask for TargetBlueprintLoader { "time_created": time_created, "time_found": chrono::Utc::now(), "status": format!("target blueprint {status}"), + "enabled": enabled, }) } else { // We found a new target blueprint that exactly @@ -173,7 +177,8 @@ impl BackgroundTask for TargetBlueprintLoader { json!({ "target_id": target_id, "time_created": time_created, - "status": "target blueprint unchanged" + "status": "target blueprint unchanged", + "enabled": enabled, }) } } diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index f63c21105e..ae78392ea3 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -19,9 +19,9 @@ use nexus_types::identity::Asset; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_uuid_kinds::GenericUuid; -use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; use oximeter::types::ProducerRegistry; use sled_agent_client::Client as SledAgentClient; use std::borrow::Cow; @@ -81,12 +81,12 @@ impl InstanceWatcher { let client = client.clone(); async move { - slog::trace!(opctx.log, "checking on instance..."); - let rsp = client - .instance_get_state(&InstanceUuid::from_untyped_uuid( - target.instance_id, - )) - .await; + let vmm_id = PropolisUuid::from_untyped_uuid(target.vmm_id); + slog::trace!( + opctx.log, "checking on VMM"; "propolis_id" => %vmm_id + ); + + let rsp = client.vmm_get_state(&vmm_id).await; let mut check = Check { target, outcome: Default::default(), @@ -151,7 +151,7 @@ impl InstanceWatcher { } }; - let new_runtime_state: SledInstanceState = state.into(); + let new_runtime_state: SledVmmState = state.into(); check.outcome = CheckOutcome::Success(new_runtime_state.vmm_state.state.into()); debug!( @@ -159,10 +159,10 @@ impl InstanceWatcher { "updating instance state"; "state" => ?new_runtime_state.vmm_state.state, ); - match crate::app::instance::notify_instance_updated( + match crate::app::instance::process_vmm_update( &datastore, &opctx, - InstanceUuid::from_untyped_uuid(target.instance_id), + PropolisUuid::from_untyped_uuid(target.vmm_id), &new_runtime_state, ) .await @@ -176,7 +176,7 @@ impl InstanceWatcher { _ => Err(Incomplete::UpdateFailed), }; } - Ok(Some(saga)) => { + Ok(Some((_, saga))) => { check.update_saga_queued = true; if let Err(e) = sagas.saga_start(saga).await { warn!(opctx.log, "update saga failed"; "error" => ?e); diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 7ba68d0b80..6089ba8d65 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -27,6 +27,7 @@ pub mod region_replacement; pub mod region_replacement_driver; pub mod 
region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; +pub mod region_snapshot_replacement_step; pub mod saga_recovery; pub mod service_firewall_rules; pub mod sync_service_zone_nat; diff --git a/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs new file mode 100644 index 0000000000..d78e304b75 --- /dev/null +++ b/nexus/src/app/background/tasks/region_snapshot_replacement_step.rs @@ -0,0 +1,775 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting volumes affected by a region snapshot +//! replacement, creating records for those, and triggering the "step" saga for +//! them. +//! +//! After the region snapshot replacement start saga finishes, the snapshot's +//! volume is no longer in a degraded state: the requested read-only region was +//! cloned to a new region, and the reference was replaced in the construction +//! request. Any disk that is now created using the snapshot as a source will +//! work without issues. +//! +//! The problem now is volumes that still reference the replaced read-only +//! region, and any Upstairs constructed from a VCR that references that region. +//! This task's responsibility is to find all volumes that reference the +//! replaced read-only region, create a record for them, and trigger the region +//! snapshot replacement step saga. This is a much less involved process than +//! region replacement: no continuous monitoring and driving is required. See +//! the "region snapshot replacement step" saga's docstring for more +//! information. + +use crate::app::authn; +use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; +use crate::app::sagas; +use crate::app::sagas::region_snapshot_replacement_step::*; +use crate::app::sagas::region_snapshot_replacement_step_garbage_collect::*; +use crate::app::sagas::NexusSaga; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_model::RegionSnapshotReplacementStep; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::identity::Asset; +use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; +use serde_json::json; +use std::sync::Arc; + +pub struct RegionSnapshotReplacementFindAffected { + datastore: Arc, + sagas: Arc, +} + +impl RegionSnapshotReplacementFindAffected { + pub fn new(datastore: Arc, sagas: Arc) -> Self { + RegionSnapshotReplacementFindAffected { datastore, sagas } + } + + async fn send_start_request( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacementStep, + ) -> Result<(), omicron_common::api::external::Error> { + let params = sagas::region_snapshot_replacement_step::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + request, + }; + + let saga_dag = SagaRegionSnapshotReplacementStep::prepare(¶ms)?; + self.sagas.saga_start(saga_dag).await + } + + async fn send_garbage_collect_request( + &self, + opctx: &OpContext, + request: RegionSnapshotReplacementStep, + ) -> Result<(), omicron_common::api::external::Error> { + let Some(old_snapshot_volume_id) = request.old_snapshot_volume_id + else { + // This state is illegal! 
+ let s = format!( + "request {} old snapshot volume id is None!", + request.id, + ); + + return Err(omicron_common::api::external::Error::internal_error( + &s, + )); + }; + + let params = + sagas::region_snapshot_replacement_step_garbage_collect::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + old_snapshot_volume_id, + request, + }; + + let saga_dag = + SagaRegionSnapshotReplacementStepGarbageCollect::prepare(¶ms)?; + self.sagas.saga_start(saga_dag).await + } + + async fn clean_up_region_snapshot_replacement_step_volumes( + &self, + opctx: &OpContext, + status: &mut RegionSnapshotReplacementStepStatus, + ) { + let log = &opctx.log; + + let requests = match self + .datastore + .region_snapshot_replacement_steps_requiring_garbage_collection( + opctx, + ) + .await + { + Ok(requests) => requests, + + Err(e) => { + let s = format!("querying for steps to collect failed! {e}"); + error!(&log, "{s}"); + status.errors.push(s); + return; + } + }; + + for request in requests { + let request_id = request.id; + + let result = + self.send_garbage_collect_request(opctx, request.clone()).await; + + match result { + Ok(()) => { + let s = format!( + "region snapshot replacement step garbage \ + collect request ok for {request_id}" + ); + + info!( + &log, + "{s}"; + "request.volume_id" => %request.volume_id, + "request.old_snapshot_volume_id" => ?request.old_snapshot_volume_id, + ); + status.step_garbage_collect_invoked_ok.push(s); + } + + Err(e) => { + let s = format!( + "sending region snapshot replacement step garbage \ + collect request failed: {e}", + ); + error!( + &log, + "{s}"; + "request.volume_id" => %request.volume_id, + "request.old_snapshot_volume_id" => ?request.old_snapshot_volume_id, + ); + status.errors.push(s); + } + } + } + } + + // Any request in state Running means that the target replacement has + // occurred already, meaning the region snapshot being replaced is not + // present as a target in the snapshot's volume construction request + // anymore. Any future usage of that snapshot (as a source for a disk or + // otherwise) will get a volume construction request that references the + // replacement read-only region. + // + // "step" records are created here for each volume found that still + // references the replaced region snapshot, most likely having been created + // by copying the snapshot's volume construction request before the target + // replacement occurred. These volumes also need to have target replacement + // performed, and this is captured in this "step" record. 
+ async fn create_step_records_for_affected_volumes( + &self, + opctx: &OpContext, + status: &mut RegionSnapshotReplacementStepStatus, + ) { + let log = &opctx.log; + + // Find all region snapshot replacement requests in state "Running" + let requests = match self + .datastore + .get_running_region_snapshot_replacements(opctx) + .await + { + Ok(requests) => requests, + + Err(e) => { + let s = format!( + "get_running_region_snapshot_replacements failed: {e}", + ); + + error!(&log, "{s}"); + status.errors.push(s); + return; + } + }; + + for request in requests { + // Find all volumes that reference the replaced snapshot + let region_snapshot = match self + .datastore + .region_snapshot_get( + request.old_dataset_id, + request.old_region_id, + request.old_snapshot_id, + ) + .await + { + Ok(Some(region_snapshot)) => region_snapshot, + + Ok(None) => { + let s = format!( + "region snapshot {} {} {} not found!", + request.old_dataset_id, + request.old_region_id, + request.old_snapshot_id, + ); + error!(&log, "{s}"); + status.errors.push(s); + + continue; + } + + Err(e) => { + let s = format!( + "error querying for region snapshot {} {} {}: {e}", + request.old_dataset_id, + request.old_region_id, + request.old_snapshot_id, + ); + error!(&log, "{s}"); + status.errors.push(s); + + continue; + } + }; + + let snapshot_addr = match region_snapshot.snapshot_addr.parse() { + Ok(addr) => addr, + + Err(e) => { + let s = format!( + "region snapshot addr {} could not be parsed: {e}", + region_snapshot.snapshot_addr, + ); + error!(&log, "{s}"); + status.errors.push(s); + + continue; + } + }; + + let volumes = match self + .datastore + .find_volumes_referencing_socket_addr(&opctx, snapshot_addr) + .await + { + Ok(volumes) => volumes, + + Err(e) => { + let s = format!("error finding referenced volumes: {e}"); + error!( + log, + "{s}"; + "request id" => ?request.id, + ); + status.errors.push(s); + + continue; + } + }; + + for volume in volumes { + // Any volume referencing the old socket addr needs to be + // replaced. Create a "step" record for this. + // + // Note: this function returns a conflict error if there already + // exists a step record referencing this volume ID because a + // volume repair record is also created using that volume ID, + // and only one of those can exist for a given volume at a time. + // + // Also note: this function returns a conflict error if another + // step record references this volume id in the "old snapshot + // volume id" column - this is ok! Region snapshot replacement + // step records are created for some volume id, and a null old + // snapshot volume id: + // + // volume_id: references snapshot_addr + // old_snapshot_volume_id: null + // + // The region snapshot replacement step saga will create a + // volume to stash the reference to snapshot_addr, and then call + // `volume_replace_snapshot`. This will swap snapshot_addr + // reference into the old snapshot volume for later deletion: + // + // volume_id: does _not_ reference snapshot_addr anymore + // old_snapshot_volume_id: now references snapshot_addr + // + // If `find_volumes_referencing_socket_addr` is executed before + // that volume is deleted, it will return the old snapshot + // volume id above, and then this for loop tries to make a + // region snapshot replacement step record for it! 
+ // + // Allowing a region snapshot replacement step record to be + // created in this case would mean that (depending on when the + // functions execute), an indefinite amount of work would be + // created, continually "moving" the snapshot_addr from + // temporary volume to temporary volume. + + match self + .datastore + .create_region_snapshot_replacement_step( + opctx, + request.id, + volume.id(), + ) + .await + { + Ok(step_request_id) => { + let s = format!("created {step_request_id}"); + info!( + log, + "{s}"; + "request id" => ?request.id, + "volume id" => ?volume.id(), + ); + status.step_records_created_ok.push(s); + } + + Err(e) => { + let s = format!("error creating step request: {e}"); + error!( + log, + "{s}"; + "request id" => ?request.id, + "volume id" => ?volume.id(), + ); + status.errors.push(s); + } + } + } + } + } + + async fn invoke_step_saga_for_affected_volumes( + &self, + opctx: &OpContext, + status: &mut RegionSnapshotReplacementStepStatus, + ) { + let log = &opctx.log; + + // Once all region snapshot replacement step records have been created, + // trigger sagas as appropriate. + + let step_requests = match self + .datastore + .get_requested_region_snapshot_replacement_steps(opctx) + .await + { + Ok(step_requests) => step_requests, + + Err(e) => { + let s = format!( + "query for requested region snapshot replacement step \ + requests failed: {e}" + ); + error!(&log, "{s}"); + status.errors.push(s); + + return; + } + }; + + for request in step_requests { + let request_id = request.id; + + match self.send_start_request(opctx, request.clone()).await { + Ok(()) => { + let s = format!( + "region snapshot replacement step saga invoked ok for \ + {request_id}" + ); + + info!( + &log, + "{s}"; + "request.request_id" => %request.request_id, + "request.volume_id" => %request.volume_id, + ); + status.step_invoked_ok.push(s); + } + + Err(e) => { + let s = format!( + "invoking region snapshot replacement step saga for \ + {request_id} failed: {e}" + ); + + error!( + &log, + "{s}"; + "request.request_id" => %request.request_id, + "request.volume_id" => %request.volume_id, + ); + status.errors.push(s); + } + }; + } + } +} + +impl BackgroundTask for RegionSnapshotReplacementFindAffected { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async move { + let log = &opctx.log; + info!( + &log, + "region snapshot replacement find affected volumes task started" + ); + + let mut status = RegionSnapshotReplacementStepStatus::default(); + + // Importantly, clean old steps up before finding affected volumes! + // Otherwise, will continue to find the snapshot in volumes to + // delete, and will continue to see conflicts in next function. 
+ self.clean_up_region_snapshot_replacement_step_volumes( + opctx, + &mut status, + ) + .await; + + self.create_step_records_for_affected_volumes(opctx, &mut status) + .await; + + self.invoke_step_saga_for_affected_volumes(opctx, &mut status) + .await; + + info!( + &log, + "region snapshot replacement find affected volumes task done" + ); + + json!(status) + } + .boxed() + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::app::background::init::test::NoopStartSaga; + use nexus_db_model::RegionSnapshot; + use nexus_db_model::RegionSnapshotReplacement; + use nexus_db_model::RegionSnapshotReplacementStep; + use nexus_db_model::RegionSnapshotReplacementStepState; + use nexus_db_model::Volume; + use nexus_test_utils_macros::nexus_test; + use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::types::VolumeConstructionRequest; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + async fn add_fake_volume_for_snapshot_addr( + datastore: &DataStore, + snapshot_addr: String, + ) -> Uuid { + let new_volume_id = Uuid::new_v4(); + + let volume_construction_request = VolumeConstructionRequest::Volume { + id: new_volume_id, + block_size: 0, + sub_volumes: vec![], + read_only_parent: Some(Box::new( + VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: Uuid::new_v4(), + target: vec![snapshot_addr], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }, + )), + }; + + let volume_data = + serde_json::to_string(&volume_construction_request).unwrap(); + + let volume = Volume::new(new_volume_id, volume_data); + + datastore.volume_create(volume).await.unwrap(); + + new_volume_id + } + + #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_step_task( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementFindAffected::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStepStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStepStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // Add a region snapshot replacement request for a fake region snapshot. 
+ + let dataset_id = Uuid::new_v4(); + let region_id = Uuid::new_v4(); + let snapshot_id = Uuid::new_v4(); + let snapshot_addr = String::from("[fd00:1122:3344::101]:9876"); + + let fake_region_snapshot = RegionSnapshot::new( + dataset_id, + region_id, + snapshot_id, + snapshot_addr.clone(), + ); + + datastore.region_snapshot_create(fake_region_snapshot).await.unwrap(); + + let request = + RegionSnapshotReplacement::new(dataset_id, region_id, snapshot_id); + + let request_id = request.id; + + datastore + .insert_region_snapshot_replacement_request_with_volume_id( + &opctx, + request, + Uuid::new_v4(), + ) + .await + .unwrap(); + + // Transition that to Allocating -> ReplacementDone -> DeletingOldVolume + // -> Running + + let operating_saga_id = Uuid::new_v4(); + + datastore + .set_region_snapshot_replacement_allocating( + &opctx, + request_id, + operating_saga_id, + ) + .await + .unwrap(); + + let new_region_id = Uuid::new_v4(); + let old_snapshot_volume_id = Uuid::new_v4(); + + datastore + .set_region_snapshot_replacement_replacement_done( + &opctx, + request_id, + operating_saga_id, + new_region_id, + old_snapshot_volume_id, + ) + .await + .unwrap(); + + datastore + .set_region_snapshot_replacement_deleting_old_volume( + &opctx, + request_id, + operating_saga_id, + ) + .await + .unwrap(); + + datastore + .set_region_snapshot_replacement_running( + &opctx, + request_id, + operating_saga_id, + ) + .await + .unwrap(); + + // Add some fake volumes that reference the region snapshot being + // replaced + + let new_volume_1_id = add_fake_volume_for_snapshot_addr( + &datastore, + snapshot_addr.clone(), + ) + .await; + let new_volume_2_id = add_fake_volume_for_snapshot_addr( + &datastore, + snapshot_addr.clone(), + ) + .await; + + // Add some fake volumes that do not + + let other_volume_1_id = add_fake_volume_for_snapshot_addr( + &datastore, + String::from("[fd00:1122:3344::101]:1000"), + ) + .await; + + let other_volume_2_id = add_fake_volume_for_snapshot_addr( + &datastore, + String::from("[fd12:5544:3344::912]:3901"), + ) + .await; + + // Activate the task - it should pick the running request up and try to + // run the region snapshot replacement step saga for the volumes + + let result: RegionSnapshotReplacementStepStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + let requested_region_snapshot_replacement_steps = datastore + .get_requested_region_snapshot_replacement_steps(&opctx) + .await + .unwrap(); + + assert_eq!(requested_region_snapshot_replacement_steps.len(), 2); + + for step in &requested_region_snapshot_replacement_steps { + let s: String = format!("created {}", step.id); + assert!(result.step_records_created_ok.contains(&s)); + + let s: String = format!( + "region snapshot replacement step saga invoked ok for {}", + step.id + ); + assert!(result.step_invoked_ok.contains(&s)); + + if step.volume_id == new_volume_1_id + || step.volume_id == new_volume_2_id + { + // ok! + } else if step.volume_id == other_volume_1_id + || step.volume_id == other_volume_2_id + { + // error! + assert!(false); + } else { + // error! 
+ assert!(false); + } + } + + // No garbage collection would be invoked yet, as the step records are + // not in state Complete + assert!(result.step_garbage_collect_invoked_ok.is_empty()); + + assert_eq!(result.errors.len(), 0); + + assert_eq!(starter.count_reset(), 2); + } + + #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_step_task_gc( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let starter = Arc::new(NoopStartSaga::new()); + let mut task = RegionSnapshotReplacementFindAffected::new( + datastore.clone(), + starter.clone(), + ); + + // Noop test + let result: RegionSnapshotReplacementStepStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + assert_eq!(result, RegionSnapshotReplacementStepStatus::default()); + assert_eq!(starter.count_reset(), 0); + + // Now, add some Complete records and make sure the garbage collection + // saga is invoked. + + datastore + .insert_region_snapshot_replacement_step(&opctx, { + let mut record = RegionSnapshotReplacementStep::new( + Uuid::new_v4(), + Uuid::new_v4(), + ); + + record.replacement_state = + RegionSnapshotReplacementStepState::Complete; + record.old_snapshot_volume_id = Some(Uuid::new_v4()); + + record + }) + .await + .unwrap(); + + datastore + .insert_region_snapshot_replacement_step(&opctx, { + let mut record = RegionSnapshotReplacementStep::new( + Uuid::new_v4(), + Uuid::new_v4(), + ); + + record.replacement_state = + RegionSnapshotReplacementStepState::Complete; + record.old_snapshot_volume_id = Some(Uuid::new_v4()); + + record + }) + .await + .unwrap(); + + // Activate the task - it should pick the complete steps up and try to + // run the region snapshot replacement step garbage collect saga + + let result: RegionSnapshotReplacementStepStatus = + serde_json::from_value(task.activate(&opctx).await).unwrap(); + + let region_snapshot_replacement_steps_requiring_gc = datastore + .region_snapshot_replacement_steps_requiring_garbage_collection( + &opctx, + ) + .await + .unwrap(); + + assert_eq!(region_snapshot_replacement_steps_requiring_gc.len(), 2); + + eprintln!("{:?}", result); + + for step in ®ion_snapshot_replacement_steps_requiring_gc { + let s: String = format!( + "region snapshot replacement step garbage collect request ok \ + for {}", + step.id + ); + assert!(result.step_garbage_collect_invoked_ok.contains(&s)); + } + + assert!(result.step_records_created_ok.is_empty()); + + assert!(result.step_invoked_ok.is_empty()); + + assert_eq!(result.errors.len(), 0); + + assert_eq!(starter.count_reset(), 2); + } +} diff --git a/nexus/src/app/background/tasks/saga_recovery.rs b/nexus/src/app/background/tasks/saga_recovery.rs index 7b0fe1b331..42069ac4ed 100644 --- a/nexus/src/app/background/tasks/saga_recovery.rs +++ b/nexus/src/app/background/tasks/saga_recovery.rs @@ -517,7 +517,7 @@ mod test { ) -> (dev::db::CockroachInstance, Arc) { let db = test_setup_database(&log).await; let cfg = nexus_db_queries::db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(log, &cfg)); + let pool = Arc::new(db::Pool::new_single_host(log, &cfg)); let db_datastore = Arc::new( db::DataStore::new(&log, Arc::clone(&pool), None).await.unwrap(), ); diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 3106ab9f2a..b715b6bbd3 100644 --- a/nexus/src/app/instance.rs +++ 
b/nexus/src/app/instance.rs @@ -60,7 +60,7 @@ use propolis_client::support::WebSocketStream; use sagas::instance_common::ExternalIpAttach; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::InstanceProperties; -use sled_agent_client::types::InstancePutStateBody; +use sled_agent_client::types::VmmPutStateBody; use std::matches; use std::net::SocketAddr; use std::sync::Arc; @@ -154,7 +154,7 @@ pub(crate) enum InstanceStateChangeRequest { } impl From - for sled_agent_client::types::InstanceStateRequested + for sled_agent_client::types::VmmStateRequested { fn from(value: InstanceStateChangeRequest) -> Self { match value { @@ -176,7 +176,7 @@ enum InstanceStateChangeRequestAction { /// Request the appropriate state change from the sled with the specified /// UUID. - SendToSled(SledUuid), + SendToSled { sled_id: SledUuid, propolis_id: PropolisUuid }, } /// What is the higher level operation that is calling @@ -553,7 +553,6 @@ impl super::Nexus { if let Err(e) = self .instance_request_state( opctx, - &authz_instance, state.instance(), state.vmm(), InstanceStateChangeRequest::Reboot, @@ -632,7 +631,6 @@ impl super::Nexus { if let Err(e) = self .instance_request_state( opctx, - &authz_instance, state.instance(), state.vmm(), InstanceStateChangeRequest::Stop, @@ -664,21 +662,18 @@ impl super::Nexus { /// this sled, this operation rudely terminates it. pub(crate) async fn instance_ensure_unregistered( &self, - opctx: &OpContext, - authz_instance: &authz::Instance, + propolis_id: &PropolisUuid, sled_id: &SledUuid, - ) -> Result, InstanceStateChangeError> - { - opctx.authorize(authz::Action::Modify, authz_instance).await?; + ) -> Result, InstanceStateChangeError> { let sa = self.sled_client(&sled_id).await?; - sa.instance_unregister(&InstanceUuid::from_untyped_uuid( - authz_instance.id(), - )) - .await - .map(|res| res.into_inner().updated_runtime.map(Into::into)) - .map_err(|e| { - InstanceStateChangeError::SledAgent(SledAgentInstancePutError(e)) - }) + sa.vmm_unregister(propolis_id) + .await + .map(|res| res.into_inner().updated_runtime.map(Into::into)) + .map_err(|e| { + InstanceStateChangeError::SledAgent(SledAgentInstancePutError( + e, + )) + }) } /// Determines the action to take on an instance's active VMM given a @@ -712,8 +707,11 @@ impl super::Nexus { // Requests that operate on active instances have to be directed to the // instance's current sled agent. If there is none, the request needs to // be handled specially based on its type. 
- let sled_id = if let Some(vmm) = vmm_state { - SledUuid::from_untyped_uuid(vmm.sled_id) + let (sled_id, propolis_id) = if let Some(vmm) = vmm_state { + ( + SledUuid::from_untyped_uuid(vmm.sled_id), + PropolisUuid::from_untyped_uuid(vmm.id), + ) } else { match effective_state { // If there's no active sled because the instance is stopped, @@ -814,7 +812,10 @@ impl super::Nexus { }; if allowed { - Ok(InstanceStateChangeRequestAction::SendToSled(sled_id)) + Ok(InstanceStateChangeRequestAction::SendToSled { + sled_id, + propolis_id, + }) } else { Err(Error::invalid_request(format!( "instance state cannot be changed from state \"{}\"", @@ -826,26 +827,25 @@ impl super::Nexus { pub(crate) async fn instance_request_state( &self, opctx: &OpContext, - authz_instance: &authz::Instance, prev_instance_state: &db::model::Instance, prev_vmm_state: &Option, requested: InstanceStateChangeRequest, ) -> Result<(), InstanceStateChangeError> { - opctx.authorize(authz::Action::Modify, authz_instance).await?; - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - match self.select_runtime_change_action( prev_instance_state, prev_vmm_state, &requested, )? { InstanceStateChangeRequestAction::AlreadyDone => Ok(()), - InstanceStateChangeRequestAction::SendToSled(sled_id) => { + InstanceStateChangeRequestAction::SendToSled { + sled_id, + propolis_id, + } => { let sa = self.sled_client(&sled_id).await?; let instance_put_result = sa - .instance_put_state( - &instance_id, - &InstancePutStateBody { state: requested.into() }, + .vmm_put_state( + &propolis_id, + &VmmPutStateBody { state: requested.into() }, ) .await .map(|res| res.into_inner().updated_runtime.map(Into::into)) @@ -862,7 +862,7 @@ impl super::Nexus { // Ok(None) here, in which case, there's nothing to write back. match instance_put_result { Ok(Some(ref state)) => self - .notify_instance_updated(opctx, instance_id, state) + .notify_vmm_updated(opctx, propolis_id, state) .await .map_err(Into::into), Ok(None) => Ok(()), @@ -1120,13 +1120,13 @@ impl super::Nexus { .sled_client(&SledUuid::from_untyped_uuid(initial_vmm.sled_id)) .await?; let instance_register_result = sa - .instance_register( - &instance_id, + .vmm_register( + propolis_id, &sled_agent_client::types::InstanceEnsureBody { hardware: instance_hardware, instance_runtime: db_instance.runtime().clone().into(), vmm_runtime: initial_vmm.clone().into(), - propolis_id: *propolis_id, + instance_id, propolis_addr: SocketAddr::new( initial_vmm.propolis_ip.ip(), initial_vmm.propolis_port.into(), @@ -1141,8 +1141,7 @@ impl super::Nexus { match instance_register_result { Ok(state) => { - self.notify_instance_updated(opctx, instance_id, &state) - .await?; + self.notify_vmm_updated(opctx, *propolis_id, &state).await?; } Err(e) => { if e.instance_unhealthy() { @@ -1321,19 +1320,22 @@ impl super::Nexus { /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. - pub(crate) async fn notify_instance_updated( + pub(crate) async fn notify_vmm_updated( &self, opctx: &OpContext, - instance_id: InstanceUuid, - new_runtime_state: &nexus::SledInstanceState, + propolis_id: PropolisUuid, + new_runtime_state: &nexus::SledVmmState, ) -> Result<(), Error> { - let saga = notify_instance_updated( + let Some((instance_id, saga)) = process_vmm_update( &self.db_datastore, opctx, - instance_id, + propolis_id, new_runtime_state, ) - .await?; + .await? 
+ else { + return Ok(()); + }; // We don't need to wait for the instance update saga to run to // completion to return OK to the sled-agent --- all it needs to care @@ -1344,53 +1346,51 @@ impl super::Nexus { // one is eventually executed. // // Therefore, just spawn the update saga in a new task, and return. - if let Some(saga) = saga { - info!(opctx.log, "starting update saga for {instance_id}"; - "instance_id" => %instance_id, - "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?new_runtime_state.migrations(), - ); - let sagas = self.sagas.clone(); - let task_instance_updater = - self.background_tasks.task_instance_updater.clone(); - let log = opctx.log.clone(); - tokio::spawn(async move { - // TODO(eliza): maybe we should use the lower level saga API so - // we can see if the saga failed due to the lock being held and - // retry it immediately? - let running_saga = async move { - let runnable_saga = sagas.saga_prepare(saga).await?; - runnable_saga.start().await - } - .await; - let result = match running_saga { - Err(error) => { - error!(&log, "failed to start update saga for {instance_id}"; - "instance_id" => %instance_id, - "error" => %error, - ); - // If we couldn't start the update saga for this - // instance, kick the instance-updater background task - // to try and start it again in a timely manner. - task_instance_updater.activate(); - return; - } - Ok(saga) => { - saga.wait_until_stopped().await.into_omicron_result() - } - }; - if let Err(error) = result { - error!(&log, "update saga for {instance_id} failed"; + info!(opctx.log, "starting update saga for {instance_id}"; + "instance_id" => %instance_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?new_runtime_state.migrations(), + ); + let sagas = self.sagas.clone(); + let task_instance_updater = + self.background_tasks.task_instance_updater.clone(); + let log = opctx.log.clone(); + tokio::spawn(async move { + // TODO(eliza): maybe we should use the lower level saga API so + // we can see if the saga failed due to the lock being held and + // retry it immediately? + let running_saga = async move { + let runnable_saga = sagas.saga_prepare(saga).await?; + runnable_saga.start().await + } + .await; + let result = match running_saga { + Err(error) => { + error!(&log, "failed to start update saga for {instance_id}"; "instance_id" => %instance_id, "error" => %error, ); - // If we couldn't complete the update saga for this + // If we couldn't start the update saga for this // instance, kick the instance-updater background task // to try and start it again in a timely manner. task_instance_updater.activate(); + return; } - }); - } + Ok(saga) => { + saga.wait_until_stopped().await.into_omicron_result() + } + }; + if let Err(error) = result { + error!(&log, "update saga for {instance_id} failed"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If we couldn't complete the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. + task_instance_updater.activate(); + } + }); Ok(()) } @@ -1830,21 +1830,27 @@ impl super::Nexus { } } -/// Invoked by a sled agent to publish an updated runtime state for an -/// Instance, returning an update saga for that instance (if one must be -/// executed). -pub(crate) async fn notify_instance_updated( +/// Writes the VMM and migration state supplied in `new_runtime_state` to the +/// database (provided that it's newer than what's already there). 
+/// +/// # Return value +/// +/// - `Ok(Some(instance_id, saga))` if the new VMM state obsoletes the current +/// instance state. The caller should execute the returned instance update +/// saga to reconcile the instance to the new VMM state. +/// - `Ok(None)` if the new state was successfully published but does not +/// require an instance update. +/// - `Err` if an error occurred. +pub(crate) async fn process_vmm_update( datastore: &DataStore, opctx: &OpContext, - instance_id: InstanceUuid, - new_runtime_state: &nexus::SledInstanceState, -) -> Result, Error> { + propolis_id: PropolisUuid, + new_runtime_state: &nexus::SledVmmState, +) -> Result, Error> { use sagas::instance_update; let migrations = new_runtime_state.migrations(); - let propolis_id = new_runtime_state.propolis_id; info!(opctx.log, "received new VMM runtime state from sled agent"; - "instance_id" => %instance_id, "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state, "migration_state" => ?migrations, @@ -1864,21 +1870,34 @@ pub(crate) async fn notify_instance_updated( // prepare and return it. if instance_update::update_saga_needed( &opctx.log, - instance_id, + propolis_id, new_runtime_state, &result, ) { + let instance_id = + InstanceUuid::from_untyped_uuid(result.found_vmm.instance_id); + let (.., authz_instance) = LookupPath::new(&opctx, datastore) .instance_id(instance_id.into_untyped_uuid()) .lookup_for(authz::Action::Modify) .await?; - let saga = instance_update::SagaInstanceUpdate::prepare( + + match instance_update::SagaInstanceUpdate::prepare( &instance_update::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), authz_instance, }, - )?; - Ok(Some(saga)) + ) { + Ok(saga) => Ok(Some((instance_id, saga))), + Err(e) => { + error!(opctx.log, "failed to prepare instance update saga"; + "error" => ?e, + "instance_id" => %instance_id, + "propolis_id" => %propolis_id); + + Err(e) + } + } } else { Ok(None) } diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 6e431aaca7..049673d2ee 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -25,6 +25,12 @@ use super::NexusActionContext; /// The port propolis-server listens on inside the propolis zone. const DEFAULT_PROPOLIS_PORT: u16 = 12400; +#[derive(Clone, Debug, Serialize, Deserialize)] +pub(super) struct VmmAndSledIds { + pub(super) vmm_id: PropolisUuid, + pub(super) sled_id: SledUuid, +} + /// Reserves resources for a new VMM whose instance has `ncpus` guest logical /// processors and `guest_memory` bytes of guest RAM. The selected sled is /// random within the set of sleds allowed by the supplied `constraints`. @@ -213,12 +219,12 @@ pub async fn instance_ip_move_state( /// the Attaching or Detaching state so that concurrent attempts to start the /// instance will notice that the IP state is in flux and ask the caller to /// retry. -pub async fn instance_ip_get_instance_state( +pub(super) async fn instance_ip_get_instance_state( sagactx: &NexusActionContext, serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, verb: &str, -) -> Result, ActionError> { +) -> Result, ActionError> { // XXX: we can get instance state (but not sled ID) in same transaction // as attach (but not detach) wth current design. We need to re-query // for sled ID anyhow, so keep consistent between attach/detach. 
@@ -236,7 +242,11 @@ pub async fn instance_ip_get_instance_state( inst_and_vmm.vmm().as_ref().map(|vmm| vmm.runtime.state); let found_instance_state = inst_and_vmm.instance().runtime_state.nexus_state; - let mut sled_id = inst_and_vmm.sled_id(); + let mut propolis_and_sled_id = + inst_and_vmm.vmm().as_ref().map(|vmm| VmmAndSledIds { + vmm_id: PropolisUuid::from_untyped_uuid(vmm.id), + sled_id: SledUuid::from_untyped_uuid(vmm.sled_id), + }); slog::debug!( osagactx.log(), "evaluating instance state for IP attach/detach"; @@ -257,7 +267,7 @@ pub async fn instance_ip_get_instance_state( match (found_instance_state, found_vmm_state) { // If there's no VMM, the instance is definitely not on any sled. (InstanceState::NoVmm, _) | (_, Some(VmmState::SagaUnwound)) => { - sled_id = None; + propolis_and_sled_id = None; } // If the instance is running normally or rebooting, it's resident on @@ -340,7 +350,7 @@ pub async fn instance_ip_get_instance_state( } } - Ok(sled_id) + Ok(propolis_and_sled_id) } /// Adds a NAT entry to DPD, routing packets bound for `target_ip` to a @@ -441,18 +451,19 @@ pub async fn instance_ip_remove_nat( /// Inform the OPTE port for a running instance that it should start /// sending/receiving traffic on a given IP address. /// -/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly -/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). -pub async fn instance_ip_add_opte( +/// This call is a no-op if the instance is not active (`propolis_and_sled` is +/// `None`) or the calling saga is explicitly set to be inactive in the event of +/// a double attach/detach (`!target_ip.do_saga`). +pub(super) async fn instance_ip_add_opte( sagactx: &NexusActionContext, - authz_instance: &authz::Instance, - sled_uuid: Option, + vmm_and_sled: Option, target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. - let Some(sled_uuid) = sled_uuid else { + let Some(VmmAndSledIds { vmm_id: propolis_id, sled_id }) = vmm_and_sled + else { return Ok(()); }; @@ -470,17 +481,14 @@ pub async fn instance_ip_add_opte( osagactx .nexus() - .sled_client(&sled_uuid) + .sled_client(&sled_id) .await .map_err(|_| { ActionError::action_failed(Error::unavail( "sled agent client went away mid-attach/detach", )) })? - .instance_put_external_ip( - &InstanceUuid::from_untyped_uuid(authz_instance.id()), - &sled_agent_body, - ) + .vmm_put_external_ip(&propolis_id, &sled_agent_body) .await .map_err(|e| { ActionError::action_failed(match e { @@ -499,18 +507,20 @@ pub async fn instance_ip_add_opte( /// Inform the OPTE port for a running instance that it should cease /// sending/receiving traffic on a given IP address. /// -/// This call is a no-op if `sled_uuid` is `None` or the saga is explicitly -/// set to be inactive in event of double attach/detach (`!target_ip.do_saga`). -pub async fn instance_ip_remove_opte( +/// This call is a no-op if the instance is not active (`propolis_and_sled` is +/// `None`) or the calling saga is explicitly set to be inactive in the event of +/// a double attach/detach (`!target_ip.do_saga`). +pub(super) async fn instance_ip_remove_opte( sagactx: &NexusActionContext, - authz_instance: &authz::Instance, - sled_uuid: Option, + propolis_and_sled: Option, target_ip: ModifyStateForExternalIp, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); // No physical sled? Don't inform OPTE. 
- let Some(sled_uuid) = sled_uuid else { + let Some(VmmAndSledIds { vmm_id: propolis_id, sled_id }) = + propolis_and_sled + else { return Ok(()); }; @@ -528,17 +538,14 @@ pub async fn instance_ip_remove_opte( osagactx .nexus() - .sled_client(&sled_uuid) + .sled_client(&sled_id) .await .map_err(|_| { ActionError::action_failed(Error::unavail( "sled agent client went away mid-attach/detach", )) })? - .instance_delete_external_ip( - &InstanceUuid::from_untyped_uuid(authz_instance.id()), - &sled_agent_body, - ) + .vmm_delete_external_ip(&propolis_id, &sled_agent_body) .await .map_err(|e| { ActionError::action_failed(match e { diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index d19230892f..0b6d8cc0f8 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -1220,8 +1220,7 @@ pub mod test { } async fn no_instances_or_disks_on_sled(sled_agent: &SledAgent) -> bool { - sled_agent.instance_count().await == 0 - && sled_agent.disk_count().await == 0 + sled_agent.vmm_count().await == 0 && sled_agent.disk_count().await == 0 } pub(crate) async fn verify_clean_slate( diff --git a/nexus/src/app/sagas/instance_ip_attach.rs b/nexus/src/app/sagas/instance_ip_attach.rs index a14054cf66..e6fb8654ea 100644 --- a/nexus/src/app/sagas/instance_ip_attach.rs +++ b/nexus/src/app/sagas/instance_ip_attach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_opte, ExternalIpAttach, - ModifyStateForExternalIp, + ModifyStateForExternalIp, VmmAndSledIds, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -13,7 +13,7 @@ use crate::app::{authn, authz}; use nexus_db_model::{IpAttachState, Ipv4NatEntry}; use nexus_types::external_api::views; use omicron_common::api::external::Error; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use serde::Deserialize; use serde::Serialize; use steno::ActionError; @@ -161,7 +161,7 @@ async fn siia_begin_attach_ip_undo( async fn siia_get_instance_state( sagactx: NexusActionContext, -) -> Result, ActionError> { +) -> Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, @@ -177,7 +177,10 @@ async fn siia_nat( sagactx: NexusActionContext, ) -> Result, ActionError> { let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let sled_id = sagactx + .lookup::>("instance_state")? 
+ .map(|ids| ids.sled_id); + let target_ip = sagactx.lookup::("target_ip")?; instance_ip_add_nat( &sagactx, @@ -245,28 +248,18 @@ async fn siia_nat_undo( async fn siia_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - instance_ip_add_opte(&sagactx, ¶ms.authz_instance, sled_id, target_ip) - .await + instance_ip_add_opte(&sagactx, ids, target_ip).await } async fn siia_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - if let Err(e) = instance_ip_remove_opte( - &sagactx, - ¶ms.authz_instance, - sled_id, - target_ip, - ) - .await - { + if let Err(e) = instance_ip_remove_opte(&sagactx, ids, target_ip).await { error!(log, "siia_update_opte_undo: failed to notify sled-agent: {e}"); } Ok(()) @@ -436,8 +429,14 @@ pub(crate) mod test { } // Sled agent has a record of the new external IPs. + let VmmAndSledIds { vmm_id, .. } = + crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( + cptestctx, + &instance_id, + ) + .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips .iter() .any(|v| matches!(v, InstanceExternalIpBody::Floating(_)))); @@ -458,7 +457,7 @@ pub(crate) mod test { pub(crate) async fn verify_clean_slate( cptestctx: &ControlPlaneTestContext, - instance_id: Uuid, + instance_id: InstanceUuid, ) { use nexus_db_queries::db::schema::external_ip::dsl; @@ -471,7 +470,7 @@ pub(crate) mod test { assert!(dsl::external_ip .filter(dsl::kind.eq(IpKind::Floating)) .filter(dsl::time_deleted.is_null()) - .filter(dsl::parent_id.eq(instance_id)) + .filter(dsl::parent_id.eq(instance_id.into_untyped_uuid())) .filter(dsl::state.ne(IpAttachState::Detached)) .select(ExternalIp::as_select()) .first_async::(&*conn) @@ -492,8 +491,14 @@ pub(crate) mod test { .is_none()); // No IP bindings remain on sled-agent. + let VmmAndSledIds { vmm_id, .. 
} = + crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( + cptestctx, + &instance_id, + ) + .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(instance_id).or_default(); + let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips.is_empty()); } @@ -512,9 +517,10 @@ pub(crate) mod test { let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); crate::app::sagas::test_helpers::instance_simulate( cptestctx, - &InstanceUuid::from_untyped_uuid(instance.identity.id), + &instance_id, ) .await; @@ -522,7 +528,7 @@ pub(crate) mod test { test_helpers::action_failure_can_unwind::( nexus, || Box::pin(new_test_params(&opctx, datastore, use_float) ), - || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + || Box::pin(verify_clean_slate(&cptestctx, instance_id)), log, ) .await; @@ -544,9 +550,10 @@ pub(crate) mod test { let instance = create_instance(client, PROJECT_NAME, INSTANCE_NAME).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); crate::app::sagas::test_helpers::instance_simulate( cptestctx, - &InstanceUuid::from_untyped_uuid(instance.identity.id), + &instance_id, ) .await; @@ -558,7 +565,7 @@ pub(crate) mod test { >( nexus, || Box::pin(new_test_params(&opctx, datastore, use_float)), - || Box::pin(verify_clean_slate(&cptestctx, instance.id())), + || Box::pin(verify_clean_slate(&cptestctx, instance_id)), log, ) .await; diff --git a/nexus/src/app/sagas/instance_ip_detach.rs b/nexus/src/app/sagas/instance_ip_detach.rs index a5b51ce375..d9da9fc05c 100644 --- a/nexus/src/app/sagas/instance_ip_detach.rs +++ b/nexus/src/app/sagas/instance_ip_detach.rs @@ -5,7 +5,7 @@ use super::instance_common::{ instance_ip_add_nat, instance_ip_add_opte, instance_ip_get_instance_state, instance_ip_move_state, instance_ip_remove_nat, instance_ip_remove_opte, - ModifyStateForExternalIp, + ModifyStateForExternalIp, VmmAndSledIds, }; use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::sagas::declare_saga_actions; @@ -15,7 +15,7 @@ use nexus_db_model::IpAttachState; use nexus_db_queries::db::lookup::LookupPath; use nexus_types::external_api::views; use omicron_common::api::external::NameOrId; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use ref_cast::RefCast; use serde::Deserialize; use serde::Serialize; @@ -155,7 +155,7 @@ async fn siid_begin_detach_ip_undo( async fn siid_get_instance_state( sagactx: NexusActionContext, -) -> Result, ActionError> { +) -> Result, ActionError> { let params = sagactx.saga_params::()?; instance_ip_get_instance_state( &sagactx, @@ -168,7 +168,9 @@ async fn siid_get_instance_state( async fn siid_nat(sagactx: NexusActionContext) -> Result<(), ActionError> { let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let sled_id = sagactx + .lookup::>("instance_state")? + .map(|ids| ids.sled_id); let target_ip = sagactx.lookup::("target_ip")?; instance_ip_remove_nat( &sagactx, @@ -184,7 +186,9 @@ async fn siid_nat_undo( ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let sled_id = sagactx + .lookup::>("instance_state")? 
+ .map(|ids| ids.sled_id); let target_ip = sagactx.lookup::("target_ip")?; if let Err(e) = instance_ip_add_nat( &sagactx, @@ -204,33 +208,18 @@ async fn siid_nat_undo( async fn siid_update_opte( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - instance_ip_remove_opte( - &sagactx, - ¶ms.authz_instance, - sled_id, - target_ip, - ) - .await + instance_ip_remove_opte(&sagactx, ids, target_ip).await } async fn siid_update_opte_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let log = sagactx.user_data().log(); - let params = sagactx.saga_params::()?; - let sled_id = sagactx.lookup::>("instance_state")?; + let ids = sagactx.lookup::>("instance_state")?; let target_ip = sagactx.lookup::("target_ip")?; - if let Err(e) = instance_ip_add_opte( - &sagactx, - ¶ms.authz_instance, - sled_id, - target_ip, - ) - .await - { + if let Err(e) = instance_ip_add_opte(&sagactx, ids, target_ip).await { error!(log, "siid_update_opte_undo: failed to notify sled-agent: {e}"); } Ok(()) @@ -410,8 +399,14 @@ pub(crate) mod test { } // Sled agent has removed its records of the external IPs. + let VmmAndSledIds { vmm_id, .. } = + crate::app::sagas::test_helpers::instance_fetch_vmm_and_sled_ids( + cptestctx, + &instance_id, + ) + .await; let mut eips = sled_agent.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(vmm_id).or_default(); assert!(my_eips.is_empty()); // DB only has record for SNAT. diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 19bef2f046..24d11fcae2 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -437,20 +437,10 @@ async fn sim_ensure_destination_propolis_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - + let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; let dst_sled_id = sagactx.lookup::("dst_sled_id")?; let db_instance = sagactx.lookup::("set_migration_ids")?; - let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(db_instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; info!(osagactx.log(), "unregistering destination vmm for migration unwind"; "instance_id" => %db_instance.id(), @@ -465,7 +455,7 @@ async fn sim_ensure_destination_propolis_undo( // needed. 
match osagactx .nexus() - .instance_ensure_unregistered(&opctx, &authz_instance, &dst_sled_id) + .instance_ensure_unregistered(&dst_propolis_id, &dst_sled_id) .await { Ok(_) => Ok(()), @@ -500,12 +490,6 @@ async fn sim_instance_migrate( let src_propolis_id = db_instance.runtime().propolis_id.unwrap(); let dst_vmm = sagactx.lookup::("dst_vmm_record")?; - let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(db_instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - info!(osagactx.log(), "initiating migration from destination sled"; "instance_id" => %db_instance.id(), "dst_vmm_record" => ?dst_vmm, @@ -529,7 +513,6 @@ async fn sim_instance_migrate( .nexus() .instance_request_state( &opctx, - &authz_instance, &db_instance, &Some(dst_vmm), InstanceStateChangeRequest::Migrate( diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 55fc312ae7..b6b78bd43c 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -538,6 +538,7 @@ async fn sis_ensure_registered_undo( let params = sagactx.saga_params::()?; let datastore = osagactx.datastore(); let instance_id = InstanceUuid::from_untyped_uuid(params.db_instance.id()); + let propolis_id = sagactx.lookup::("propolis_id")?; let sled_id = sagactx.lookup::("sled_id")?; let opctx = crate::context::op_context_for_saga_action( &sagactx, @@ -546,11 +547,12 @@ async fn sis_ensure_registered_undo( info!(osagactx.log(), "start saga: unregistering instance from sled"; "instance_id" => %instance_id, + "propolis_id" => %propolis_id, "sled_id" => %sled_id); // Fetch the latest record so that this callee can drive the instance into // a Failed state if the unregister call fails. - let (.., authz_instance, db_instance) = LookupPath::new(&opctx, &datastore) + let (.., db_instance) = LookupPath::new(&opctx, &datastore) .instance_id(instance_id.into_untyped_uuid()) .fetch() .await @@ -563,7 +565,7 @@ async fn sis_ensure_registered_undo( // returned. if let Err(e) = osagactx .nexus() - .instance_ensure_unregistered(&opctx, &authz_instance, &sled_id) + .instance_ensure_unregistered(&propolis_id, &sled_id) .await { error!(osagactx.log(), @@ -644,7 +646,6 @@ async fn sis_ensure_running( ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - let datastore = osagactx.datastore(); let opctx = crate::context::op_context_for_saga_action( &sagactx, ¶ms.serialized_authn, @@ -659,17 +660,10 @@ async fn sis_ensure_running( "instance_id" => %instance_id, "sled_id" => %sled_id); - let (.., authz_instance) = LookupPath::new(&opctx, &datastore) - .instance_id(instance_id.into_untyped_uuid()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - match osagactx .nexus() .instance_request_state( &opctx, - &authz_instance, &db_instance, &Some(db_vmm), crate::app::instance::InstanceStateChangeRequest::Run, diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 5f226480b8..4c4c4deff2 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -30,10 +30,9 @@ //! Nexus' `cpapi_instances_put` internal API endpoint, when a Nexus' //! `instance-watcher` background task *pulls* instance states from sled-agents //! periodically, or as the return value of an API call from Nexus to a -//! sled-agent. 
When a Nexus receives a new [`SledInstanceState`] from a -//! sled-agent through any of these mechanisms, the Nexus will write any changed -//! state to the `vmm` and/or `migration` tables directly on behalf of the -//! sled-agent. +//! sled-agent. When a Nexus receives a new [`SledVmmState`] from a sled-agent +//! through any of these mechanisms, the Nexus will write any changed state to +//! the `vmm` and/or `migration` tables directly on behalf of the sled-agent. //! //! Although Nexus is technically the party responsible for the database query //! that writes VMM and migration state updates received from sled-agent, it is @@ -236,9 +235,9 @@ //! updates is perhaps the simplest one: _avoiding unnecessary update sagas_. //! The `cpapi_instances_put` API endpoint and instance-watcher background tasks //! handle changes to VMM and migration states by calling the -//! [`notify_instance_updated`] method, which writes the new states to the -//! database and (potentially) starts an update saga. Naively, this method would -//! *always* start an update saga, but remember that --- as we discussed +//! [`process_vmm_update`] method, which writes the new states to the database +//! and (potentially) starts an update saga. Naively, this method would *always* +//! start an update saga, but remember that --- as we discussed //! [above](#background) --- many VMM/migration state changes don't actually //! require modifying the instance record. For example, if an instance's VMM //! transitions from [`VmmState::Starting`] to [`VmmState::Running`], that @@ -271,7 +270,7 @@ //! delayed. To improve the timeliness of update sagas, we will also explicitly //! activate the background task at any point where we know that an update saga //! *should* run but we were not able to run it. If an update saga cannot be -//! started, whether by [`notify_instance_updated`], a `start-instance-update` +//! started, whether by [`notify_vmm_updated`], a `start-instance-update` //! saga attempting to start its real saga, or an `instance-update` saga //! chaining into a new one as its last action, the `instance-watcher` //! background task is activated. Similarly, when a `start-instance-update` saga @@ -326,7 +325,8 @@ //! crate::app::db::datastore::DataStore::instance_updater_inherit_lock //! [instance_updater_unlock]: //! crate::app::db::datastore::DataStore::instance_updater_unlock -//! [`notify_instance_updated`]: crate::app::Nexus::notify_instance_updated +//! [`notify_vmm_updated`]: crate::app::Nexus::notify_vmm_updated +//! [`process_vmm_update`]: crate::app::instance::process_vmm_update //! //! [dist-locking]: //! https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html @@ -362,7 +362,7 @@ use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; @@ -388,8 +388,8 @@ pub(crate) use self::start::{Params, SagaInstanceUpdate}; mod destroyed; /// Returns `true` if an `instance-update` saga should be executed as a result -/// of writing the provided [`SledInstanceState`] to the database with the -/// provided [`VmmStateUpdateResult`]. +/// of writing the provided [`SledVmmState`] to the database with the provided +/// [`VmmStateUpdateResult`]. 
/// /// We determine this only after actually updating the database records, /// because we don't know whether a particular VMM or migration state is @@ -407,8 +407,8 @@ mod destroyed; /// VMM/migration states. pub fn update_saga_needed( log: &slog::Logger, - instance_id: InstanceUuid, - state: &SledInstanceState, + propolis_id: PropolisUuid, + state: &SledVmmState, result: &VmmStateUpdateResult, ) -> bool { // Currently, an instance-update saga is required if (and only if): @@ -443,8 +443,7 @@ pub fn update_saga_needed( debug!(log, "new VMM runtime state from sled agent requires an \ instance-update saga"; - "instance_id" => %instance_id, - "propolis_id" => %state.propolis_id, + "propolis_id" => %propolis_id, "vmm_needs_update" => vmm_needs_update, "migration_in_needs_update" => migration_in_needs_update, "migration_out_needs_update" => migration_out_needs_update, diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 926b983460..bd3ae62996 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -41,6 +41,8 @@ pub mod region_replacement_finish; pub mod region_replacement_start; pub mod region_snapshot_replacement_garbage_collect; pub mod region_snapshot_replacement_start; +pub mod region_snapshot_replacement_step; +pub mod region_snapshot_replacement_step_garbage_collect; pub mod snapshot_create; pub mod snapshot_delete; pub mod test_saga; @@ -198,6 +200,12 @@ fn make_action_registry() -> ActionRegistry { ::register_actions( &mut registry, ); + ::register_actions( + &mut registry, + ); + ::register_actions( + &mut registry, + ); #[cfg(test)] ::register_actions(&mut registry); diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs new file mode 100644 index 0000000000..600bb155bf --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -0,0 +1,603 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Region snapshot replacement is distinct from region replacement: replacing +//! parts of a volume's read-only parent (and all the layers under it) is easier +//! because this does _not_ incur a live repair or reconciliation. Each part of +//! a read-only region set contains the same data that will never be modified. +//! +//! A region snapshot replacement request starts off in the "Requested" state, +//! just like a region replacement request. A background task will search for +//! region snapshot replacement requests in this state and trigger the "region +//! snapshot replacement start" saga. This will allocate a new region to replace +//! the requested one, and modify the snapshot VCR accordingly. If any disks are +//! then created using that snapshot as a source, they will have the replacement +//! and will not need a replace request. +//! +//! However, any past use of that snapshot as a source means that the Volume +//! created from that will have a copy of the unmodified snapshot Volume as a +//! read-only parent. Any construction of the Volume will be referencing the +//! replaced region snapshot (which could be gone if it is expunged). It is this +//! saga's responsibility to update all Volumes that reference the region +//! snapshot being replaced, and send a replacement request to any Upstairs that +//! were constructed. +//! +//! 
Some difficulty comes from the requirement to notify existing Upstairs that +//! reference the replaced read-only part, but even this is not as difficult as +//! region replacement: Nexus does not have to continually monitor and drive +//! either live repair or reconciliation, just ensure that the read-only +//! replacement occurs. Read-only replacements should be basically +//! instantaneous. +//! +//! A replace request only needs to be done once per Upstairs that has the old +//! reference. This is done as a "region snapshot replacement step", and once +//! all those are done, the region snapshot replacement request can be +//! "completed". +//! +//! Region snapshot replacement steps need to be written into the database and +//! have an associated state and operating saga id for the same reason that +//! region snapshot replacement requests do: multiple background tasks will +//! invoke multiple sagas, and there needs to be some exclusive access. +//! +//! See the documentation for the "region snapshot replacement step garbage +//! collect" saga for the next step in the process. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::db::datastore::ExistingTarget; +use crate::app::db::datastore::ReplacementTarget; +use crate::app::db::datastore::VolumeToDelete; +use crate::app::db::datastore::VolumeWithTarget; +use crate::app::db::lookup::LookupPath; +use crate::app::sagas::declare_saga_actions; +use crate::app::{authn, authz, db}; +use nexus_db_model::VmmState; +use nexus_types::identity::Resource; +use omicron_common::api::external::Error; +use propolis_client::types::ReplaceResult; +use serde::Deserialize; +use serde::Serialize; +use sled_agent_client::types::CrucibleOpts; +use sled_agent_client::types::VolumeConstructionRequest; +use std::net::SocketAddrV6; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement step saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + pub request: db::model::RegionSnapshotReplacementStep, +} + +// region snapshot replacement step saga: actions + +declare_saga_actions! 
{ + region_snapshot_replacement_step; + SET_SAGA_ID -> "unused_1" { + + rsrss_set_saga_id + - rsrss_set_saga_id_undo + } + CREATE_REPLACE_PARAMS -> "replace_params" { + + rsrss_create_replace_params + } + CREATE_FAKE_VOLUME -> "unused_2" { + + rssrs_create_fake_volume + - rssrs_create_fake_volume_undo + } + REPLACE_SNAPSHOT_IN_VOLUME -> "unused_3" { + + rsrss_replace_snapshot_in_volume + - rsrss_replace_snapshot_in_volume_undo + } + NOTIFY_UPSTAIRS -> "unused_4" { + + rsrss_notify_upstairs + } + UPDATE_REQUEST_RECORD -> "unused_5" { + + rsrss_update_request_record + } +} + +// region snapshot replacement step saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementStep; +impl NexusSaga for SagaRegionSnapshotReplacementStep { + const NAME: &'static str = "region-snapshot-replacement-step"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_step_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(Node::action( + "saga_id", + "GenerateSagaId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(Node::action( + "new_volume_id", + "GenerateNewVolumeId", + ACTION_GENERATE_ID.as_ref(), + )); + + builder.append(set_saga_id_action()); + builder.append(create_replace_params_action()); + builder.append(create_fake_volume_action()); + builder.append(replace_snapshot_in_volume_action()); + builder.append(notify_upstairs_action()); + builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement step saga: action implementations + +async fn rsrss_set_saga_id( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + // Change the request record here to an intermediate "running" state to + // block out other sagas that will be triggered for the same request. 
+ + osagactx + .datastore() + .set_region_snapshot_replacement_step_running( + &opctx, + params.request.id, + saga_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrss_set_saga_id_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + + osagactx + .datastore() + .undo_set_region_snapshot_replacement_step_running( + &opctx, + params.request.id, + saga_id, + ) + .await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize)] +struct ReplaceParams { + old_snapshot_address: SocketAddrV6, + new_region_address: SocketAddrV6, +} + +async fn rsrss_create_replace_params( + sagactx: NexusActionContext, +) -> Result { + let log = sagactx.user_data().log(); + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + // look up region snapshot replace request by id + + let region_snapshot_replace_request = osagactx + .datastore() + .get_region_snapshot_replacement_request_by_id( + &opctx, + params.request.request_id, + ) + .await + .map_err(ActionError::action_failed)?; + + let region_snapshot = osagactx + .datastore() + .region_snapshot_get( + region_snapshot_replace_request.old_dataset_id, + region_snapshot_replace_request.old_region_id, + region_snapshot_replace_request.old_snapshot_id, + ) + .await + .map_err(ActionError::action_failed)?; + + let Some(region_snapshot) = region_snapshot else { + return Err(ActionError::action_failed(format!( + "region snapshot {} {} {} deleted!", + region_snapshot_replace_request.old_dataset_id, + region_snapshot_replace_request.old_region_id, + region_snapshot_replace_request.old_snapshot_id, + ))); + }; + + let old_snapshot_address: SocketAddrV6 = + match region_snapshot.snapshot_addr.parse() { + Ok(addr) => addr, + + Err(e) => { + return Err(ActionError::action_failed(format!( + "parsing {} as SocketAddrV6 failed: {e}", + region_snapshot.snapshot_addr, + ))); + } + }; + + let Some(new_region_id) = region_snapshot_replace_request.new_region_id + else { + return Err(ActionError::action_failed(format!( + "request {} does not have a new_region_id!", + region_snapshot_replace_request.id, + ))); + }; + + let new_region_address = osagactx + .nexus() + .region_addr(&log, new_region_id) + .await + .map_err(ActionError::action_failed)?; + + Ok(ReplaceParams { old_snapshot_address, new_region_address }) +} + +async fn rssrs_create_fake_volume( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + + let new_volume_id = sagactx.lookup::("new_volume_id")?; + + // Create a fake volume record for the old snapshot target. This will be + // deleted after region snapshot replacement step saga has finished, and the + // region replacement snapshot gc step has run. It can be completely blank + // here, it will be replaced by `volume_replace_snapshot`. 
+ + let volume_construction_request = VolumeConstructionRequest::Volume { + id: new_volume_id, + block_size: 0, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: new_volume_id, + target: vec![], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: true, + }, + }], + read_only_parent: None, + }; + + let volume_data = serde_json::to_string(&volume_construction_request) + .map_err(|e| { + ActionError::action_failed(Error::internal_error(&e.to_string())) + })?; + + let volume = db::model::Volume::new(new_volume_id, volume_data); + + osagactx + .datastore() + .volume_create(volume) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rssrs_create_fake_volume_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + + // Delete the fake volume. + + let new_volume_id = sagactx.lookup::("new_volume_id")?; + osagactx.datastore().volume_hard_delete(new_volume_id).await?; + + Ok(()) +} + +async fn rsrss_replace_snapshot_in_volume( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let replace_params = sagactx.lookup::("replace_params")?; + + let new_volume_id = sagactx.lookup::("new_volume_id")?; + + // `volume_replace_snapshot` will swap the old snapshot for the new region. + // No repair or reconcilation needs to occur after this. + osagactx + .datastore() + .volume_replace_snapshot( + VolumeWithTarget(params.request.volume_id), + ExistingTarget(replace_params.old_snapshot_address), + ReplacementTarget(replace_params.new_region_address), + VolumeToDelete(new_volume_id), + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn rsrss_replace_snapshot_in_volume_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + + let replace_params = sagactx.lookup::("replace_params")?; + + let new_volume_id = sagactx.lookup::("new_volume_id")?; + + osagactx + .datastore() + .volume_replace_snapshot( + VolumeWithTarget(params.request.volume_id), + ExistingTarget(replace_params.new_region_address), + ReplacementTarget(replace_params.old_snapshot_address), + VolumeToDelete(new_volume_id), + ) + .await?; + + Ok(()) +} + +async fn rsrss_notify_upstairs( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let log = sagactx.user_data().log(); + + // Make an effort to notify a Propolis if one was booted for this volume. + // This is best effort: if there is a failure, this saga will unwind and be + // triggered again for the same request. If there is no Propolis booted for + // this volume, then there's nothing to be done: any future Propolis will + // receive the updated Volume. + // + // Unlike for region replacement, there's no step required here if there + // isn't an active Propolis: any Upstairs created after the snapshot_addr + // is replaced will reference the cloned data. + + let Some(disk) = osagactx + .datastore() + .disk_for_volume_id(params.request.volume_id) + .await + .map_err(ActionError::action_failed)? 
+ else { + return Ok(()); + }; + + let Some(instance_id) = disk.runtime().attach_instance_id else { + return Ok(()); + }; + + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(instance_id) + .lookup_for(authz::Action::Read) + .await + .map_err(ActionError::action_failed)?; + + let instance_and_vmm = osagactx + .datastore() + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)?; + + let Some(vmm) = instance_and_vmm.vmm() else { + return Ok(()); + }; + + let state = vmm.runtime.state; + + info!( + log, + "volume associated with disk attached to instance with vmm in \ + state {state}"; + "request id" => %params.request.id, + "volume id" => %params.request.volume_id, + "disk id" => ?disk.id(), + "instance id" => ?instance_id, + "vmm id" => ?vmm.id, + ); + + match &state { + VmmState::Running | VmmState::Rebooting => { + // Propolis server is ok to receive the volume replacement request. + } + + VmmState::Starting + | VmmState::Stopping + | VmmState::Stopped + | VmmState::Migrating + | VmmState::Failed + | VmmState::Destroyed + | VmmState::SagaUnwound => { + // Propolis server is not ok to receive volume replacement requests + // - unwind so that this saga can run again. + return Err(ActionError::action_failed(format!( + "vmm {} propolis not in a state to receive request", + vmm.id, + ))); + } + } + + let new_volume_vcr = match osagactx + .datastore() + .volume_get(params.request.volume_id) + .await + .map_err(ActionError::action_failed)? + { + Some(volume) => volume.data().to_string(), + + None => { + return Err(ActionError::action_failed(Error::internal_error( + "new volume is gone!", + ))); + } + }; + + let instance_lookup = + LookupPath::new(&opctx, &osagactx.datastore()).instance_id(instance_id); + + let (vmm, client) = osagactx + .nexus() + .propolis_client_for_instance( + &opctx, + &instance_lookup, + authz::Action::Modify, + ) + .await + .map_err(ActionError::action_failed)?; + + info!( + log, + "sending replacement request for disk volume to propolis"; + "request id" => %params.request.id, + "volume id" => %params.request.volume_id, + "disk id" => ?disk.id(), + "instance id" => ?instance_id, + "vmm id" => ?vmm.id, + ); + + let result = client + .instance_issue_crucible_vcr_request() + .id(disk.id()) + .body(propolis_client::types::InstanceVcrReplace { + name: disk.name().to_string(), + vcr_json: new_volume_vcr, + }) + .send() + .await + .map_err(|e| match e { + propolis_client::Error::ErrorResponse(rv) => { + ActionError::action_failed(rv.message.clone()) + } + + _ => ActionError::action_failed(format!( + "unexpected failure during \ + `instance_issue_crucible_vcr_request`: {e}", + )), + })?; + + let replace_result = result.into_inner(); + + info!( + log, + "saw replace result {replace_result:?}"; + "request id" => %params.request.id, + "volume id" => %params.request.volume_id, + "disk id" => ?disk.id(), + "instance id" => ?instance_id, + "vmm id" => ?vmm.id, + ); + + match &replace_result { + ReplaceResult::Started => { + // This saga's call just started the replacement + } + + ReplaceResult::StartedAlready => { + // A previous run of this saga (or saga node) started the + // replacement + } + + ReplaceResult::CompletedAlready => { + // It's done! We see this if the same propolis that received the + // original replace request started and finished the replacement. 
+ } + + ReplaceResult::VcrMatches => { + // This propolis booted with the updated VCR + } + + ReplaceResult::Missing => { + // The volume does not contain the region to be replaced. This is an + // error! + return Err(ActionError::action_failed(String::from( + "saw ReplaceResult::Missing", + ))); + } + } + + Ok(()) +} + +async fn rsrss_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + let saga_id = sagactx.lookup::("saga_id")?; + let new_volume_id = sagactx.lookup::("new_volume_id")?; + + // Update the request record to 'Completed' and clear the operating saga id. + // There is no undo step for this, it should succeed idempotently. + datastore + .set_region_snapshot_replacement_step_complete( + &opctx, + params.request.id, + saga_id, + new_volume_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs new file mode 100644 index 0000000000..93335b6125 --- /dev/null +++ b/nexus/src/app/sagas/region_snapshot_replacement_step_garbage_collect.rs @@ -0,0 +1,233 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Delete the volume that stashes the target replaced during a region snapshot +//! replacement step saga. After that's done, change the region snapshot +//! replacement step's state to "VolumeDeleted". + +use super::{ActionRegistry, NexusActionContext, NexusSaga, SagaInitError}; +use crate::app::sagas::declare_saga_actions; +use crate::app::sagas::volume_delete; +use crate::app::{authn, db}; +use serde::Deserialize; +use serde::Serialize; +use steno::ActionError; +use steno::Node; +use uuid::Uuid; + +// region snapshot replacement step garbage collect saga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub serialized_authn: authn::saga::Serialized, + /// The fake volume created for the snapshot that was replaced + // Note: this is only required in the params to build the volume-delete sub + // saga + pub old_snapshot_volume_id: Uuid, + pub request: db::model::RegionSnapshotReplacementStep, +} + +// region snapshot replacement step garbage collect saga: actions + +declare_saga_actions! 
{ + region_snapshot_replacement_step_garbage_collect; + UPDATE_REQUEST_RECORD -> "unused_1" { + + srsgs_update_request_record + } +} + +// region snapshot replacement step garbage collect saga: definition + +#[derive(Debug)] +pub(crate) struct SagaRegionSnapshotReplacementStepGarbageCollect; +impl NexusSaga for SagaRegionSnapshotReplacementStepGarbageCollect { + const NAME: &'static str = + "region-snapshot-replacement-step-garbage-collect"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + region_snapshot_replacement_step_garbage_collect_register_actions( + registry, + ); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + let subsaga_params = volume_delete::Params { + serialized_authn: params.serialized_authn.clone(), + volume_id: params.old_snapshot_volume_id, + }; + + let subsaga_dag = { + let subsaga_builder = steno::DagBuilder::new(steno::SagaName::new( + volume_delete::SagaVolumeDelete::NAME, + )); + volume_delete::SagaVolumeDelete::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? + }; + + builder.append(Node::constant( + "params_for_volume_delete_subsaga", + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + "params_for_volume_delete_subsaga".to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "volume_delete_subsaga_no_result", + subsaga_dag, + "params_for_volume_delete_subsaga", + )); + + builder.append(update_request_record_action()); + + Ok(builder.build()?) + } +} + +// region snapshot replacement step garbage collect saga: action implementations + +async fn srsgs_update_request_record( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let params = sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); + let datastore = osagactx.datastore(); + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + + // Now that the region snapshot step volume has been deleted, update the + // replacement request record to 'VolumeDeleted'. There is no undo step for + // this, it should succeed idempotently. 
+ + datastore + .set_region_snapshot_replacement_step_volume_deleted( + &opctx, + params.request.id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +#[cfg(test)] +pub(crate) mod test { + use crate::app::sagas::region_snapshot_replacement_step_garbage_collect::*; + use nexus_db_model::RegionSnapshotReplacementStep; + use nexus_db_model::RegionSnapshotReplacementStepState; + use nexus_db_model::Volume; + use nexus_db_queries::authn::saga::Serialized; + use nexus_db_queries::context::OpContext; + use nexus_test_utils_macros::nexus_test; + use sled_agent_client::types::CrucibleOpts; + use sled_agent_client::types::VolumeConstructionRequest; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + #[nexus_test(server = crate::Server)] + async fn test_region_snapshot_replacement_step_garbage_collect_saga( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + // Manually insert required records + let old_snapshot_volume_id = Uuid::new_v4(); + + let volume_construction_request = VolumeConstructionRequest::Volume { + id: old_snapshot_volume_id, + block_size: 0, + sub_volumes: vec![VolumeConstructionRequest::Region { + block_size: 0, + blocks_per_extent: 0, + extent_count: 0, + gen: 0, + opts: CrucibleOpts { + id: old_snapshot_volume_id, + target: vec![ + // XXX if you put something here, you'll need a + // synthetic dataset record + ], + lossy: false, + flush_timeout: None, + key: None, + cert_pem: None, + key_pem: None, + root_cert_pem: None, + control: None, + read_only: false, + }, + }], + read_only_parent: None, + }; + + let volume_data = + serde_json::to_string(&volume_construction_request).unwrap(); + + datastore + .volume_create(Volume::new(old_snapshot_volume_id, volume_data)) + .await + .unwrap(); + + let mut request = + RegionSnapshotReplacementStep::new(Uuid::new_v4(), Uuid::new_v4()); + request.replacement_state = + RegionSnapshotReplacementStepState::Complete; + request.old_snapshot_volume_id = Some(old_snapshot_volume_id); + + datastore + .insert_region_snapshot_replacement_step(&opctx, request.clone()) + .await + .unwrap(); + + // Run the saga + let params = Params { + serialized_authn: Serialized::for_opctx(&opctx), + old_snapshot_volume_id, + request: request.clone(), + }; + + let _output = nexus + .sagas + .saga_execute::( + params, + ) + .await + .unwrap(); + + // Validate the state transition + let result = datastore + .get_region_snapshot_replacement_step_by_id(&opctx, request.id) + .await + .unwrap(); + + assert_eq!( + result.replacement_state, + RegionSnapshotReplacementStepState::VolumeDeleted + ); + + // Validate the Volume was deleted + assert!(datastore + .volume_get(old_snapshot_volume_id) + .await + .unwrap() + .is_none()); + } +} diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index eeb14091b2..540ab90e28 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -106,11 +106,12 @@ use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external; use omicron_common::api::external::Error; use omicron_common::retry_until_known_result; +use omicron_uuid_kinds::{GenericUuid, PropolisUuid, SledUuid}; use rand::{rngs::StdRng, RngCore, SeedableRng}; use serde::Deserialize; use serde::Serialize; use 
sled_agent_client::types::CrucibleOpts; -use sled_agent_client::types::InstanceIssueDiskSnapshotRequestBody; +use sled_agent_client::types::VmmIssueDiskSnapshotRequestBody; use sled_agent_client::types::VolumeConstructionRequest; use slog::info; use std::collections::BTreeMap; @@ -826,39 +827,43 @@ async fn ssc_send_snapshot_request_to_sled_agent( .await .map_err(ActionError::action_failed)?; - let sled_id = osagactx + let instance_and_vmm = osagactx .datastore() .instance_fetch_with_vmm(&opctx, &authz_instance) .await - .map_err(ActionError::action_failed)? - .sled_id(); + .map_err(ActionError::action_failed)?; + + let vmm = instance_and_vmm.vmm(); // If this instance does not currently have a sled, we can't continue this // saga - the user will have to reissue the snapshot request and it will get // run on a Pantry. - let Some(sled_id) = sled_id else { + let Some((propolis_id, sled_id)) = + vmm.as_ref().map(|vmm| (vmm.id, vmm.sled_id)) + else { return Err(ActionError::action_failed(Error::unavail( - "sled id is None!", + "instance no longer has an active VMM!", ))); }; info!(log, "asking for disk snapshot from Propolis via sled agent"; "disk_id" => %params.disk_id, "instance_id" => %attach_instance_id, + "propolis_id" => %propolis_id, "sled_id" => %sled_id); let sled_agent_client = osagactx .nexus() - .sled_client(&sled_id) + .sled_client(&SledUuid::from_untyped_uuid(sled_id)) .await .map_err(ActionError::action_failed)?; retry_until_known_result(log, || async { sled_agent_client - .instance_issue_disk_snapshot_request( - &attach_instance_id, + .vmm_issue_disk_snapshot_request( + &PropolisUuid::from_untyped_uuid(propolis_id), ¶ms.disk_id, - &InstanceIssueDiskSnapshotRequestBody { snapshot_id }, + &VmmIssueDiskSnapshotRequestBody { snapshot_id }, ) .await }) @@ -2151,12 +2156,15 @@ mod test { .await .unwrap(); - let sled_id = instance_state - .sled_id() - .expect("starting instance should have a sled"); + let vmm_state = instance_state + .vmm() + .as_ref() + .expect("starting instance should have a vmm"); + let propolis_id = PropolisUuid::from_untyped_uuid(vmm_state.id); + let sled_id = SledUuid::from_untyped_uuid(vmm_state.sled_id); let sa = nexus.sled_client(&sled_id).await.unwrap(); + sa.vmm_finish_transition(propolis_id).await; - sa.instance_finish_transition(instance.identity.id).await; let instance_state = nexus .datastore() .instance_fetch_with_vmm(&opctx, &authz_instance) diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index b9388a1116..1572ba4330 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -5,11 +5,8 @@ //! Helper functions for writing saga undo tests and working with instances in //! saga tests. 
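For orientation, saga integration tests typically drive the helpers in this module roughly as in the following sketch. This is illustrative only: the `cptestctx` and `instance_id` bindings are assumed to come from the enclosing `#[nexus_test]` test, and the assertion is hypothetical.

    // Sketch: poke the simulated sled agent, then re-read the instance's
    // state through the helpers defined in this module.
    test_helpers::instance_simulate(cptestctx, &instance_id).await;
    let db_state = test_helpers::instance_fetch(cptestctx, instance_id).await;
    assert!(db_state.vmm().is_some());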
-use super::NexusSaga; -use crate::{ - app::{saga::create_saga_dag, test_interfaces::TestInterfaces as _}, - Nexus, -}; +use super::{instance_common::VmmAndSledIds, NexusSaga}; +use crate::{app::saga::create_saga_dag, Nexus}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use camino::Utf8Path; use diesel::{ @@ -137,13 +134,14 @@ pub(crate) async fn instance_simulate( info!(&cptestctx.logctx.log, "Poking simulated instance"; "instance_id" => %instance_id); let nexus = &cptestctx.server.server_context().nexus; + let VmmAndSledIds { vmm_id, sled_id } = + instance_fetch_vmm_and_sled_ids(cptestctx, instance_id).await; let sa = nexus - .instance_sled_by_id(instance_id) + .sled_client(&sled_id) .await - .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; + sa.vmm_finish_transition(vmm_id).await; } pub(crate) async fn instance_single_step_on_sled( @@ -158,12 +156,14 @@ pub(crate) async fn instance_single_step_on_sled( "sled_id" => %sled_id, ); let nexus = &cptestctx.server.server_context().nexus; + let VmmAndSledIds { vmm_id, sled_id } = + instance_fetch_vmm_and_sled_ids(cptestctx, instance_id).await; let sa = nexus - .sled_client(sled_id) + .sled_client(&sled_id) .await - .expect("sled must exist to simulate a state change"); + .expect("instance must be on a sled to simulate a state change"); - sa.instance_single_step(instance_id.into_untyped_uuid()).await; + sa.vmm_single_step(vmm_id).await; } pub(crate) async fn instance_simulate_by_name( @@ -186,12 +186,14 @@ pub(crate) async fn instance_simulate_by_name( let instance_lookup = nexus.instance_lookup(&opctx, instance_selector).unwrap(); let (.., instance) = instance_lookup.fetch().await.unwrap(); + let instance_id = InstanceUuid::from_untyped_uuid(instance.id()); + let VmmAndSledIds { vmm_id, sled_id } = + instance_fetch_vmm_and_sled_ids(cptestctx, &instance_id).await; let sa = nexus - .instance_sled_by_id(&InstanceUuid::from_untyped_uuid(instance.id())) + .sled_client(&sled_id) .await - .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(instance.id()).await; + sa.vmm_finish_transition(vmm_id).await; } pub async fn instance_fetch( @@ -218,6 +220,21 @@ pub async fn instance_fetch( db_state } +pub(super) async fn instance_fetch_vmm_and_sled_ids( + cptestctx: &ControlPlaneTestContext, + instance_id: &InstanceUuid, +) -> VmmAndSledIds { + let instance_and_vmm = instance_fetch(cptestctx, *instance_id).await; + let vmm = instance_and_vmm + .vmm() + .as_ref() + .expect("can only fetch VMM and sled IDs for an active instance"); + + let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); + let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id); + VmmAndSledIds { vmm_id, sled_id } +} + pub async fn instance_fetch_all( cptestctx: &ControlPlaneTestContext, instance_id: InstanceUuid, diff --git a/nexus/src/app/snapshot.rs b/nexus/src/app/snapshot.rs index 040c9fc082..57b8edd1f0 100644 --- a/nexus/src/app/snapshot.rs +++ b/nexus/src/app/snapshot.rs @@ -109,7 +109,7 @@ impl super::Nexus { // If a Propolis _may_ exist, send the snapshot request there, // otherwise use the pantry. - !instance_state.vmm().is_some() + instance_state.vmm().is_none() } else { // This disk is not attached to an instance, use the pantry. 
true diff --git a/nexus/src/app/switch_port.rs b/nexus/src/app/switch_port.rs index 9726a59d33..b616531f53 100644 --- a/nexus/src/app/switch_port.rs +++ b/nexus/src/app/switch_port.rs @@ -30,6 +30,7 @@ impl super::Nexus { params: params::SwitchPortSettingsCreate, ) -> CreateResult { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + Self::switch_port_settings_validate(&params)?; //TODO race conditions on exists check versus update/create. // Normally I would use a DB lock here, but not sure what @@ -54,6 +55,36 @@ impl super::Nexus { } } + + // TODO: more validation wanted + fn switch_port_settings_validate( + params: &params::SwitchPortSettingsCreate, + ) -> CreateResult<()> { + for x in params.bgp_peers.values() { + for p in x.peers.iter() { + if let Some(ref key) = p.md5_auth_key { + if key.len() > 80 { + return Err(Error::invalid_value( + "md5_auth_key", + format!("md5 auth key for {} is longer than 80 characters", p.addr) + )); + } + for c in key.chars() { + if !c.is_ascii() || c.is_ascii_control() { + return Err(Error::invalid_value( + "md5_auth_key", + format!( + "md5 auth key for {} must be printable ascii", + p.addr + ), + )); + } + } + } + } + } + Ok(()) + } + pub async fn switch_port_settings_create( self: &Arc<Self>, opctx: &OpContext, diff --git a/nexus/src/app/test_interfaces.rs b/nexus/src/app/test_interfaces.rs index adfafa523d..9852225e8c 100644 --- a/nexus/src/app/test_interfaces.rs +++ b/nexus/src/app/test_interfaces.rs @@ -6,8 +6,7 @@ use async_trait::async_trait; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::lookup::LookupPath; use omicron_common::api::external::Error; -use omicron_uuid_kinds::GenericUuid; -use omicron_uuid_kinds::{InstanceUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use sled_agent_client::Client as SledAgentClient; use std::sync::Arc; use uuid::Uuid; @@ -19,25 +18,47 @@ pub use super::update::SpUpdater; pub use super::update::UpdateProgress; pub use gateway_client::types::SpType; +/// The information needed to talk to a sled agent about an instance that is +/// active on that sled. +pub struct InstanceSledAgentInfo { + /// The ID of the Propolis job to send to sled agent. + pub propolis_id: PropolisUuid, + + /// The ID of the sled where the Propolis job is running. + pub sled_id: SledUuid, + + /// A client for talking to the Propolis's host sled. + pub sled_client: Arc<SledAgentClient>, + + /// The ID of the instance's migration target Propolis, if it has one. + pub dst_propolis_id: Option<PropolisUuid>, +} + /// Exposes additional [`super::Nexus`] interfaces for use by the test suite #[async_trait] pub trait TestInterfaces { /// Access the Rack ID of the currently executing Nexus. fn rack_id(&self) -> Uuid; - /// Returns the SledAgentClient for an Instance from its id. We may also - /// want to split this up into instance_lookup_by_id() and instance_sled(), - /// but after all it's a test suite special to begin with. - async fn instance_sled_by_id( + /// Attempts to obtain the Propolis ID and sled agent information for an + /// instance. + /// + /// # Arguments + /// + /// - `id`: The ID of the instance of interest. + /// - `opctx`: An optional operation context to use for authorization + /// checks. If `None`, this routine supplies the default test opctx. + /// + /// # Return value + /// + /// - `Ok(Some(info))` if the instance has an active Propolis. + /// - `Ok(None)` if the instance has no active Propolis. + /// - `Err` if an error occurred.
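To make the new surface concrete, here is a sketch of how a test caller uses this interface, based on the call sites updated later in this change; the `nexus` and `instance_id` bindings are assumed from the surrounding test context.

    // Sketch: resolve the instance's active Propolis and sled agent in one
    // call, replacing the old instance_sled_by_id()/instance_sled_id() pair.
    let info = nexus
        .active_instance_info(&instance_id, None)
        .await
        .unwrap()
        .expect("instance must be on a sled to simulate a state change");
    info.sled_client.vmm_finish_transition(info.propolis_id).await;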
+ async fn active_instance_info( &self, id: &InstanceUuid, - ) -> Result>, Error>; - - async fn instance_sled_by_id_with_opctx( - &self, - id: &InstanceUuid, - opctx: &OpContext, - ) -> Result>, Error>; + opctx: Option<&OpContext>, + ) -> Result, Error>; /// Returns the SledAgentClient for the sled running an instance to which a /// disk is attached. @@ -46,18 +67,6 @@ pub trait TestInterfaces { id: &Uuid, ) -> Result>, Error>; - /// Returns the supplied instance's current active sled ID. - async fn instance_sled_id( - &self, - instance_id: &InstanceUuid, - ) -> Result, Error>; - - async fn instance_sled_id_with_opctx( - &self, - instance_id: &InstanceUuid, - opctx: &OpContext, - ) -> Result, Error>; - async fn set_disk_as_faulted(&self, disk_id: &Uuid) -> Result; fn set_samael_max_issue_delay(&self, max_issue_delay: chrono::Duration); @@ -69,30 +78,49 @@ impl TestInterfaces for super::Nexus { self.rack_id } - async fn instance_sled_by_id( + async fn active_instance_info( &self, id: &InstanceUuid, - ) -> Result>, Error> { - let opctx = OpContext::for_tests( - self.log.new(o!()), - Arc::clone(&self.db_datastore) - as Arc, - ); + opctx: Option<&OpContext>, + ) -> Result, Error> { + let local_opctx; + let opctx = match opctx { + Some(o) => o, + None => { + local_opctx = OpContext::for_tests( + self.log.new(o!()), + Arc::clone(&self.db_datastore) + as Arc, + ); + &local_opctx + } + }; - self.instance_sled_by_id_with_opctx(id, &opctx).await - } + let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) + .instance_id(id.into_untyped_uuid()) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await?; - async fn instance_sled_by_id_with_opctx( - &self, - id: &InstanceUuid, - opctx: &OpContext, - ) -> Result>, Error> { - let sled_id = self.instance_sled_id_with_opctx(id, opctx).await?; - if let Some(sled_id) = sled_id { - Ok(Some(self.sled_client(&sled_id).await?)) - } else { - Ok(None) - } + let state = self + .datastore() + .instance_fetch_with_vmm(opctx, &authz_instance) + .await?; + + let Some(vmm) = state.vmm() else { + return Ok(None); + }; + + let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id); + Ok(Some(InstanceSledAgentInfo { + propolis_id: PropolisUuid::from_untyped_uuid(vmm.id), + sled_id, + sled_client: self.sled_client(&sled_id).await?, + dst_propolis_id: state + .instance() + .runtime() + .dst_propolis_id + .map(PropolisUuid::from_untyped_uuid), + })) } async fn disk_sled_by_id( @@ -112,37 +140,11 @@ impl TestInterfaces for super::Nexus { let instance_id = InstanceUuid::from_untyped_uuid( db_disk.runtime().attach_instance_id.unwrap(), ); - self.instance_sled_by_id(&instance_id).await - } - - async fn instance_sled_id( - &self, - id: &InstanceUuid, - ) -> Result, Error> { - let opctx = OpContext::for_tests( - self.log.new(o!()), - Arc::clone(&self.db_datastore) - as Arc, - ); - - self.instance_sled_id_with_opctx(id, &opctx).await - } - - async fn instance_sled_id_with_opctx( - &self, - id: &InstanceUuid, - opctx: &OpContext, - ) -> Result, Error> { - let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) - .instance_id(id.into_untyped_uuid()) - .lookup_for(nexus_db_queries::authz::Action::Read) - .await?; Ok(self - .datastore() - .instance_fetch_with_vmm(opctx, &authz_instance) + .active_instance_info(&instance_id, Some(&opctx)) .await? 
- .sled_id()) + .map(|info| info.sled_client)) } async fn set_disk_as_faulted(&self, disk_id: &Uuid) -> Result { diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index 7fe1ed84a4..4a43698f00 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -71,7 +71,7 @@ async fn main() -> anyhow::Result<()> { let log = Logger::root(drain, slog::o!("unit" => "schema_updater")); let crdb_cfg = db::Config { url: args.url }; - let pool = Arc::new(db::Pool::new(&log, &crdb_cfg)); + let pool = Arc::new(db::Pool::new_single_host(&log, &crdb_cfg)); let schema_config = SchemaConfig { schema_dir: args.schema_directory }; let all_versions = AllSchemaVersions::load(&schema_config.schema_dir)?; diff --git a/nexus/src/context.rs b/nexus/src/context.rs index 95d69e0c88..8cb696c62f 100644 --- a/nexus/src/context.rs +++ b/nexus/src/context.rs @@ -11,9 +11,7 @@ use authn::external::token::HttpAuthnToken; use authn::external::HttpAuthnScheme; use camino::Utf8PathBuf; use chrono::Duration; -use internal_dns::ServiceName; use nexus_config::NexusConfig; -use nexus_config::PostgresConfigWithUrl; use nexus_config::SchemeName; use nexus_db_queries::authn::external::session_cookie::SessionStore; use nexus_db_queries::authn::ConsoleSessionWithSiloId; @@ -25,7 +23,6 @@ use oximeter::types::ProducerRegistry; use oximeter_instruments::http::{HttpService, LatencyTracker}; use slog::Logger; use std::env; -use std::str::FromStr; use std::sync::Arc; use uuid::Uuid; @@ -210,7 +207,7 @@ impl ServerContext { // nexus in dev for everyone // Set up DNS Client - let resolver = match config.deployment.internal_dns { + let (resolver, dns_addrs) = match config.deployment.internal_dns { nexus_config::InternalDns::FromSubnet { subnet } => { let az_subnet = Ipv6Subnet::::new(subnet.net().addr()); @@ -219,11 +216,21 @@ impl ServerContext { "Setting up resolver using DNS servers for subnet: {:?}", az_subnet ); - internal_dns::resolver::Resolver::new_from_subnet( - log.new(o!("component" => "DnsResolver")), - az_subnet, + let resolver = + internal_dns::resolver::Resolver::new_from_subnet( + log.new(o!("component" => "DnsResolver")), + az_subnet, + ) + .map_err(|e| { + format!("Failed to create DNS resolver: {}", e) + })?; + + ( + resolver, + internal_dns::resolver::Resolver::servers_from_subnet( + az_subnet, + ), ) - .map_err(|e| format!("Failed to create DNS resolver: {}", e))? } nexus_config::InternalDns::FromAddress { address } => { info!( @@ -231,56 +238,33 @@ impl ServerContext { "Setting up resolver using DNS address: {:?}", address ); - internal_dns::resolver::Resolver::new_from_addrs( - log.new(o!("component" => "DnsResolver")), - &[address], - ) - .map_err(|e| format!("Failed to create DNS resolver: {}", e))? 
+ let resolver = + internal_dns::resolver::Resolver::new_from_addrs( + log.new(o!("component" => "DnsResolver")), + &[address], + ) + .map_err(|e| { + format!("Failed to create DNS resolver: {}", e) + })?; + + (resolver, vec![address]) } }; - // Set up DB pool - let url = match &config.deployment.database { - nexus_config::Database::FromUrl { url } => url.clone(), + let pool = match &config.deployment.database { + nexus_config::Database::FromUrl { url } => { + info!(log, "Setting up qorb pool from a single host"; "url" => #?url); + db::Pool::new_single_host( + &log, + &db::Config { url: url.clone() }, + ) + } nexus_config::Database::FromDns => { - info!(log, "Accessing DB url from DNS"); - // It's been requested but unfortunately not supported to - // directly connect using SRV based lookup. - // TODO-robustness: the set of cockroachdb hosts we'll use will - // be fixed to whatever we got back from DNS at Nexus start. - // This means a new cockroachdb instance won't picked up until - // Nexus restarts. - let addrs = loop { - match resolver - .lookup_all_socket_v6(ServiceName::Cockroach) - .await - { - Ok(addrs) => break addrs, - Err(e) => { - warn!( - log, - "Failed to lookup cockroach addresses: {e}" - ); - tokio::time::sleep(std::time::Duration::from_secs( - 1, - )) - .await; - } - } - }; - let addrs_str = addrs - .iter() - .map(ToString::to_string) - .collect::>() - .join(","); - info!(log, "DB addresses: {}", addrs_str); - PostgresConfigWithUrl::from_str(&format!( - "postgresql://root@{addrs_str}/omicron?sslmode=disable", - )) - .map_err(|e| format!("Cannot parse Postgres URL: {}", e))? + info!(log, "Setting up qorb pool from DNS"; "dns_addrs" => #?dns_addrs); + db::Pool::new(&log, dns_addrs) } }; - let pool = db::Pool::new(&log, &db::Config { url }); + let nexus = Nexus::new_with_id( rack_id, log.new(o!("component" => "nexus")), diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 9965b6e21e..66a8090f11 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -52,7 +52,7 @@ use omicron_common::api::internal::nexus::ProducerRegistrationResponse; use omicron_common::api::internal::nexus::RepairFinishInfo; use omicron_common::api::internal::nexus::RepairProgress; use omicron_common::api::internal::nexus::RepairStartInfo; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::update::ArtifactId; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -168,8 +168,8 @@ impl NexusInternalApi for NexusInternalApiImpl { async fn cpapi_instances_put( rqctx: RequestContext, - path_params: Path, - new_runtime_state: TypedBody, + path_params: Path, + new_runtime_state: TypedBody, ) -> Result { let apictx = &rqctx.context().context; let nexus = &apictx.nexus; @@ -178,11 +178,7 @@ impl NexusInternalApi for NexusInternalApiImpl { let opctx = crate::context::op_context_for_internal_api(&rqctx).await; let handler = async { nexus - .notify_instance_updated( - &opctx, - InstanceUuid::from_untyped_uuid(path.instance_id), - &new_state, - ) + .notify_vmm_updated(&opctx, path.propolis_id, &new_state) .await?; Ok(HttpResponseUpdatedNoContent()) }; diff --git a/nexus/src/populate.rs b/nexus/src/populate.rs index 4fcb126356..f026b1b504 100644 --- a/nexus/src/populate.rs +++ b/nexus/src/populate.rs @@ -380,7 +380,7 @@ mod test { let logctx = dev::test_setup_log("test_populator"); let mut db = 
test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; - let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); + let pool = Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); let datastore = Arc::new( db::DataStore::new(&logctx.log, pool, None).await.unwrap(), ); @@ -422,19 +422,13 @@ mod test { }) .unwrap(); - // Test again with the database offline. In principle we could do this - // immediately without creating a new pool and datastore. However, the - // pool's default behavior is to wait 30 seconds for a connection, which - // makes this test take a long time. (See the note in - // nexus/src/db/pool.rs about this.) So let's create a pool with an - // arbitrarily short timeout now. (We wouldn't want to do this above - // because we do want to wait a bit when we expect things to work, in - // case the test system is busy.) + // Test again with the database offline. In principle we could do this + // immediately without creating a new pool and datastore. // - // Anyway, if we try again with a broken database, we should get a + // If we try again with a broken database, we should get a // ServiceUnavailable error, which indicates a transient failure. let pool = - Arc::new(db::Pool::new_failfast_for_tests(&logctx.log, &cfg)); + Arc::new(db::Pool::new_single_host_failfast(&logctx.log, &cfg)); // We need to create the datastore before tearing down the database, as // it verifies the schema version of the DB while booting. let datastore = Arc::new( diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index d9cbb5eb34..6859e992ca 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -139,6 +139,7 @@ instance_updater.disable = true instance_updater.period_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 +region_snapshot_replacement_step.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 234ab5f382..fe6aab2770 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -188,12 +188,13 @@ async fn set_instance_state( } async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { - let sa = nexus - .instance_sled_by_id(id) + let info = nexus + .active_instance_info(id, None) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + + info.sled_client.vmm_finish_transition(info.propolis_id).await; } #[nexus_test] diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index eb3c88eb38..a7228e0841 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -780,12 +780,13 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { let instance_next = instance_get(&client, &instance_url).await; assert_eq!(instance_next.runtime.run_state, InstanceState::Running); - let original_sled = nexus - .instance_sled_id(&instance_id) + let sled_info = nexus + .active_instance_info(&instance_id, None) .await .unwrap() .expect("running instance should have a sled"); + let original_sled = sled_info.sled_id; let dst_sled_id = if original_sled == default_sled_id { other_sled_id } else { @@ -808,12 +809,13 @@ async fn test_instance_migrate(cptestctx: 
&ControlPlaneTestContext) { .parsed_body::() .unwrap(); - let current_sled = nexus - .instance_sled_id(&instance_id) + let new_sled_info = nexus + .active_instance_info(&instance_id, None) .await .unwrap() .expect("running instance should have a sled"); + let current_sled = new_sled_info.sled_id; assert_eq!(current_sled, original_sled); // Ensure that both sled agents report that the migration is in progress. @@ -840,6 +842,15 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(migration.target_state, MigrationState::Pending.into()); assert_eq!(migration.source_state, MigrationState::Pending.into()); + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("instance should be on a sled"); + let src_propolis_id = info.propolis_id; + let dst_propolis_id = + info.dst_propolis_id.expect("instance should have a migration target"); + // Simulate the migration. We will use `instance_single_step_on_sled` to // single-step both sled-agents through the migration state machine and // ensure that the migration state looks nice at each step. @@ -847,15 +858,15 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { cptestctx, nexus, original_sled, - instance_id, + src_propolis_id, migration_id, ) .await; // Move source to "migrating". - instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; - instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); @@ -865,9 +876,9 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move target to "migrating". - instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id) .await; - instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id) .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); @@ -877,7 +888,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move the source to "completed" - instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_simulate_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); @@ -887,15 +898,16 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move the target to "completed". 
- instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; let current_sled = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("migrated instance should still have a sled"); + .expect("migrated instance should still have a sled") + .sled_id; assert_eq!(current_sled, dst_sled_id); @@ -978,11 +990,13 @@ async fn test_instance_migrate_v2p_and_routes( .derive_guest_network_interface_info(&opctx, &authz_instance) .await .unwrap(); + let original_sled_id = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("running instance should have a sled"); + .expect("running instance should have a sled") + .sled_id; let mut sled_agents = vec![cptestctx.sled_agent.sled_agent.clone()]; sled_agents.extend(other_sleds.iter().map(|tup| tup.1.sled_agent.clone())); @@ -1035,25 +1049,35 @@ async fn test_instance_migrate_v2p_and_routes( .expect("since we've started a migration, the instance record must have a migration id!") }; + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("instance should be on a sled"); + let src_propolis_id = info.propolis_id; + let dst_propolis_id = + info.dst_propolis_id.expect("instance should have a migration target"); + // Tell both sled-agents to pretend to do the migration. instance_simulate_migration_source( cptestctx, nexus, original_sled_id, - instance_id, + src_propolis_id, migration_id, ) .await; - instance_simulate_on_sled(cptestctx, nexus, original_sled_id, instance_id) + vmm_simulate_on_sled(cptestctx, nexus, original_sled_id, src_propolis_id) .await; - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; let current_sled = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("migrated instance should have a sled"); + .expect("migrated instance should have a sled") + .sled_id; assert_eq!(current_sled, dst_sled_id); for sled_agent in &sled_agents { @@ -1373,10 +1397,11 @@ async fn test_instance_metrics_with_migration( // Request migration to the other sled. This reserves resources on the // target sled, but shouldn't change the virtual provisioning counters. let original_sled = nexus - .instance_sled_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("running instance should have a sled"); + .expect("running instance should have a sled") + .sled_id; let dst_sled_id = if original_sled == default_sled_id { other_sled_id @@ -1420,6 +1445,15 @@ async fn test_instance_metrics_with_migration( .expect("since we've started a migration, the instance record must have a migration id!") }; + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("instance should be on a sled"); + let src_propolis_id = info.propolis_id; + let dst_propolis_id = + info.dst_propolis_id.expect("instance should have a migration target"); + // Wait for the instance to be in the `Migrating` state. 
Otherwise, the // subsequent `instance_wait_for_state(..., Running)` may see the `Running` // state from the *old* VMM, rather than waiting for the migration to @@ -1428,13 +1462,13 @@ async fn test_instance_metrics_with_migration( cptestctx, nexus, original_sled, - instance_id, + src_propolis_id, migration_id, ) .await; - instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; - instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id) .await; instance_wait_for_state(&client, instance_id, InstanceState::Migrating) .await; @@ -1444,9 +1478,9 @@ async fn test_instance_metrics_with_migration( // Complete migration on the target. Simulated migrations always succeed. // After this the instance should be running and should continue to appear // to be provisioned. - instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + vmm_simulate_on_sled(cptestctx, nexus, original_sled, src_propolis_id) .await; - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; check_provisioning_state(4, 1).await; @@ -3337,10 +3371,11 @@ async fn test_disks_detached_when_instance_destroyed( let apictx = &cptestctx.server.server_context(); let nexus = &apictx.nexus; let sa = nexus - .instance_sled_by_id(&instance_id) + .active_instance_info(&instance_id, None) .await .unwrap() - .expect("instance should be on a sled while it's running"); + .expect("instance should be on a sled while it's running") + .sled_client; // Stop and delete instance instance_post(&client, instance_name, InstanceOp::Stop).await; @@ -5080,28 +5115,29 @@ pub async fn assert_sled_vpc_routes( /// instance, and then tell it to finish simulating whatever async transition is /// going on. pub async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { - let sa = nexus - .instance_sled_by_id(id) + let sled_info = nexus + .active_instance_info(id, None) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + + sled_info.sled_client.vmm_finish_transition(sled_info.propolis_id).await; } /// Simulate one step of an ongoing instance state transition. To do this, we /// have to look up the instance, then get the sled agent associated with that /// instance, and then tell it to finish simulating whatever async transition is /// going on. 
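For reference, the migration tests above drive a simulated migration through these helpers in roughly the following order. This is a condensed sketch of the updated call sequence (the intermediate state assertions are omitted, and the sled and Propolis ID bindings are assumed from the test setup).

    // Sketch: mark the source VMM as a migration source, step both VMMs into
    // "migrating", then let each side run its simulated transition to completion.
    instance_simulate_migration_source(
        cptestctx, nexus, original_sled, src_propolis_id, migration_id,
    )
    .await;
    vmm_single_step_on_sled(cptestctx, nexus, original_sled, src_propolis_id).await;
    vmm_single_step_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await;
    vmm_simulate_on_sled(cptestctx, nexus, original_sled, src_propolis_id).await;
    vmm_simulate_on_sled(cptestctx, nexus, dst_sled_id, dst_propolis_id).await;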
-async fn instance_single_step_on_sled( +async fn vmm_single_step_on_sled( cptestctx: &ControlPlaneTestContext, nexus: &Arc, sled_id: SledUuid, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) { info!(&cptestctx.logctx.log, "Single-stepping simulated instance on sled"; - "instance_id" => %instance_id, "sled_id" => %sled_id); + "propolis_id" => %propolis_id, "sled_id" => %sled_id); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_single_step(instance_id.into_untyped_uuid()).await; + sa.vmm_single_step(propolis_id).await; } pub async fn instance_simulate_with_opctx( @@ -5109,27 +5145,28 @@ pub async fn instance_simulate_with_opctx( id: &InstanceUuid, opctx: &OpContext, ) { - let sa = nexus - .instance_sled_by_id_with_opctx(id, opctx) + let sled_info = nexus + .active_instance_info(id, Some(opctx)) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + + sled_info.sled_client.vmm_finish_transition(sled_info.propolis_id).await; } /// Simulates state transitions for the incarnation of the instance on the /// supplied sled (which may not be the sled ID currently stored in the /// instance's CRDB record). -async fn instance_simulate_on_sled( +async fn vmm_simulate_on_sled( cptestctx: &ControlPlaneTestContext, nexus: &Arc, sled_id: SledUuid, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ) { info!(&cptestctx.logctx.log, "Poking simulated instance on sled"; - "instance_id" => %instance_id, "sled_id" => %sled_id); + "propolis_id" => %propolis_id, "sled_id" => %sled_id); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; + sa.vmm_finish_transition(propolis_id).await; } /// Simulates a migration source for the provided instance ID, sled ID, and @@ -5138,19 +5175,19 @@ async fn instance_simulate_migration_source( cptestctx: &ControlPlaneTestContext, nexus: &Arc, sled_id: SledUuid, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, migration_id: Uuid, ) { info!( &cptestctx.logctx.log, "Simulating migration source sled"; - "instance_id" => %instance_id, + "propolis_id" => %propolis_id, "sled_id" => %sled_id, "migration_id" => %migration_id, ); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_simulate_migration_source( - instance_id.into_untyped_uuid(), + sa.vmm_simulate_migration_source( + propolis_id, sled_agent_client::SimulateMigrationSource { migration_id, result: sled_agent_client::SimulatedMigrationResult::Success, diff --git a/nexus/tests/integration_tests/ip_pools.rs b/nexus/tests/integration_tests/ip_pools.rs index e872cc6fe3..f56755d85c 100644 --- a/nexus/tests/integration_tests/ip_pools.rs +++ b/nexus/tests/integration_tests/ip_pools.rs @@ -1344,12 +1344,12 @@ async fn test_ip_range_delete_with_allocated_external_ip_fails( .expect("Failed to stop instance"); // Simulate the transition, wait until it is in fact stopped. 
- let sa = nexus - .instance_sled_by_id(&instance_id) + let info = nexus + .active_instance_info(&instance_id, None) .await .unwrap() .expect("running instance should be on a sled"); - sa.instance_finish_transition(instance.identity.id).await; + info.sled_client.vmm_finish_transition(info.propolis_id).await; instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Delete the instance diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 3b808984ae..9f4652c2da 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -23,8 +23,11 @@ use nexus_types::external_api::views::OxqlQueryResult; use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use oximeter::types::Datum; +use oximeter::types::FieldValue; use oximeter::types::Measurement; use oximeter::TimeseriesSchema; +use std::borrow::Borrow; +use std::collections::HashMap; use uuid::Uuid; pub async fn query_for_metrics( @@ -344,7 +347,6 @@ async fn test_instance_watcher_metrics( ); }}; } - use oximeter::types::FieldValue; const INSTANCE_ID_FIELD: &str = "instance_id"; const STATE_FIELD: &str = "state"; const STATE_STARTING: &str = "starting"; @@ -589,6 +591,183 @@ async fn test_instance_watcher_metrics( assert_gte!(ts2_running, 2); } +#[nexus_test] +async fn test_mgs_metrics( + cptestctx: &ControlPlaneTestContext, +) { + // Make a MGS + let (mut mgs_config, sp_sim_config) = + gateway_test_utils::setup::load_test_config(); + let mgs = { + // munge the already-parsed MGS config file to point it at the test + // Nexus' address. + mgs_config.metrics = Some(gateway_test_utils::setup::MetricsConfig { + disabled: false, + dev_bind_loopback: true, + dev_nexus_address: Some(cptestctx.internal_client.bind_address), + }); + gateway_test_utils::setup::test_setup_with_config( + "test_mgs_metrics", + gateway_messages::SpPort::One, + mgs_config, + &sp_sim_config, + None, + ) + .await + }; + + // Let's look at all the simulated SP components in the config file which + // have sensor readings, so we can assert that there are timeseries for all + // of them. + let all_sp_configs = { + let gimlet_configs = + sp_sim_config.simulated_sps.gimlet.iter().map(|g| &g.common); + let sidecar_configs = + sp_sim_config.simulated_sps.sidecar.iter().map(|s| &s.common); + gimlet_configs.chain(sidecar_configs) + }; + // XXX(eliza): yes, this code is repetitive. We could probably make it a + // little elss ugly with nested hash maps, but like...I already wrote it, so + // you don't have to. :) + // + // TODO(eliza): presently, we just expect that the number of timeseries for + // each serial number and sensor type lines up. If we wanted to be *really* + // fancy, we could also assert that all the component IDs, component kinds, + // and measurement values line up with the config. But, honestly, it's + // pretty unlikely that a bug in MGS' sensor metrics subsystem would mess + // that up --- the most important thing is just to make sure that the sensor + // data is *present*, as that should catch most regressions. 
+ let mut temp_sensors = HashMap::new(); + let mut current_sensors = HashMap::new(); + let mut voltage_sensors = HashMap::new(); + let mut power_sensors = HashMap::new(); + let mut input_voltage_sensors = HashMap::new(); + let mut input_current_sensors = HashMap::new(); + let mut fan_speed_sensors = HashMap::new(); + for sp in all_sp_configs { + let mut temp = 0; + let mut current = 0; + let mut voltage = 0; + let mut input_voltage = 0; + let mut input_current = 0; + let mut power = 0; + let mut speed = 0; + for component in &sp.components { + for sensor in &component.sensors { + use gateway_messages::measurement::MeasurementKind as Kind; + match sensor.def.kind { + Kind::Temperature => temp += 1, + Kind::Current => current += 1, + Kind::Voltage => voltage += 1, + Kind::InputVoltage => input_voltage += 1, + Kind::InputCurrent => input_current += 1, + Kind::Speed => speed += 1, + Kind::Power => power += 1, + } + } + } + temp_sensors.insert(sp.serial_number.clone(), temp); + current_sensors.insert(sp.serial_number.clone(), current); + voltage_sensors.insert(sp.serial_number.clone(), voltage); + input_voltage_sensors.insert(sp.serial_number.clone(), input_voltage); + input_current_sensors.insert(sp.serial_number.clone(), input_current); + fan_speed_sensors.insert(sp.serial_number.clone(), speed); + power_sensors.insert(sp.serial_number.clone(), power); + } + + async fn check_all_timeseries_present( + cptestctx: &ControlPlaneTestContext, + name: &str, + expected: HashMap, + ) { + let metric_name = format!("hardware_component:{name}"); + eprintln!("\n=== checking timeseries for {metric_name} ===\n"); + + if expected.values().all(|&v| v == 0) { + eprintln!( + "-> SP sim config contains no {name} sensors, skipping it" + ); + return; + } + + let table = timeseries_query(&cptestctx, &format!("get {metric_name}")) + .await + .into_iter() + .find(|t| t.name() == metric_name); + let table = match table { + Some(table) => table, + None => panic!("missing table for {metric_name}"), + }; + + let mut found = expected + .keys() + .map(|serial| (serial.clone(), 0)) + .collect::>(); + for timeseries in table.timeseries() { + let fields = ×eries.fields; + let n_points = timeseries.points.len(); + assert!( + n_points > 0, + "{metric_name} timeseries {fields:?} should have points" + ); + let serial_str: &str = match timeseries.fields.get("chassis_serial") + { + Some(FieldValue::String(s)) => s.borrow(), + Some(x) => panic!( + "{metric_name} `chassis_serial` field should be a string, but got: {x:?}" + ), + None => { + panic!("{metric_name} timeseries should have a `chassis_serial` field") + } + }; + if let Some(count) = found.get_mut(serial_str) { + *count += 1; + } else { + panic!( + "{metric_name} timeseries had an unexpected chassis serial \ + number {serial_str:?} (not in the config file)", + ); + } + } + + eprintln!("-> {metric_name}: found timeseries: {found:#?}"); + assert_eq!( + found, expected, + "number of {metric_name} timeseries didn't match expected in {table:#?}", + ); + eprintln!("-> okay, looks good!"); + } + + // Wait until the MGS registers as a producer with Oximeter. + wait_for_producer(&cptestctx.oximeter, &mgs.gateway_id).await; + + // ...and collect its samples. 
+ cptestctx.oximeter.force_collect().await; + + check_all_timeseries_present(&cptestctx, "temperature", temp_sensors).await; + check_all_timeseries_present(&cptestctx, "voltage", voltage_sensors).await; + check_all_timeseries_present(&cptestctx, "current", current_sensors).await; + check_all_timeseries_present(&cptestctx, "power", power_sensors).await; + check_all_timeseries_present( + &cptestctx, + "input_voltage", + input_voltage_sensors, + ) + .await; + check_all_timeseries_present( + &cptestctx, + "input_current", + input_current_sensors, + ) + .await; + check_all_timeseries_present(&cptestctx, "fan_speed", fan_speed_sensors) + .await; + + // Because the `ControlPlaneTestContext` isn't managing the MGS we made for + // this test, we are responsible for removing its logs. + mgs.logctx.cleanup_successful(); +} + /// Wait until a producer is registered with Oximeter. /// /// This blocks until the producer is registered, for up to 60s. It panics if diff --git a/nexus/tests/integration_tests/pantry.rs b/nexus/tests/integration_tests/pantry.rs index d77ad49db6..22d35b01b5 100644 --- a/nexus/tests/integration_tests/pantry.rs +++ b/nexus/tests/integration_tests/pantry.rs @@ -88,12 +88,12 @@ async fn set_instance_state( } async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { - let sa = nexus - .instance_sled_by_id(id) + let info = nexus + .active_instance_info(id, None) .await .unwrap() .expect("instance must be on a sled to simulate a state change"); - sa.instance_finish_transition(id.into_untyped_uuid()).await; + info.sled_client.vmm_finish_transition(info.propolis_id).await; } async fn disk_get(client: &ClientTestContext, disk_url: &str) -> Disk { diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index bf73855ea7..5201b5c971 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -954,12 +954,12 @@ async fn dbinit_equals_sum_of_all_up() { // Create a connection pool after we apply the first schema version but // before applying the rest, and grab a connection from that pool. We'll use // it for an extra check later. - let pool = nexus_db_queries::db::Pool::new( + let pool = nexus_db_queries::db::Pool::new_single_host( log, &nexus_db_queries::db::Config { url: crdb.pg_config().clone() }, ); let conn_from_pool = - pool.pool().get().await.expect("failed to get pooled connection"); + pool.claim().await.expect("failed to get pooled connection"); // Go from the second version to the latest version. for version in all_versions.iter_versions().skip(1) { diff --git a/nexus/tests/integration_tests/sp_updater.rs b/nexus/tests/integration_tests/sp_updater.rs index 8314d22173..6e482bc1ad 100644 --- a/nexus/tests/integration_tests/sp_updater.rs +++ b/nexus/tests/integration_tests/sp_updater.rs @@ -434,9 +434,23 @@ async fn test_sp_updater_switches_mgs_instances_on_failure() { #[tokio::test] async fn test_sp_updater_delivers_progress() { // Start MGS + Sim SP. - let mgstestctx = - mgs_setup::test_setup("test_sp_updater_delivers_progress", SpPort::One) - .await; + let mgstestctx = { + let (mut mgs_config, sp_sim_config) = mgs_setup::load_test_config(); + // Enabling SP metrics collection makes this alread-flaky test even + // flakier, so let's just turn it off. + // TODO(eliza): it would be nice if we didn't have to disable metrics in + // this test, so that we can better catch regressions that could be + // introduced by the metrics subsystem... 
+ mgs_config.metrics.get_or_insert_with(Default::default).disabled = true; + mgs_setup::test_setup_with_config( + "test_sp_updater_delivers_progress", + SpPort::One, + mgs_config, + &sp_sim_config, + None, + ) + .await + }; // Configure an MGS client. let mut mgs_clients = diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index cc48f2646a..96de893fa3 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -27,20 +27,17 @@ use omicron_common::api::external::Generation; use omicron_common::disk::DiskIdentity; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_uuid_kinds::CollectionUuid; -use omicron_uuid_kinds::ExternalIpUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use schemars::JsonSchema; use serde::Deserialize; use serde::Serialize; -use slog_error_chain::SlogInlineError; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::fmt; use std::net::Ipv6Addr; use strum::EnumIter; use strum::IntoEnumIterator; -use thiserror::Error; use uuid::Uuid; mod blueprint_diff; @@ -595,13 +592,6 @@ fn zone_sort_key(z: &T) -> impl Ord { (z.kind(), z.id()) } -/// Errors from converting an [`OmicronZoneType`] into a [`BlueprintZoneType`]. -#[derive(Debug, Clone, Error, SlogInlineError)] -pub enum InvalidOmicronZoneType { - #[error("Omicron zone {} requires an external IP ID", kind.report_str())] - ExternalIpIdRequired { kind: ZoneKind }, -} - /// Describes one Omicron-managed zone in a blueprint. /// /// Part of [`BlueprintZonesConfig`]. @@ -616,168 +606,6 @@ pub struct BlueprintZoneConfig { pub zone_type: BlueprintZoneType, } -impl BlueprintZoneConfig { - /// Convert from an [`OmicronZoneConfig`]. - /// - /// This method is annoying to call correctly and will become more so over - /// time. Ideally we'd remove all callers and then remove this method, but - /// for now we keep it. - /// - /// # Errors - /// - /// If `config.zone_type` is a zone that has an external IP address (Nexus, - /// boundary NTP, external DNS), `external_ip_id` must be `Some(_)` or this - /// method will return an error. 
- pub fn from_omicron_zone_config( - config: OmicronZoneConfig, - disposition: BlueprintZoneDisposition, - external_ip_id: Option, - ) -> Result { - let kind = config.zone_type.kind(); - let zone_type = match config.zone_type { - OmicronZoneType::BoundaryNtp { - address, - dns_servers, - domain, - nic, - ntp_servers, - snat_cfg, - } => { - let external_ip_id = external_ip_id.ok_or( - InvalidOmicronZoneType::ExternalIpIdRequired { kind }, - )?; - BlueprintZoneType::BoundaryNtp( - blueprint_zone_type::BoundaryNtp { - address, - ntp_servers, - dns_servers, - domain, - nic, - external_ip: OmicronZoneExternalSnatIp { - id: external_ip_id, - snat_cfg, - }, - }, - ) - } - OmicronZoneType::Clickhouse { address, dataset } => { - BlueprintZoneType::Clickhouse(blueprint_zone_type::Clickhouse { - address, - dataset, - }) - } - OmicronZoneType::ClickhouseKeeper { address, dataset } => { - BlueprintZoneType::ClickhouseKeeper( - blueprint_zone_type::ClickhouseKeeper { address, dataset }, - ) - } - OmicronZoneType::ClickhouseServer { address, dataset } => { - BlueprintZoneType::ClickhouseServer( - blueprint_zone_type::ClickhouseServer { address, dataset }, - ) - } - OmicronZoneType::CockroachDb { address, dataset } => { - BlueprintZoneType::CockroachDb( - blueprint_zone_type::CockroachDb { address, dataset }, - ) - } - OmicronZoneType::Crucible { address, dataset } => { - BlueprintZoneType::Crucible(blueprint_zone_type::Crucible { - address, - dataset, - }) - } - OmicronZoneType::CruciblePantry { address } => { - BlueprintZoneType::CruciblePantry( - blueprint_zone_type::CruciblePantry { address }, - ) - } - OmicronZoneType::ExternalDns { - dataset, - dns_address, - http_address, - nic, - } => { - let external_ip_id = external_ip_id.ok_or( - InvalidOmicronZoneType::ExternalIpIdRequired { kind }, - )?; - BlueprintZoneType::ExternalDns( - blueprint_zone_type::ExternalDns { - dataset, - http_address, - dns_address: OmicronZoneExternalFloatingAddr { - id: external_ip_id, - addr: dns_address, - }, - nic, - }, - ) - } - OmicronZoneType::InternalDns { - dataset, - dns_address, - gz_address, - gz_address_index, - http_address, - } => BlueprintZoneType::InternalDns( - blueprint_zone_type::InternalDns { - dataset, - http_address, - dns_address, - gz_address, - gz_address_index, - }, - ), - OmicronZoneType::InternalNtp { - address, - dns_servers, - domain, - ntp_servers, - } => BlueprintZoneType::InternalNtp( - blueprint_zone_type::InternalNtp { - address, - ntp_servers, - dns_servers, - domain, - }, - ), - OmicronZoneType::Nexus { - external_dns_servers, - external_ip, - external_tls, - internal_address, - nic, - } => { - let external_ip_id = external_ip_id.ok_or( - InvalidOmicronZoneType::ExternalIpIdRequired { kind }, - )?; - BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - internal_address, - external_ip: OmicronZoneExternalFloatingIp { - id: external_ip_id, - ip: external_ip, - }, - nic, - external_tls, - external_dns_servers, - }) - } - OmicronZoneType::Oximeter { address } => { - BlueprintZoneType::Oximeter(blueprint_zone_type::Oximeter { - address, - }) - } - }; - Ok(Self { - disposition, - id: OmicronZoneUuid::from_untyped_uuid(config.id), - underlay_address: config.underlay_address, - filesystem_pool: config.filesystem_pool, - zone_type, - }) - } -} - impl From for OmicronZoneConfig { fn from(z: BlueprintZoneConfig) -> Self { Self { diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 8e4b6b3013..e5fd35d1e3 100644 --- 
a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -36,3 +36,13 @@ pub struct RegionSnapshotReplacementGarbageCollectStatus { pub garbage_collect_requested: Vec, pub errors: Vec, } + +/// The status of a `region_snapshot_replacement_step` background task +/// activation +#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] +pub struct RegionSnapshotReplacementStepStatus { + pub step_records_created_ok: Vec, + pub step_garbage_collect_invoked_ok: Vec, + pub step_invoked_ok: Vec, + pub errors: Vec, +} diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 54b4822e51..619a2187b5 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -746,44 +746,6 @@ } } }, - "/instances/{instance_id}": { - "put": { - "summary": "Report updated state for an instance.", - "operationId": "cpapi_instances_put", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SledInstanceState" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/instances/{instance_id}/migrate": { "post": { "operationId": "instance_migrate", @@ -1470,6 +1432,43 @@ } } }, + "/vmms/{propolis_id}": { + "put": { + "summary": "Report updated state for a VMM.", + "operationId": "cpapi_instances_put", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledVmmState" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/volume/{volume_id}/remove-read-only-parent": { "post": { "summary": "Request removal of a read_only_parent from a volume.", @@ -4443,6 +4442,13 @@ "enum": [ "instance" ] + }, + { + "description": "The producer is a management gateway service.", + "type": "string", + "enum": [ + "management_gateway" + ] } ] }, @@ -5055,50 +5061,6 @@ "id" ] }, - "SledInstanceState": { - "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", - "type": "object", - "properties": { - "migration_in": { - "nullable": true, - "description": "The current state of any inbound migration to this VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/MigrationRuntimeState" - } - ] - }, - "migration_out": { - "nullable": true, - "description": "The state of any outbound migration from this VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/MigrationRuntimeState" - } - ] - }, - "propolis_id": { - "description": "The ID of the VMM whose state is being reported.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, - "vmm_state": { - "description": "The most recent state of the sled's VMM process.", - "allOf": [ - { - "$ref": "#/components/schemas/VmmRuntimeState" - } - ] - } - }, - "required": [ - "propolis_id", - "vmm_state" - ] - }, "SledPolicy": { "description": "The 
operator-defined policy of a sled.", "oneOf": [ @@ -5213,6 +5175,41 @@ } ] }, + "SledVmmState": { + "description": "A wrapper type containing a sled's total knowledge of the state of a VMM.", + "type": "object", + "properties": { + "migration_in": { + "nullable": true, + "description": "The current state of any inbound migration to this VMM.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, + "migration_out": { + "nullable": true, + "description": "The state of any outbound migration from this VMM.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, + "vmm_state": { + "description": "The most recent state of the sled's VMM process.", + "allOf": [ + { + "$ref": "#/components/schemas/VmmRuntimeState" + } + ] + } + }, + "required": [ + "vmm_state" + ] + }, "SourceNatConfig": { "description": "An IP address and port range used for source NAT, i.e., making outbound network connections from guests or services.", "type": "object", @@ -5325,10 +5322,6 @@ "type": "string", "format": "uuid" }, - "TypedUuidForPropolisKind": { - "type": "string", - "format": "uuid" - }, "TypedUuidForSledKind": { "type": "string", "format": "uuid" @@ -5590,6 +5583,10 @@ ] } ] + }, + "TypedUuidForPropolisKind": { + "type": "string", + "format": "uuid" } }, "responses": { diff --git a/openapi/nexus.json b/openapi/nexus.json index 285dcd82bb..47f1f0822b 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -19335,6 +19335,7 @@ "type": "string" }, "lldp_link_config_id": { + "nullable": true, "description": "The link-layer discovery protocol service configuration id for this link.", "type": "string", "format": "uuid" @@ -19363,7 +19364,6 @@ "autoneg", "fec", "link_name", - "lldp_link_config_id", "mtu", "port_settings_id", "speed" @@ -19934,7 +19934,8 @@ "nanoseconds", "volts", "amps", - "degrees_celcius" + "watts", + "degrees_celsius" ] }, { diff --git a/openapi/oximeter.json b/openapi/oximeter.json index f596ac6ee6..327351d961 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -277,6 +277,13 @@ "enum": [ "instance" ] + }, + { + "description": "The producer is a management gateway service.", + "type": "string", + "enum": [ + "management_gateway" + ] } ] } diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 4c40fb5da0..ec2a8bfc4d 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -220,36 +220,17 @@ } } }, - "/instances/{instance_id}": { - "put": { - "operationId": "instance_register", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceEnsureBody" - } - } - }, - "required": true - }, + "/inventory": { + "get": { + "summary": "Fetch basic information about this sled", + "operationId": "inventory", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/Inventory" } } } @@ -261,26 +242,20 @@ "$ref": "#/components/responses/Error" } } - }, - "delete": { - "operationId": "instance_unregister", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + } + }, + "/network-bootstore-config": { + "get": { + "summary": "This API endpoint is 
only reading the local sled agent's view of the", + "description": "bootstore. The boostore is a distributed data store that is eventually consistent. Reads from individual nodes may not represent the latest state.", + "operationId": "read_network_bootstore_config_cache", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceUnregisterResponse" + "$ref": "#/components/schemas/EarlyNetworkConfig" } } } @@ -292,52 +267,22 @@ "$ref": "#/components/responses/Error" } } - } - }, - "/instances/{instance_id}/disks/{disk_id}/snapshot": { - "post": { - "summary": "Take a snapshot of a disk that is attached to an instance", - "operationId": "instance_issue_disk_snapshot_request", - "parameters": [ - { - "in": "path", - "name": "disk_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - }, - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "type": "string", - "format": "uuid" - } - } - ], + }, + "put": { + "operationId": "write_network_bootstore_config", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceIssueDiskSnapshotRequestBody" + "$ref": "#/components/schemas/EarlyNetworkConfig" } } }, "required": true }, "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceIssueDiskSnapshotRequestResponse" - } - } - } + "204": { + "description": "resource updated" }, "4XX": { "$ref": "#/components/responses/Error" @@ -348,33 +293,20 @@ } } }, - "/instances/{instance_id}/external-ip": { - "put": { - "operationId": "instance_put_external_ip", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstanceExternalIpBody" + "/omicron-physical-disks": { + "get": { + "operationId": "omicron_physical_disks_get", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" + } } } }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, "4XX": { "$ref": "#/components/responses/Error" }, @@ -383,31 +315,28 @@ } } }, - "delete": { - "operationId": "instance_delete_external_ip", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + "put": { + "operationId": "omicron_physical_disks_put", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceExternalIpBody" + "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" } } }, "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DisksManagementResult" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -418,26 +347,16 @@ } } }, - "/instances/{instance_id}/state": { + "/omicron-zones": { "get": { - "operationId": "instance_get_state", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": 
"#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + "operationId": "omicron_zones_get", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/OmicronZonesConfig" } } } @@ -451,37 +370,20 @@ } }, "put": { - "operationId": "instance_put_state", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], + "operationId": "omicron_zones_put", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstancePutStateBody" + "$ref": "#/components/schemas/OmicronZonesConfig" } } }, "required": true }, "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstancePutStateResponse" - } - } - } + "204": { + "description": "resource updated" }, "4XX": { "$ref": "#/components/responses/Error" @@ -492,17 +394,17 @@ } } }, - "/inventory": { + "/sled-identifiers": { "get": { - "summary": "Fetch basic information about this sled", - "operationId": "inventory", + "summary": "Fetch sled identifiers", + "operationId": "sled_identifiers", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Inventory" + "$ref": "#/components/schemas/SledIdentifiers" } } } @@ -516,18 +418,16 @@ } } }, - "/network-bootstore-config": { + "/sled-role": { "get": { - "summary": "This API endpoint is only reading the local sled agent's view of the", - "description": "bootstore. The boostore is a distributed data store that is eventually consistent. 
Reads from individual nodes may not represent the latest state.", - "operationId": "read_network_bootstore_config_cache", + "operationId": "sled_role_get", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EarlyNetworkConfig" + "$ref": "#/components/schemas/SledRole" } } } @@ -539,14 +439,17 @@ "$ref": "#/components/responses/Error" } } - }, + } + }, + "/sleds": { "put": { - "operationId": "write_network_bootstore_config", + "summary": "Add a sled to a rack that was already initialized via RSS", + "operationId": "sled_add", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EarlyNetworkConfig" + "$ref": "#/components/schemas/AddSledRequest" } } }, @@ -565,16 +468,42 @@ } } }, - "/omicron-physical-disks": { + "/switch-ports": { + "post": { + "operationId": "uplink_ensure", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SwitchPorts" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/timesync": { "get": { - "operationId": "omicron_physical_disks_get", + "operationId": "timesync_get", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" + "$ref": "#/components/schemas/TimeSync" } } } @@ -586,29 +515,24 @@ "$ref": "#/components/responses/Error" } } - }, - "put": { - "operationId": "omicron_physical_disks_put", + } + }, + "/update": { + "post": { + "operationId": "update_artifact", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OmicronPhysicalDisksConfig" + "$ref": "#/components/schemas/UpdateArtifactId" } } }, "required": true }, "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DisksManagementResult" - } - } - } + "204": { + "description": "resource updated" }, "4XX": { "$ref": "#/components/responses/Error" @@ -619,16 +543,21 @@ } } }, - "/omicron-zones": { + "/v2p": { "get": { - "operationId": "omicron_zones_get", + "summary": "List v2p mappings present on sled", + "operationId": "list_v2p", "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OmicronZonesConfig" + "title": "Array_of_VirtualNetworkInterfaceHost", + "type": "array", + "items": { + "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" + } } } } @@ -642,12 +571,13 @@ } }, "put": { - "operationId": "omicron_zones_put", + "summary": "Create a mapping from a virtual NIC to a physical host", + "operationId": "set_v2p", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/OmicronZonesConfig" + "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" } } }, @@ -664,23 +594,24 @@ "$ref": "#/components/responses/Error" } } - } - }, - "/sled-identifiers": { - "get": { - "summary": "Fetch sled identifiers", - "operationId": "sled_identifiers", - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SledIdentifiers" - } + }, + "delete": { + "summary": "Delete a mapping 
from a virtual NIC to a physical host", + "operationId": "del_v2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" } } }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, "4XX": { "$ref": "#/components/responses/Error" }, @@ -690,16 +621,36 @@ } } }, - "/sled-role": { - "get": { - "operationId": "sled_role_get", + "/vmms/{propolis_id}": { + "put": { + "operationId": "vmm_register", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InstanceEnsureBody" + } + } + }, + "required": true + }, "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SledRole" + "$ref": "#/components/schemas/SledVmmState" } } } @@ -711,25 +662,29 @@ "$ref": "#/components/responses/Error" } } - } - }, - "/sleds": { - "put": { - "summary": "Add a sled to a rack that was already initialized via RSS", - "operationId": "sled_add", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/AddSledRequest" - } + }, + "delete": { + "operationId": "vmm_unregister", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" } - }, - "required": true - }, + } + ], "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VmmUnregisterResponse" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -740,22 +695,49 @@ } } }, - "/switch-ports": { + "/vmms/{propolis_id}/disks/{disk_id}/snapshot": { "post": { - "operationId": "uplink_ensure", + "summary": "Take a snapshot of a disk that is attached to an instance", + "operationId": "vmm_issue_disk_snapshot_request", + "parameters": [ + { + "in": "path", + "name": "disk_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + }, + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SwitchPorts" + "$ref": "#/components/schemas/VmmIssueDiskSnapshotRequestBody" } } }, "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VmmIssueDiskSnapshotRequestResponse" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -766,20 +748,33 @@ } } }, - "/timesync": { - "get": { - "operationId": "timesync_get", - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/TimeSync" - } + "/vmms/{propolis_id}/external-ip": { + "put": { + "operationId": "vmm_put_external_ip", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], + "requestBody": { + "content": { + "application/json": { + 
"schema": { + "$ref": "#/components/schemas/InstanceExternalIpBody" } } }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, "4XX": { "$ref": "#/components/responses/Error" }, @@ -787,16 +782,24 @@ "$ref": "#/components/responses/Error" } } - } - }, - "/update": { - "post": { - "operationId": "update_artifact", + }, + "delete": { + "operationId": "vmm_delete_external_ip", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/UpdateArtifactId" + "$ref": "#/components/schemas/InstanceExternalIpBody" } } }, @@ -815,21 +818,26 @@ } } }, - "/v2p": { + "/vmms/{propolis_id}/state": { "get": { - "summary": "List v2p mappings present on sled", - "operationId": "list_v2p", + "operationId": "vmm_get_state", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + } + ], "responses": { "200": { "description": "successful operation", "content": { "application/json": { "schema": { - "title": "Array_of_VirtualNetworkInterfaceHost", - "type": "array", - "items": { - "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" - } + "$ref": "#/components/schemas/SledVmmState" } } } @@ -843,46 +851,37 @@ } }, "put": { - "summary": "Create a mapping from a virtual NIC to a physical host", - "operationId": "set_v2p", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" - } + "operationId": "vmm_put_state", + "parameters": [ + { + "in": "path", + "name": "propolis_id", + "required": true, + "schema": { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" } - } - }, - "delete": { - "summary": "Delete a mapping from a virtual NIC to a physical host", - "operationId": "del_v2p", + ], "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/VirtualNetworkInterfaceHost" + "$ref": "#/components/schemas/VmmPutStateBody" } } }, "required": true }, "responses": { - "204": { - "description": "resource updated" + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VmmPutStateResponse" + } + } + } }, "4XX": { "$ref": "#/components/responses/Error" @@ -2837,6 +2836,14 @@ } ] }, + "instance_id": { + "description": "The ID of the instance for which this VMM is being created.", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForInstanceKind" + } + ] + }, "instance_runtime": { "description": "The instance runtime state for the instance being registered.", "allOf": [ @@ -2857,14 +2864,6 @@ "description": "The address at which this VMM should serve a Propolis server API.", "type": "string" }, - "propolis_id": { - "description": "The ID of the VMM being registered. This may not be the active VMM ID in the instance runtime state (e.g. 
if the new VMM is going to be a migration target).", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, "vmm_runtime": { "description": "The initial VMM runtime state for the VMM being registered.", "allOf": [ @@ -2876,10 +2875,10 @@ }, "required": [ "hardware", + "instance_id", "instance_runtime", "metadata", "propolis_addr", - "propolis_id", "vmm_runtime" ] }, @@ -2985,30 +2984,6 @@ "source_nat" ] }, - "InstanceIssueDiskSnapshotRequestBody": { - "type": "object", - "properties": { - "snapshot_id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "snapshot_id" - ] - }, - "InstanceIssueDiskSnapshotRequestResponse": { - "type": "object", - "properties": { - "snapshot_id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "snapshot_id" - ] - }, "InstanceMetadata": { "description": "Metadata used to track statistics about an instance.", "type": "object", @@ -3052,187 +3027,71 @@ "properties": { "hostname": { "description": "RFC1035-compliant hostname for the instance.", - "allOf": [ - { - "$ref": "#/components/schemas/Hostname" - } - ] - }, - "memory": { - "$ref": "#/components/schemas/ByteCount" - }, - "ncpus": { - "$ref": "#/components/schemas/InstanceCpuCount" - } - }, - "required": [ - "hostname", - "memory", - "ncpus" - ] - }, - "InstancePutStateBody": { - "description": "The body of a request to move a previously-ensured instance into a specific runtime state.", - "type": "object", - "properties": { - "state": { - "description": "The state into which the instance should be driven.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceStateRequested" - } - ] - } - }, - "required": [ - "state" - ] - }, - "InstancePutStateResponse": { - "description": "The response sent from a request to move an instance into a specific runtime state.", - "type": "object", - "properties": { - "updated_runtime": { - "nullable": true, - "description": "The current runtime state of the instance after handling the request to change its state. 
If the instance's state did not change, this field is `None`.", - "allOf": [ - { - "$ref": "#/components/schemas/SledInstanceState" - } - ] - } - } - }, - "InstanceRuntimeState": { - "description": "The dynamic runtime properties of an instance: its current VMM ID (if any), migration information (if any), and the instance state to report if there is no active VMM.", - "type": "object", - "properties": { - "dst_propolis_id": { - "nullable": true, - "description": "If a migration is active, the ID of the target VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, - "gen": { - "description": "Generation number for this state.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - }, - "migration_id": { - "nullable": true, - "description": "If a migration is active, the ID of that migration.", - "type": "string", - "format": "uuid" - }, - "propolis_id": { - "nullable": true, - "description": "The instance's currently active VMM ID.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, - "time_updated": { - "description": "Timestamp for this information.", - "type": "string", - "format": "date-time" - } - }, - "required": [ - "gen", - "time_updated" - ] - }, - "InstanceStateRequested": { - "description": "Requestable running state of an Instance.\n\nA subset of [`omicron_common::api::external::InstanceState`].", - "oneOf": [ - { - "description": "Run this instance by migrating in from a previous running incarnation of the instance.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "migration_target" - ] - }, - "value": { - "$ref": "#/components/schemas/InstanceMigrationTargetParams" - } - }, - "required": [ - "type", - "value" - ] - }, - { - "description": "Start the instance if it is not already running.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "running" - ] - } - }, - "required": [ - "type" - ] - }, - { - "description": "Stop the instance.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "stopped" - ] + "allOf": [ + { + "$ref": "#/components/schemas/Hostname" } - }, - "required": [ - "type" ] }, - { - "description": "Immediately reset the instance, as though it had stopped and immediately began to run again.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "reboot" - ] - } - }, - "required": [ - "type" - ] + "memory": { + "$ref": "#/components/schemas/ByteCount" + }, + "ncpus": { + "$ref": "#/components/schemas/InstanceCpuCount" } + }, + "required": [ + "hostname", + "memory", + "ncpus" ] }, - "InstanceUnregisterResponse": { - "description": "The response sent from a request to unregister an instance.", + "InstanceRuntimeState": { + "description": "The dynamic runtime properties of an instance: its current VMM ID (if any), migration information (if any), and the instance state to report if there is no active VMM.", "type": "object", "properties": { - "updated_runtime": { + "dst_propolis_id": { "nullable": true, - "description": "The current state of the instance after handling the request to unregister it. 
If the instance's state did not change, this field is `None`.", + "description": "If a migration is active, the ID of the target VMM.", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForPropolisKind" + } + ] + }, + "gen": { + "description": "Generation number for this state.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "migration_id": { + "nullable": true, + "description": "If a migration is active, the ID of that migration.", + "type": "string", + "format": "uuid" + }, + "propolis_id": { + "nullable": true, + "description": "The instance's currently active VMM ID.", "allOf": [ { - "$ref": "#/components/schemas/SledInstanceState" + "$ref": "#/components/schemas/TypedUuidForPropolisKind" } ] + }, + "time_updated": { + "description": "Timestamp for this information.", + "type": "string", + "format": "date-time" } - } + }, + "required": [ + "gen", + "time_updated" + ] }, "Inventory": { "description": "Identity and basic status information about this sled agent", @@ -4667,8 +4526,27 @@ "sled_id" ] }, - "SledInstanceState": { - "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", + "SledRole": { + "description": "Describes the role of the sled within the rack.\n\nNote that this may change if the sled is physically moved within the rack.", + "oneOf": [ + { + "description": "The sled is a general compute sled.", + "type": "string", + "enum": [ + "gimlet" + ] + }, + { + "description": "The sled is attached to the network switch, and has additional responsibilities.", + "type": "string", + "enum": [ + "scrimlet" + ] + } + ] + }, + "SledVmmState": { + "description": "A wrapper type containing a sled's total knowledge of the state of a VMM.", "type": "object", "properties": { "migration_in": { @@ -4689,14 +4567,6 @@ } ] }, - "propolis_id": { - "description": "The ID of the VMM whose state is being reported.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, "vmm_state": { "description": "The most recent state of the sled's VMM process.", "allOf": [ @@ -4707,29 +4577,9 @@ } }, "required": [ - "propolis_id", "vmm_state" ] }, - "SledRole": { - "description": "Describes the role of the sled within the rack.\n\nNote that this may change if the sled is physically moved within the rack.", - "oneOf": [ - { - "description": "The sled is a general compute sled.", - "type": "string", - "enum": [ - "gimlet" - ] - }, - { - "description": "The sled is attached to the network switch, and has additional responsibilities.", - "type": "string", - "enum": [ - "scrimlet" - ] - } - ] - }, "Slot": { "description": "A stable index which is translated by Propolis into a PCI BDF, visible to the guest.\n\n
JSON schema\n\n```json { \"description\": \"A stable index which is translated by Propolis into a PCI BDF, visible to the guest.\", \"type\": \"integer\", \"format\": \"uint8\", \"minimum\": 0.0 } ```
", "type": "integer", @@ -4912,6 +4762,10 @@ "sync" ] }, + "TypedUuidForInstanceKind": { + "type": "string", + "format": "uuid" + }, "TypedUuidForPropolisKind": { "type": "string", "format": "uuid" @@ -4996,6 +4850,62 @@ "vni" ] }, + "VmmIssueDiskSnapshotRequestBody": { + "type": "object", + "properties": { + "snapshot_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "snapshot_id" + ] + }, + "VmmIssueDiskSnapshotRequestResponse": { + "type": "object", + "properties": { + "snapshot_id": { + "type": "string", + "format": "uuid" + } + }, + "required": [ + "snapshot_id" + ] + }, + "VmmPutStateBody": { + "description": "The body of a request to move a previously-ensured instance into a specific runtime state.", + "type": "object", + "properties": { + "state": { + "description": "The state into which the instance should be driven.", + "allOf": [ + { + "$ref": "#/components/schemas/VmmStateRequested" + } + ] + } + }, + "required": [ + "state" + ] + }, + "VmmPutStateResponse": { + "description": "The response sent from a request to move an instance into a specific runtime state.", + "type": "object", + "properties": { + "updated_runtime": { + "nullable": true, + "description": "The current runtime state of the instance after handling the request to change its state. If the instance's state did not change, this field is `None`.", + "allOf": [ + { + "$ref": "#/components/schemas/SledVmmState" + } + ] + } + } + }, "VmmRuntimeState": { "description": "The dynamic runtime properties of an individual VMM process.", "type": "object", @@ -5089,6 +4999,90 @@ } ] }, + "VmmStateRequested": { + "description": "Requestable running state of an Instance.\n\nA subset of [`omicron_common::api::external::InstanceState`].", + "oneOf": [ + { + "description": "Run this instance by migrating in from a previous running incarnation of the instance.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "migration_target" + ] + }, + "value": { + "$ref": "#/components/schemas/InstanceMigrationTargetParams" + } + }, + "required": [ + "type", + "value" + ] + }, + { + "description": "Start the instance if it is not already running.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "running" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Stop the instance.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "stopped" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Immediately reset the instance, as though it had stopped and immediately began to run again.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "reboot" + ] + } + }, + "required": [ + "type" + ] + } + ] + }, + "VmmUnregisterResponse": { + "description": "The response sent from a request to unregister an instance.", + "type": "object", + "properties": { + "updated_runtime": { + "nullable": true, + "description": "The current state of the instance after handling the request to unregister it. 
If the instance's state did not change, this field is `None`.", + "allOf": [ + { + "$ref": "#/components/schemas/SledVmmState" + } + ] + } + } + }, "Vni": { "description": "A Geneve Virtual Network Identifier", "type": "integer", @@ -5408,10 +5402,6 @@ "A", "B" ] - }, - "TypedUuidForInstanceKind": { - "type": "string", - "format": "uuid" } }, "responses": { diff --git a/oximeter/db/schema/replicated/11/timeseries-to-delete.txt b/oximeter/db/schema/replicated/11/timeseries-to-delete.txt new file mode 100644 index 0000000000..4f0301a6b5 --- /dev/null +++ b/oximeter/db/schema/replicated/11/timeseries-to-delete.txt @@ -0,0 +1,9 @@ +switch_table:capacity +switch_table:collisions +switch_table:delete_misses +switch_table:deletes +switch_table:exhaustion +switch_table:inserts +switch_table:occupancy +switch_table:update_misses +switch_table:updates diff --git a/oximeter/db/schema/single-node/11/timeseries-to-delete.txt b/oximeter/db/schema/single-node/11/timeseries-to-delete.txt new file mode 100644 index 0000000000..4f0301a6b5 --- /dev/null +++ b/oximeter/db/schema/single-node/11/timeseries-to-delete.txt @@ -0,0 +1,9 @@ +switch_table:capacity +switch_table:collisions +switch_table:delete_misses +switch_table:deletes +switch_table:exhaustion +switch_table:inserts +switch_table:occupancy +switch_table:update_misses +switch_table:updates diff --git a/oximeter/db/src/model.rs b/oximeter/db/src/model.rs index 7608f81e45..a3e9d109ff 100644 --- a/oximeter/db/src/model.rs +++ b/oximeter/db/src/model.rs @@ -45,7 +45,7 @@ use uuid::Uuid; /// - [`crate::Client::initialize_db_with_version`] /// - [`crate::Client::ensure_schema`] /// - The `clickhouse-schema-updater` binary in this crate -pub const OXIMETER_VERSION: u64 = 10; +pub const OXIMETER_VERSION: u64 = 11; // Wrapper type to represent a boolean in the database. // diff --git a/oximeter/oximeter/schema/hardware-component.toml b/oximeter/oximeter/schema/hardware-component.toml new file mode 100644 index 0000000000..30a1d6510f --- /dev/null +++ b/oximeter/oximeter/schema/hardware-component.toml @@ -0,0 +1,183 @@ +format_version = 1 + +[target] +name = "hardware_component" +description = "A hardware component on a compute sled, switch, or power shelf" +authz_scope = "fleet" +versions = [ + { version = 1, fields = [ + "rack_id", + "slot", + "chassis_kind", + "chassis_serial", + "chassis_model", + "chassis_revision", + "hubris_archive_id", + "gateway_id", + "component_kind", + "component_id", + "description", + ]} +] + +[fields.rack_id] +type = "uuid" +description = "ID of the rack on which this measurement was recorded." + +[fields.slot] +type = "u32" +description = """ +The cubby number or switch slot of the service processor reporting the \ +measurement""" + +[fields.chassis_model] +type = "string" +description = "Model number of the sled, switch, or power shelf" + +[fields.chassis_revision] +type = "u32" +description = "Revision number of the sled, switch, or power shelf" + +[fields.chassis_serial] +type = "string" +description = "Serial number of the sled, switch, or power shelf" + +[fields.hubris_archive_id] +type = "string" +description = """ +Hubris firmware archive ID of the service processor when the measurement \ +was recorded.""" + +[fields.gateway_id] +type = "uuid" +description = """ +ID of the Management Gateway Service process which recorded the measurement.""" + +[fields.chassis_kind] +type = "string" +description = """ +What kind of thing the component resides on. 
+ +This will be one of 'sled', for components on compute sleds; 'switch', for \ +components on rack switches; or 'power', for components on power shelves.""" + +[fields.component_id] +type = "string" +description = """ +The service processor component ID uniquely identifying the hardware \ +component on the sled, switch, or power shelf.""" + +[fields.component_kind] +type = "string" +description = "What type of hardware component this thing is." + +[fields.description] +type = "string" +description = """ +A human-readable description of the hardware component. This may include \ +its location or role in the system (e.g. a DIMM's number, or a temperature \ +sensor's location).""" + +[fields.sensor] +type = "string" +description = """The name of a sensor that recorded a sensor reading.""" + +[fields.error] +type = "string" +description = "The kind of sensor error that occurred" + +[fields.sensor_kind] +type = "string" +description = """ +Which kind of sensor could not be read due to a sensor error. + +This will be one of 'temperature', 'current', 'power', 'voltage', \ +'input_current', 'input_voltage', or 'fan_speed' (the same names as \ +the metrics emitted by these sensors when they are read successfully).""" + +[[metrics]] +name = "temperature" +description = "A temperature reading from a hardware component." +units = "degrees_celsius" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "current" +description = "Output current reading in amperes" +units = "amps" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "power" +description = "Power reading, in watts" +units = "watts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "voltage" +description = "Output voltage reading, in volts" +units = "volts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "input_current" +description = "Input electric current reading in amperes" +units = "amps" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "input_voltage" +description = "Input electric voltage reading, in volts" +units = "volts" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + + +[[metrics]] +name = "fan_speed" +description = "A fan speed measurement, in rotations per minute" +units = "rpm" +datum_type = "f32" +versions = [ + { added_in = 1, fields = ["sensor"]} +] + +[[metrics]] +name = "sensor_error_count" +description = "Cumulative count of errors reported by a sensor" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = ["sensor", "error", "sensor_kind"]} +] + +[[metrics]] +name = "poll_error_count" +description = """ +Cumulative count of errors encountered whilst polling a component's sensors. 
+ +Unlike the `sensor_error_count` metric, this counts errors encountered by \ +the management gateway while polling the component, rather than errors \ +reported by the component itself.""" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = ["error"] } +] diff --git a/oximeter/oximeter/schema/virtual-disk.toml b/oximeter/oximeter/schema/virtual-disk.toml new file mode 100644 index 0000000000..54cedae6e6 --- /dev/null +++ b/oximeter/oximeter/schema/virtual-disk.toml @@ -0,0 +1,127 @@ +format_version = 1 + +[target] +name = "virtual_disk" +description = "A virtual disk" +authz_scope = "project" +versions = [ + { version = 1, fields = [ "attached_instance_id", "block_size", "disk_id", "project_id", "silo_id", ] }, +] + +[fields.attached_instance_id] +type = "uuid" +description = "ID of the instance the disk is attached to" + +[fields.block_size] +type = "u32" +description = "Block size of the disk, in bytes" + +[fields.disk_id] +type = "uuid" +description = "ID of the disk" + +[fields.failure_reason] +type = "string" +description = "The reason an I/O operation failed" + +[fields.io_kind] +type = "string" +description = "The kind of I/O operation" + +[fields.project_id] +type = "uuid" +description = "ID of the project containing the disk" + +[fields.silo_id] +type = "uuid" +description = "ID for the silo containing the disk" + +[[metrics]] +name = "bytes_read" +description = "Number of bytes read from the disk" +units = "bytes" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [] } +] + +[[metrics]] +name = "reads" +description = "Total number of read operations from the disk" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [] } +] + +[[metrics]] +name = "failed_reads" +description = "Total number of failed read operations from the disk" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "failure_reason" ] } +] + +[[metrics]] +name = "bytes_written" +description = "Number of bytes written to the disk" +units = "bytes" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [] } +] + +[[metrics]] +name = "writes" +description = "Total number of write operations to the disk" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [] } +] + +[[metrics]] +name = "failed_writes" +description = "Total number of failed write operations to the disk" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "failure_reason" ] } +] + +[[metrics]] +name = "flushes" +description = "Total number of flush operations on the disk" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [] } +] + +[[metrics]] +name = "failed_flushes" +description = "Total number of failed flush operations on the disk" +units = "count" +datum_type = "cumulative_u64" +versions = [ + { added_in = 1, fields = [ "failure_reason" ] } +] + +[[metrics]] +name = "io_latency" +description = "Histogram of latency for I/O operations by kind" +units = "nanoseconds" +datum_type = "histogram_u64" +versions = [ + { added_in = 1, fields = [ "io_kind" ] } +] + +[[metrics]] +name = "io_size" +description = "Histogram of sizes for I/O operations by kind" +units = "bytes" +datum_type = "histogram_u64" +versions = [ + { added_in = 1, fields = [ "io_kind" ] } +] diff --git a/oximeter/schema/src/codegen.rs b/oximeter/schema/src/codegen.rs index 0429cf0534..1e6e352c15 100644 --- 
a/oximeter/schema/src/codegen.rs +++ b/oximeter/schema/src/codegen.rs @@ -512,8 +512,9 @@ fn quote_units(units: Units) -> TokenStream { } Units::Amps => quote! { ::oximeter::schema::Units::Amps }, Units::Volts => quote! { ::oximeter::schema::Units::Volts }, - Units::DegreesCelcius => { - quote! { ::oximeter::schema::Units::DegreesCelcius } + Units::Watts => quote! { ::oximeter::schema::Units::Watts }, + Units::DegreesCelsius => { + quote! { ::oximeter::schema::Units::DegreesCelsius } } Units::Rpm => quote! { ::oximeter::schema::Units::Rpm }, } diff --git a/oximeter/types/src/histogram.rs b/oximeter/types/src/histogram.rs index 0b85727ee0..2a4feab382 100644 --- a/oximeter/types/src/histogram.rs +++ b/oximeter/types/src/histogram.rs @@ -1029,8 +1029,13 @@ where return Err(QuantizationError::InvalidSteps); } - // The highest power must be representable in the target type. - if self.checked_pow(hi.into()).is_none() { + // The highest power must be representable in the target type. Note that + // we have to convert to that target type _before_ doing this check. + let base = >::from(*self); + let Some(highest) = base.checked_pow(hi.into()) else { + return Err(QuantizationError::Overflow); + }; + if ::from(highest).is_none() { return Err(QuantizationError::Overflow); } @@ -1039,7 +1044,6 @@ where // // Note that we unwrap in a few places below, where we're sure the // narrowing conversion cannot fail, such as to a u32. - let base = >::from(*self); let lo = >::from(lo); let hi = >::from(hi); let count = ::from(count.get()) @@ -1057,7 +1061,6 @@ where let lo = base.pow(lo as _); let hi = base.pow(hi as _); let distance = hi - lo; - dbg!(distance, count); distance.is_multiple_of(&count) }) } @@ -1767,4 +1770,31 @@ mod tests { HistogramError::EmptyBins )); } + + #[test] + fn test_log_linear_bins_does_not_overflow_wide_bin_type() { + let start: u16 = 3; + // 10u16 ** 10u16 overflows, but what we should be computing is 10u64 ** + // 10u16, which would not overflow. We need to compute whether it + // overflows in the _support_ type. + let stop = 10; + Histogram::::span_decades(start, stop).expect( + "expected not to overflow, since support type is wide enough", + ); + } + + #[test] + fn test_log_linear_bins_does_overflow_narrow_bin_type() { + // In this case, the start / stop powers _and_ their resulting bins are + // both representable as u16s and also u64s. But we're generating bins + // that are u8s, which _the powers do_ overflow. + let start: u16 = 1; + let stop: u16 = 4; + Histogram::::span_decades(start, stop).expect( + "expected not to overflow a u32, since support type is wide enough", + ); + Histogram::::span_decades(start, stop).expect_err( + "expected to overflow a u8, since support type is not wide enough", + ); + } } diff --git a/oximeter/types/src/schema.rs b/oximeter/types/src/schema.rs index 80aaa6f101..135c77462a 100644 --- a/oximeter/types/src/schema.rs +++ b/oximeter/types/src/schema.rs @@ -189,7 +189,8 @@ pub enum Units { Nanoseconds, Volts, Amps, - DegreesCelcius, + Watts, + DegreesCelsius, /// Rotations per minute. Rpm, } diff --git a/package-manifest.toml b/package-manifest.toml index 125861f610..cab3c1877e 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -710,8 +710,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. 
source.type = "prebuilt" source.repo = "dendrite" -source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" -source.sha256 = "3771671f0069b33143774e560eb258db99253dba9b78fa3ca974f02a8e1145b4" +source.commit = "76c735d472e3badaeca08982e22496fccb1ce210" +source.sha256 = "3ee6cfe770da2855b4eb44c048637d56f8d72de45c8c396186dfe7232d8548fa" output.type = "zone" output.intermediate_only = true @@ -737,8 +737,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" -source.sha256 = "ad02632713a57fe8c5371316320309e1fad52f0ce2f7e6f768859aa94dfbb1d9" +source.commit = "76c735d472e3badaeca08982e22496fccb1ce210" +source.sha256 = "0e68ea8fbb609bbe2c643fc8cadc0197bd641006a323149159893bfd0d816805" output.type = "zone" output.intermediate_only = true @@ -757,8 +757,8 @@ only_for_targets.image = "standard" # the other `source.*` keys. source.type = "prebuilt" source.repo = "dendrite" -source.commit = "21b16567f28e103f145cd18d53fac6958429c4ff" -source.sha256 = "23bca3873cdb0441cd18c0cf071b86d49755be06837479661876ac95d2f10f27" +source.commit = "76c735d472e3badaeca08982e22496fccb1ce210" +source.sha256 = "45484d6d8557a0656984d0e6db879589d841d43ab6a11116cb1da314b928a425" output.type = "zone" output.intermediate_only = true diff --git a/schema/crdb/add-management-gateway-producer-kind/up.sql b/schema/crdb/add-management-gateway-producer-kind/up.sql new file mode 100644 index 0000000000..e872278e2f --- /dev/null +++ b/schema/crdb/add-management-gateway-producer-kind/up.sql @@ -0,0 +1,2 @@ +ALTER TYPE omicron.public.producer_kind + ADD VALUE IF NOT EXISTS 'management_gateway' AFTER 'instance'; diff --git a/schema/crdb/collapse_lldp_settings/up2.sql b/schema/crdb/collapse_lldp_settings/up2.sql index b2d884d068..8ead8a29b4 100644 --- a/schema/crdb/collapse_lldp_settings/up2.sql +++ b/schema/crdb/collapse_lldp_settings/up2.sql @@ -1,4 +1,4 @@ /* * Add a pointer to this link's LLDP config settings. */ -ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS lldp_link_config_id UUID NOT NULL; +ALTER TABLE omicron.public.switch_port_settings_link_config ADD COLUMN IF NOT EXISTS lldp_link_config_id UUID; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index baef38e44f..d531672832 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1334,7 +1334,9 @@ CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( -- removed). 'service', -- A Propolis VMM for an instance in the omicron.public.instance table - 'instance' + 'instance', + -- A management gateway service on a scrimlet. 
+ 'management_gateway' ); /* @@ -2655,7 +2657,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.switch_port_settings_link_config ( fec omicron.public.switch_link_fec, speed omicron.public.switch_link_speed, autoneg BOOL NOT NULL DEFAULT false, - lldp_link_config_id UUID NOT NULL, + lldp_link_config_id UUID, PRIMARY KEY (port_settings_id, link_name) ); @@ -4212,7 +4214,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '90.0.0', NULL) + (TRUE, NOW(), NOW(), '92.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/lldp-link-config-nullable/up1.sql b/schema/crdb/lldp-link-config-nullable/up1.sql new file mode 100644 index 0000000000..c8e1122f68 --- /dev/null +++ b/schema/crdb/lldp-link-config-nullable/up1.sql @@ -0,0 +1,20 @@ +-- Refer to https://github.com/oxidecomputer/omicron/issues/6433 for the justification +-- behind this schema change. +-- +-- In short: the "collapse_lldp_settings" schema change was edited after +-- merging. That change included a schema change which added a non-null column +-- to an existing table. Such a data-modifying statement is only valid for +-- tables with no rows - however, in our test systems, we observed rows, which +-- prevented this schema change from progressing. +-- +-- To resolve: +-- 1. Within the old "collapse_lldp_settings" change, we retroactively dropped the +-- non-null constraint. For systems with populated +-- "switch_port_settings_link_config" tables, this allows the schema update to +-- complete without an error. +-- 2. Within this new "lldp-link-config-nullable" change, we ALSO dropped the +-- non-null constraint. For systems without populated +-- "switch_port_settings_link_config" tables -- which may have been able to +-- apply the "collapse_lldp_settings" change successfully -- this converges the state +-- of the database to the same outcome, where the column is nullable. 
+ALTER TABLE omicron.public.switch_port_settings_link_config ALTER COLUMN lldp_link_config_id DROP NOT NULL; diff --git a/schema/rss-service-plan-v4.json b/schema/rss-service-plan-v4.json new file mode 100644 index 0000000000..badfaf4589 --- /dev/null +++ b/schema/rss-service-plan-v4.json @@ -0,0 +1,999 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Plan", + "type": "object", + "required": [ + "dns_config", + "services" + ], + "properties": { + "dns_config": { + "$ref": "#/definitions/DnsConfigParams" + }, + "services": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/SledConfig" + } + } + }, + "definitions": { + "BlueprintZoneConfig": { + "description": "Describes one Omicron-managed zone in a blueprint.\n\nPart of [`BlueprintZonesConfig`].", + "type": "object", + "required": [ + "disposition", + "id", + "underlay_address", + "zone_type" + ], + "properties": { + "disposition": { + "description": "The disposition (desired state) of this zone recorded in the blueprint.", + "allOf": [ + { + "$ref": "#/definitions/BlueprintZoneDisposition" + } + ] + }, + "filesystem_pool": { + "anyOf": [ + { + "$ref": "#/definitions/ZpoolName" + }, + { + "type": "null" + } + ] + }, + "id": { + "$ref": "#/definitions/TypedUuidForOmicronZoneKind" + }, + "underlay_address": { + "type": "string", + "format": "ipv6" + }, + "zone_type": { + "$ref": "#/definitions/BlueprintZoneType" + } + } + }, + "BlueprintZoneDisposition": { + "description": "The desired state of an Omicron-managed zone in a blueprint.\n\nPart of [`BlueprintZoneConfig`].", + "oneOf": [ + { + "description": "The zone is in-service.", + "type": "string", + "enum": [ + "in_service" + ] + }, + { + "description": "The zone is not in service.", + "type": "string", + "enum": [ + "quiesced" + ] + }, + { + "description": "The zone is permanently gone.", + "type": "string", + "enum": [ + "expunged" + ] + } + ] + }, + "BlueprintZoneType": { + "oneOf": [ + { + "type": "object", + "required": [ + "address", + "dns_servers", + "external_ip", + "nic", + "ntp_servers", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dns_servers": { + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "domain": { + "type": [ + "string", + "null" + ] + }, + "external_ip": { + "$ref": "#/definitions/OmicronZoneExternalSnatIp" + }, + "nic": { + "description": "The service vNIC providing outbound connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "ntp_servers": { + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "type": "string", + "enum": [ + "boundary_ntp" + ] + } + } + }, + { + "description": "Used in single-node clickhouse setups", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_keeper" + ] + } + } + }, + { + "description": "Used in replicated clickhouse setups", + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": 
"#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "clickhouse_server" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "cockroach_db" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dataset", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "type": { + "type": "string", + "enum": [ + "crucible" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "crucible_pantry" + ] + } + } + }, + { + "type": "object", + "required": [ + "dataset", + "dns_address", + "http_address", + "nic", + "type" + ], + "properties": { + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "dns_address": { + "description": "The address at which the external DNS server is reachable.", + "allOf": [ + { + "$ref": "#/definitions/OmicronZoneExternalFloatingAddr" + } + ] + }, + "http_address": { + "description": "The address at which the external DNS server API is reachable.", + "type": "string" + }, + "nic": { + "description": "The service vNIC providing external connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "external_dns" + ] + } + } + }, + { + "type": "object", + "required": [ + "dataset", + "dns_address", + "gz_address", + "gz_address_index", + "http_address", + "type" + ], + "properties": { + "dataset": { + "$ref": "#/definitions/OmicronZoneDataset" + }, + "dns_address": { + "type": "string" + }, + "gz_address": { + "description": "The addresses in the global zone which should be created\n\nFor the DNS service, which exists outside the sleds's typical subnet - adding an address in the GZ is necessary to allow inter-zone traffic routing.", + "type": "string", + "format": "ipv6" + }, + "gz_address_index": { + "description": "The address is also identified with an auxiliary bit of information to ensure that the created global zone address can have a unique name.", + "type": "integer", + "format": "uint32", + "minimum": 0.0 + }, + "http_address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "internal_dns" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "dns_servers", + "ntp_servers", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "dns_servers": { + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "domain": { + "type": [ + "string", + "null" + ] + }, + "ntp_servers": { + "type": "array", + "items": { + "type": "string" + } + }, + "type": { + "type": "string", + "enum": [ + "internal_ntp" + ] + } + } + }, + { + "type": "object", + "required": [ + "external_dns_servers", + "external_ip", + "external_tls", + "internal_address", + "nic", + "type" + ], + "properties": { + "external_dns_servers": { + "description": "External DNS servers Nexus can use to resolve external hosts.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + }, + "external_ip": { + "description": "The address at which the external nexus server is reachable.", + "allOf": [ + { + "$ref": "#/definitions/OmicronZoneExternalFloatingIp" 
+ } + ] + }, + "external_tls": { + "description": "Whether Nexus's external endpoint should use TLS", + "type": "boolean" + }, + "internal_address": { + "description": "The address at which the internal nexus server is reachable.", + "type": "string" + }, + "nic": { + "description": "The service vNIC providing external connectivity using OPTE.", + "allOf": [ + { + "$ref": "#/definitions/NetworkInterface" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "nexus" + ] + } + } + }, + { + "type": "object", + "required": [ + "address", + "type" + ], + "properties": { + "address": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "oximeter" + ] + } + } + } + ] + }, + "DiskIdentity": { + "description": "Uniquely identifies a disk.", + "type": "object", + "required": [ + "model", + "serial", + "vendor" + ], + "properties": { + "model": { + "type": "string" + }, + "serial": { + "type": "string" + }, + "vendor": { + "type": "string" + } + } + }, + "DnsConfigParams": { + "description": "DnsConfigParams\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"generation\", \"time_created\", \"zones\" ], \"properties\": { \"generation\": { \"type\": \"integer\", \"format\": \"uint64\", \"minimum\": 0.0 }, \"time_created\": { \"type\": \"string\", \"format\": \"date-time\" }, \"zones\": { \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/DnsConfigZone\" } } } } ```
", + "type": "object", + "required": [ + "generation", + "time_created", + "zones" + ], + "properties": { + "generation": { + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "time_created": { + "type": "string", + "format": "date-time" + }, + "zones": { + "type": "array", + "items": { + "$ref": "#/definitions/DnsConfigZone" + } + } + } + }, + "DnsConfigZone": { + "description": "DnsConfigZone\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"records\", \"zone_name\" ], \"properties\": { \"records\": { \"type\": \"object\", \"additionalProperties\": { \"type\": \"array\", \"items\": { \"$ref\": \"#/components/schemas/DnsRecord\" } } }, \"zone_name\": { \"type\": \"string\" } } } ```
", + "type": "object", + "required": [ + "records", + "zone_name" + ], + "properties": { + "records": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "$ref": "#/definitions/DnsRecord" + } + } + }, + "zone_name": { + "type": "string" + } + } + }, + "DnsRecord": { + "description": "DnsRecord\n\n
JSON schema\n\n```json { \"oneOf\": [ { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"type\": \"string\", \"format\": \"ipv4\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"A\" ] } } }, { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"type\": \"string\", \"format\": \"ipv6\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"AAAA\" ] } } }, { \"type\": \"object\", \"required\": [ \"data\", \"type\" ], \"properties\": { \"data\": { \"$ref\": \"#/components/schemas/Srv\" }, \"type\": { \"type\": \"string\", \"enum\": [ \"SRV\" ] } } } ] } ```
", + "oneOf": [ + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "type": "string", + "format": "ipv4" + }, + "type": { + "type": "string", + "enum": [ + "A" + ] + } + } + }, + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "type": "string", + "format": "ipv6" + }, + "type": { + "type": "string", + "enum": [ + "AAAA" + ] + } + } + }, + { + "type": "object", + "required": [ + "data", + "type" + ], + "properties": { + "data": { + "$ref": "#/definitions/Srv" + }, + "type": { + "type": "string", + "enum": [ + "SRV" + ] + } + } + } + ] + }, + "Generation": { + "description": "Generation numbers stored in the database, used for optimistic concurrency control", + "type": "integer", + "format": "uint64", + "minimum": 0.0 + }, + "IpNet": { + "oneOf": [ + { + "title": "v4", + "allOf": [ + { + "$ref": "#/definitions/Ipv4Net" + } + ] + }, + { + "title": "v6", + "allOf": [ + { + "$ref": "#/definitions/Ipv6Net" + } + ] + } + ], + "x-rust-type": { + "crate": "oxnet", + "path": "oxnet::IpNet", + "version": "0.1.0" + } + }, + "Ipv4Net": { + "title": "An IPv4 subnet", + "description": "An IPv4 subnet, including prefix and prefix length", + "examples": [ + "192.168.1.0/24" + ], + "type": "string", + "pattern": "^(([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\\.){3}([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])/([0-9]|1[0-9]|2[0-9]|3[0-2])$", + "x-rust-type": { + "crate": "oxnet", + "path": "oxnet::Ipv4Net", + "version": "0.1.0" + } + }, + "Ipv6Net": { + "title": "An IPv6 subnet", + "description": "An IPv6 subnet, including prefix and subnet mask", + "examples": [ + "fd12:3456::/64" + ], + "type": "string", + "pattern": "^(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))\\/([0-9]|[1-9][0-9]|1[0-1][0-9]|12[0-8])$", + "x-rust-type": { + "crate": "oxnet", + "path": "oxnet::Ipv6Net", + "version": "0.1.0" + } + }, + "MacAddr": { + "title": "A MAC address", + "description": "A Media Access Control address, in EUI-48 format", + "examples": [ + "ff:ff:ff:ff:ff:ff" + ], + "type": "string", + "maxLength": 17, + "minLength": 5, + "pattern": "^([0-9a-fA-F]{0,2}:){5}[0-9a-fA-F]{0,2}$" + }, + "Name": { + "title": "A name unique within the parent collection", + "description": "Names must begin with a lower case ASCII letter, be composed exclusively of lowercase ASCII, uppercase ASCII, numbers, and '-', and may not end with a '-'. Names cannot be a UUID, but they may contain a UUID. 
They can be at most 63 characters long.", + "type": "string", + "maxLength": 63, + "minLength": 1, + "pattern": "^(?![0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$)^[a-z]([a-zA-Z0-9-]*[a-zA-Z0-9]+)?$" + }, + "NetworkInterface": { + "description": "Information required to construct a virtual network interface", + "type": "object", + "required": [ + "id", + "ip", + "kind", + "mac", + "name", + "primary", + "slot", + "subnet", + "vni" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "ip": { + "type": "string", + "format": "ip" + }, + "kind": { + "$ref": "#/definitions/NetworkInterfaceKind" + }, + "mac": { + "$ref": "#/definitions/MacAddr" + }, + "name": { + "$ref": "#/definitions/Name" + }, + "primary": { + "type": "boolean" + }, + "slot": { + "type": "integer", + "format": "uint8", + "minimum": 0.0 + }, + "subnet": { + "$ref": "#/definitions/IpNet" + }, + "transit_ips": { + "default": [], + "type": "array", + "items": { + "$ref": "#/definitions/IpNet" + } + }, + "vni": { + "$ref": "#/definitions/Vni" + } + } + }, + "NetworkInterfaceKind": { + "description": "The type of network interface", + "oneOf": [ + { + "description": "A vNIC attached to a guest instance", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "instance" + ] + } + } + }, + { + "description": "A vNIC associated with an internal service", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "service" + ] + } + } + }, + { + "description": "A vNIC associated with a probe", + "type": "object", + "required": [ + "id", + "type" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "type": { + "type": "string", + "enum": [ + "probe" + ] + } + } + } + ] + }, + "OmicronPhysicalDiskConfig": { + "type": "object", + "required": [ + "id", + "identity", + "pool_id" + ], + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "identity": { + "$ref": "#/definitions/DiskIdentity" + }, + "pool_id": { + "$ref": "#/definitions/TypedUuidForZpoolKind" + } + } + }, + "OmicronPhysicalDisksConfig": { + "type": "object", + "required": [ + "disks", + "generation" + ], + "properties": { + "disks": { + "type": "array", + "items": { + "$ref": "#/definitions/OmicronPhysicalDiskConfig" + } + }, + "generation": { + "description": "generation number of this configuration\n\nThis generation number is owned by the control plane (i.e., RSS or Nexus, depending on whether RSS-to-Nexus handoff has happened). 
It should not be bumped within Sled Agent.\n\nSled Agent rejects attempts to set the configuration to a generation older than the one it's currently running.", + "allOf": [ + { + "$ref": "#/definitions/Generation" + } + ] + } + } + }, + "OmicronZoneDataset": { + "description": "Describes a persistent ZFS dataset associated with an Omicron zone", + "type": "object", + "required": [ + "pool_name" + ], + "properties": { + "pool_name": { + "$ref": "#/definitions/ZpoolName" + } + } + }, + "OmicronZoneExternalFloatingAddr": { + "description": "Floating external address with port allocated to an Omicron-managed zone.", + "type": "object", + "required": [ + "addr", + "id" + ], + "properties": { + "addr": { + "type": "string" + }, + "id": { + "$ref": "#/definitions/TypedUuidForExternalIpKind" + } + } + }, + "OmicronZoneExternalFloatingIp": { + "description": "Floating external IP allocated to an Omicron-managed zone.\n\nThis is a slimmer `nexus_db_model::ExternalIp` that only stores the fields necessary for blueprint planning, and requires that the zone have a single IP.", + "type": "object", + "required": [ + "id", + "ip" + ], + "properties": { + "id": { + "$ref": "#/definitions/TypedUuidForExternalIpKind" + }, + "ip": { + "type": "string", + "format": "ip" + } + } + }, + "OmicronZoneExternalSnatIp": { + "description": "SNAT (outbound) external IP allocated to an Omicron-managed zone.\n\nThis is a slimmer `nexus_db_model::ExternalIp` that only stores the fields necessary for blueprint planning, and requires that the zone have a single IP.", + "type": "object", + "required": [ + "id", + "snat_cfg" + ], + "properties": { + "id": { + "$ref": "#/definitions/TypedUuidForExternalIpKind" + }, + "snat_cfg": { + "$ref": "#/definitions/SourceNatConfig" + } + } + }, + "SledConfig": { + "type": "object", + "required": [ + "disks", + "zones" + ], + "properties": { + "disks": { + "description": "Control plane disks configured for this sled", + "allOf": [ + { + "$ref": "#/definitions/OmicronPhysicalDisksConfig" + } + ] + }, + "zones": { + "description": "zones configured for this sled", + "type": "array", + "items": { + "$ref": "#/definitions/BlueprintZoneConfig" + } + } + } + }, + "SourceNatConfig": { + "description": "An IP address and port range used for source NAT, i.e., making outbound network connections from guests or services.", + "type": "object", + "required": [ + "first_port", + "ip", + "last_port" + ], + "properties": { + "first_port": { + "description": "The first port used for source NAT, inclusive.", + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "ip": { + "description": "The external address provided to the instance or service.", + "type": "string", + "format": "ip" + }, + "last_port": { + "description": "The last port used for source NAT, also inclusive.", + "type": "integer", + "format": "uint16", + "minimum": 0.0 + } + } + }, + "Srv": { + "description": "Srv\n\n
JSON schema\n\n```json { \"type\": \"object\", \"required\": [ \"port\", \"prio\", \"target\", \"weight\" ], \"properties\": { \"port\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 }, \"prio\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 }, \"target\": { \"type\": \"string\" }, \"weight\": { \"type\": \"integer\", \"format\": \"uint16\", \"minimum\": 0.0 } } } ```
", + "type": "object", + "required": [ + "port", + "prio", + "target", + "weight" + ], + "properties": { + "port": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "prio": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + }, + "target": { + "type": "string" + }, + "weight": { + "type": "integer", + "format": "uint16", + "minimum": 0.0 + } + } + }, + "TypedUuidForExternalIpKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForOmicronZoneKind": { + "type": "string", + "format": "uuid" + }, + "TypedUuidForZpoolKind": { + "type": "string", + "format": "uuid" + }, + "Vni": { + "description": "A Geneve Virtual Network Identifier", + "type": "integer", + "format": "uint32", + "minimum": 0.0 + }, + "ZpoolName": { + "title": "The name of a Zpool", + "description": "Zpool names are of the format ox{i,p}_. They are either Internal or External, and should be unique", + "type": "string", + "pattern": "^ox[ip]_[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$" + } + } +} \ No newline at end of file diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index c44b24d712..410747bf46 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -15,7 +15,7 @@ use nexus_sled_agent_shared::inventory::{ }; use omicron_common::{ api::internal::{ - nexus::{DiskRuntimeState, SledInstanceState, UpdateArtifactId}, + nexus::{DiskRuntimeState, SledVmmState, UpdateArtifactId}, shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, SwitchPorts, VirtualNetworkInterfaceHost, @@ -23,7 +23,7 @@ use omicron_common::{ }, disk::{DiskVariant, DisksManagementResult, OmicronPhysicalDisksConfig}, }; -use omicron_uuid_kinds::{InstanceUuid, ZpoolUuid}; +use omicron_uuid_kinds::{PropolisUuid, ZpoolUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_agent_types::{ @@ -36,8 +36,8 @@ use sled_agent_types::{ early_networking::EarlyNetworkConfig, firewall_rules::VpcFirewallRulesEnsureBody, instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, + InstanceEnsureBody, InstanceExternalIpBody, VmmPutStateBody, + VmmPutStateResponse, VmmUnregisterResponse, }, sled::AddSledRequest, time_sync::TimeSync, @@ -212,59 +212,59 @@ pub trait SledAgentApi { #[endpoint { method = PUT, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] - async fn instance_register( + async fn vmm_register( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, - ) -> Result, HttpError>; + ) -> Result, HttpError>; #[endpoint { method = DELETE, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] - async fn instance_unregister( + async fn vmm_unregister( rqctx: RequestContext, - path_params: Path, - ) -> Result, HttpError>; + path_params: Path, + ) -> Result, HttpError>; #[endpoint { method = PUT, - path = "/instances/{instance_id}/state", + path = "/vmms/{propolis_id}/state", }] - async fn instance_put_state( + async fn vmm_put_state( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result, HttpError>; + path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; #[endpoint { method = GET, - path = "/instances/{instance_id}/state", + path = "/vmms/{propolis_id}/state", }] - async fn instance_get_state( + async fn vmm_get_state( rqctx: RequestContext, - path_params: Path, - ) -> Result, HttpError>; + path_params: Path, + ) -> Result, HttpError>; #[endpoint { method = 
PUT, - path = "/instances/{instance_id}/external-ip", + path = "/vmms/{propolis_id}/external-ip", }] - async fn instance_put_external_ip( + async fn vmm_put_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result; #[endpoint { method = DELETE, - path = "/instances/{instance_id}/external-ip", + path = "/vmms/{propolis_id}/external-ip", }] - async fn instance_delete_external_ip( + async fn vmm_delete_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result; @@ -290,16 +290,13 @@ pub trait SledAgentApi { /// Take a snapshot of a disk that is attached to an instance #[endpoint { method = POST, - path = "/instances/{instance_id}/disks/{disk_id}/snapshot", + path = "/vmms/{propolis_id}/disks/{disk_id}/snapshot", }] - async fn instance_issue_disk_snapshot_request( + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result< - HttpResponseOk, - HttpError, - >; + path_params: Path, + body: TypedBody, + ) -> Result, HttpError>; #[endpoint { method = PUT, @@ -516,8 +513,8 @@ impl From for DiskType { /// Path parameters for Instance requests (sled agent API) #[derive(Deserialize, JsonSchema)] -pub struct InstancePathParam { - pub instance_id: InstanceUuid, +pub struct VmmPathParam { + pub propolis_id: PropolisUuid, } /// Path parameters for Disk requests (sled agent API) @@ -527,18 +524,18 @@ pub struct DiskPathParam { } #[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestPathParam { - pub instance_id: Uuid, +pub struct VmmIssueDiskSnapshotRequestPathParam { + pub propolis_id: PropolisUuid, pub disk_id: Uuid, } #[derive(Deserialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestBody { +pub struct VmmIssueDiskSnapshotRequestBody { pub snapshot_id: Uuid, } #[derive(Serialize, JsonSchema)] -pub struct InstanceIssueDiskSnapshotRequestResponse { +pub struct VmmIssueDiskSnapshotRequestResponse { pub snapshot_id: Uuid, } diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index adbeb9158f..f95bf0cb64 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -7,10 +7,9 @@ use chrono::{DateTime, Utc}; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ - MigrationRuntimeState, MigrationState, SledInstanceState, VmmRuntimeState, + MigrationRuntimeState, MigrationState, SledVmmState, VmmRuntimeState, VmmState, }; -use omicron_uuid_kinds::PropolisUuid; use propolis_client::types::{ InstanceMigrationStatus, InstanceState as PropolisApiState, InstanceStateMonitorResponse, MigrationState as PropolisMigrationState, @@ -21,7 +20,6 @@ use uuid::Uuid; #[derive(Clone, Debug)] pub struct InstanceStates { vmm: VmmRuntimeState, - propolis_id: PropolisUuid, migration_in: Option, migration_out: Option, } @@ -173,11 +171,7 @@ pub enum Action { } impl InstanceStates { - pub fn new( - vmm: VmmRuntimeState, - propolis_id: PropolisUuid, - migration_id: Option, - ) -> Self { + pub fn new(vmm: VmmRuntimeState, migration_id: Option) -> Self { // If this instance is created with a migration ID, we are the intended // target of a migration in. Set that up now. 
let migration_in = @@ -187,17 +181,13 @@ impl InstanceStates { gen: Generation::new(), time_updated: Utc::now(), }); - InstanceStates { vmm, propolis_id, migration_in, migration_out: None } + InstanceStates { vmm, migration_in, migration_out: None } } pub fn vmm(&self) -> &VmmRuntimeState { &self.vmm } - pub fn propolis_id(&self) -> PropolisUuid { - self.propolis_id - } - pub fn migration_in(&self) -> Option<&MigrationRuntimeState> { self.migration_in.as_ref() } @@ -209,10 +199,9 @@ impl InstanceStates { /// Creates a `SledInstanceState` structure containing the entirety of this /// structure's runtime state. This requires cloning; for simple read access /// use the `instance` or `vmm` accessors instead. - pub fn sled_instance_state(&self) -> SledInstanceState { - SledInstanceState { + pub fn sled_instance_state(&self) -> SledVmmState { + SledVmmState { vmm_state: self.vmm.clone(), - propolis_id: self.propolis_id, migration_in: self.migration_in.clone(), migration_out: self.migration_out.clone(), } @@ -377,7 +366,6 @@ mod test { use uuid::Uuid; fn make_instance() -> InstanceStates { - let propolis_id = PropolisUuid::new_v4(); let now = Utc::now(); let vmm = VmmRuntimeState { @@ -386,7 +374,7 @@ mod test { time_updated: now, }; - InstanceStates::new(vmm, propolis_id, None) + InstanceStates::new(vmm, None) } fn make_migration_source_instance() -> InstanceStates { @@ -406,7 +394,6 @@ mod test { } fn make_migration_target_instance() -> InstanceStates { - let propolis_id = PropolisUuid::new_v4(); let now = Utc::now(); let vmm = VmmRuntimeState { @@ -415,7 +402,7 @@ mod test { time_updated: now, }; - InstanceStates::new(vmm, propolis_id, Some(Uuid::new_v4())) + InstanceStates::new(vmm, Some(Uuid::new_v4())) } fn make_observed_state( diff --git a/sled-agent/src/fakes/nexus.rs b/sled-agent/src/fakes/nexus.rs index 246ef07b60..bd4680563e 100644 --- a/sled-agent/src/fakes/nexus.rs +++ b/sled-agent/src/fakes/nexus.rs @@ -15,12 +15,11 @@ use hyper::Body; use internal_dns::ServiceName; use nexus_client::types::SledAgentInfo; use omicron_common::api::external::Error; -use omicron_common::api::internal::nexus::{ - SledInstanceState, UpdateArtifactId, -}; -use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_common::api::internal::nexus::{SledVmmState, UpdateArtifactId}; +use omicron_uuid_kinds::{OmicronZoneUuid, PropolisUuid}; use schemars::JsonSchema; use serde::Deserialize; +use sled_agent_api::VmmPathParam; use uuid::Uuid; /// Implements a fake Nexus. 
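For illustration only (not part of this change): the next hunk reworks the fake Nexus trait so `cpapi_instances_put` is keyed by a Propolis ID and receives a `SledVmmState`. A minimal sketch of how a test-side implementation of that hook could look, assuming it sits alongside `FakeNexusServer` in sled-agent/src/fakes/nexus.rs; the `RecordingNexus` type and its `observed` field are hypothetical names invented for this sketch.

// Hypothetical sketch, not part of this diff.
use std::sync::Mutex;

use omicron_common::api::external::Error;
use omicron_common::api::internal::nexus::SledVmmState;
use omicron_uuid_kinds::PropolisUuid;

#[derive(Default)]
struct RecordingNexus {
    // Latest (Propolis ID, VMM state) pair reported by sled agent.
    observed: Mutex<Option<(PropolisUuid, SledVmmState)>>,
}

impl FakeNexusServer for RecordingNexus {
    fn cpapi_instances_put(
        &self,
        propolis_id: PropolisUuid,
        new_runtime_state: SledVmmState,
    ) -> Result<(), Error> {
        // Record the most recent state report so a test can assert on it later.
        *self.observed.lock().unwrap() = Some((propolis_id, new_runtime_state));
        Ok(())
    }
}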
@@ -50,8 +49,8 @@ pub trait FakeNexusServer: Send + Sync { fn cpapi_instances_put( &self, - _instance_id: Uuid, - _new_runtime_state: SledInstanceState, + _propolis_id: PropolisUuid, + _new_runtime_state: SledVmmState, ) -> Result<(), Error> { Err(Error::internal_error("Not implemented")) } @@ -118,22 +117,18 @@ async fn sled_agent_put( Ok(HttpResponseUpdatedNoContent()) } -#[derive(Deserialize, JsonSchema)] -struct InstancePathParam { - instance_id: Uuid, -} #[endpoint { method = PUT, - path = "/instances/{instance_id}", + path = "/vmms/{propolis_id}", }] async fn cpapi_instances_put( request_context: RequestContext, - path_params: Path, - new_runtime_state: TypedBody, + path_params: Path, + new_runtime_state: TypedBody, ) -> Result { let context = request_context.context(); context.cpapi_instances_put( - path_params.into_inner().instance_id, + path_params.into_inner().propolis_id, new_runtime_state.into_inner(), )?; Ok(HttpResponseUpdatedNoContent()) diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 2bf8067d1c..221224a2e9 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -21,7 +21,7 @@ use nexus_sled_agent_shared::inventory::{ }; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::{ - DiskRuntimeState, SledInstanceState, UpdateArtifactId, + DiskRuntimeState, SledVmmState, UpdateArtifactId, }; use omicron_common::api::internal::shared::{ ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, SwitchPorts, @@ -30,7 +30,6 @@ use omicron_common::api::internal::shared::{ use omicron_common::disk::{ DiskVariant, DisksManagementResult, M2Slot, OmicronPhysicalDisksConfig, }; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use sled_agent_api::*; use sled_agent_types::boot_disk::{ BootDiskOsWriteStatus, BootDiskPathParams, BootDiskUpdatePathParams, @@ -41,8 +40,8 @@ use sled_agent_types::disk::DiskEnsureBody; use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, + InstanceEnsureBody, InstanceExternalIpBody, VmmPutStateBody, + VmmPutStateResponse, VmmUnregisterResponse, }; use sled_agent_types::sled::AddSledRequest; use sled_agent_types::time_sync::TimeSync; @@ -294,18 +293,18 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseUpdatedNoContent()) } - async fn instance_register( + async fn vmm_register( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let propolis_id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); Ok(HttpResponseOk( sa.instance_ensure_registered( - instance_id, - body_args.propolis_id, + body_args.instance_id, + propolis_id, body_args.hardware, body_args.instance_runtime, body_args.vmm_runtime, @@ -316,58 +315,56 @@ impl SledAgentApi for SledAgentImpl { )) } - async fn instance_unregister( + async fn vmm_unregister( rqctx: RequestContext, - path_params: Path, - ) -> Result, HttpError> { + path_params: Path, + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_ensure_unregistered(instance_id).await?)) + let id = 
path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_ensure_unregistered(id).await?)) } - async fn instance_put_state( + async fn vmm_put_state( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result, HttpError> { + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_ensure_state(instance_id, body_args.state).await?, - )) + Ok(HttpResponseOk(sa.instance_ensure_state(id, body_args.state).await?)) } - async fn instance_get_state( + async fn vmm_get_state( rqctx: RequestContext, - path_params: Path, - ) -> Result, HttpError> { + path_params: Path, + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) + let id = path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_get_state(id).await?)) } - async fn instance_put_external_ip( + async fn vmm_put_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_put_external_ip(instance_id, &body_args).await?; + sa.instance_put_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } - async fn instance_delete_external_ip( + async fn vmm_delete_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_delete_external_ip(instance_id, &body_args).await?; + sa.instance_delete_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } @@ -399,26 +396,24 @@ impl SledAgentApi for SledAgentImpl { Ok(HttpResponseUpdatedNoContent()) } - async fn instance_issue_disk_snapshot_request( + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result< - HttpResponseOk, - HttpError, - > { + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> + { let sa = rqctx.context(); let path_params = path_params.into_inner(); let body = body.into_inner(); - sa.instance_issue_disk_snapshot_request( - InstanceUuid::from_untyped_uuid(path_params.instance_id), + sa.vmm_issue_disk_snapshot_request( + path_params.propolis_id, path_params.disk_id, body.snapshot_id, ) .await?; - Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { + Ok(HttpResponseOk(VmmIssueDiskSnapshotRequestResponse { snapshot_id: body.snapshot_id, })) } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 0bcbc97fd2..33b2d0cf67 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -25,14 +25,13 @@ use illumos_utils::opte::{DhcpCfg, PortCreateParams, PortManager}; use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory}; use illumos_utils::svc::wait_for_service; use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; -use omicron_common::api::internal::nexus::{ - SledInstanceState, VmmRuntimeState, -}; +use omicron_common::api::internal::nexus::{SledVmmState, VmmRuntimeState}; use omicron_common::api::internal::shared::{ 
NetworkInterface, ResolvedVpcFirewallRule, SledIdentifiers, SourceNatConfig, }; use omicron_common::backoff; use omicron_common::zpool_name::ZpoolName; +use omicron_common::NoDebug; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; use propolis_client::Client as PropolisClient; use rand::prelude::IteratorRandom; @@ -104,11 +103,11 @@ pub enum Error { #[error("Error resolving DNS name: {0}")] ResolveError(#[from] internal_dns::resolver::ResolveError), - #[error("Instance {0} not running!")] - InstanceNotRunning(InstanceUuid), + #[error("Propolis job with ID {0} is registered but not running")] + VmNotRunning(PropolisUuid), - #[error("Instance already registered with Propolis ID {0}")] - InstanceAlreadyRegistered(PropolisUuid), + #[error("Propolis job with ID {0} already registered")] + PropolisAlreadyRegistered(PropolisUuid), #[error("No U.2 devices found")] U2NotFound, @@ -217,15 +216,15 @@ enum InstanceRequest { tx: oneshot::Sender>, }, CurrentState { - tx: oneshot::Sender, + tx: oneshot::Sender, }, PutState { - state: InstanceStateRequested, - tx: oneshot::Sender>, + state: VmmStateRequested, + tx: oneshot::Sender>, }, Terminate { mark_failed: bool, - tx: oneshot::Sender>, + tx: oneshot::Sender>, }, IssueSnapshotRequest { disk_id: Uuid, @@ -337,7 +336,7 @@ struct InstanceRunner { // Disk related properties requested_disks: Vec, - cloud_init_bytes: Option, + cloud_init_bytes: Option>, // Internal State management state: InstanceStates, @@ -414,12 +413,12 @@ impl InstanceRunner { }, Some(PutState{ state, tx }) => { tx.send(self.put_state(state).await - .map(|r| InstancePutStateResponse { updated_runtime: Some(r) }) + .map(|r| VmmPutStateResponse { updated_runtime: Some(r) }) .map_err(|e| e.into())) .map_err(|_| Error::FailedSendClientClosed) }, Some(Terminate { mark_failed, tx }) => { - tx.send(Ok(InstanceUnregisterResponse { + tx.send(Ok(VmmUnregisterResponse { updated_runtime: Some(self.terminate(mark_failed).await) })) .map_err(|_| Error::FailedSendClientClosed) @@ -499,15 +498,10 @@ impl InstanceRunner { } /// Yields this instance's ID. - fn id(&self) -> InstanceUuid { + fn instance_id(&self) -> InstanceUuid { InstanceUuid::from_untyped_uuid(self.properties.id) } - /// Yields this instance's Propolis's ID. - fn propolis_id(&self) -> &PropolisUuid { - &self.propolis_id - } - async fn publish_state_to_nexus(&self) { // Retry until Nexus acknowledges that it has applied this state update. 
// Note that Nexus may receive this call but then fail while reacting @@ -518,15 +512,13 @@ impl InstanceRunner { || async { let state = self.state.sled_instance_state(); info!(self.log, "Publishing instance state update to Nexus"; - "instance_id" => %self.id(), + "instance_id" => %self.instance_id(), + "propolis_id" => %self.propolis_id, "state" => ?state, ); self.nexus_client - .cpapi_instances_put( - &self.id().into_untyped_uuid(), - &state.into(), - ) + .cpapi_instances_put(&self.propolis_id, &state.into()) .await .map_err(|err| -> backoff::BackoffError { match &err { @@ -576,7 +568,8 @@ impl InstanceRunner { warn!(self.log, "Failed to publish instance state to Nexus: {}", err.to_string(); - "instance_id" => %self.id(), + "instance_id" => %self.instance_id(), + "propolis_id" => %self.propolis_id, "retry_after" => ?delay); }, ) @@ -586,7 +579,8 @@ impl InstanceRunner { error!( self.log, "Failed to publish state to Nexus, will not retry: {:?}", e; - "instance_id" => %self.id() + "instance_id" => %self.instance_id(), + "propolis_id" => %self.propolis_id, ); } } @@ -622,7 +616,7 @@ impl InstanceRunner { info!( self.log, "updated state after observing Propolis state change"; - "propolis_id" => %self.state.propolis_id(), + "propolis_id" => %self.propolis_id, "new_vmm_state" => ?self.state.vmm() ); @@ -634,7 +628,8 @@ impl InstanceRunner { match action { Some(InstanceAction::Destroy) => { info!(self.log, "terminating VMM that has exited"; - "instance_id" => %self.id()); + "instance_id" => %self.instance_id(), + "propolis_id" => %self.propolis_id); let mark_failed = false; self.terminate(mark_failed).await; Reaction::Terminate @@ -724,10 +719,10 @@ impl InstanceRunner { .map(Into::into) .collect(), migrate, - cloud_init_bytes: self.cloud_init_bytes.clone(), + cloud_init_bytes: self.cloud_init_bytes.clone().map(|x| x.0), }; - info!(self.log, "Sending ensure request to propolis: {:?}", request); + debug!(self.log, "Sending ensure request to propolis: {:?}", request); let result = client.instance_ensure().body(request).send().await; info!(self.log, "result of instance_ensure call is {:?}", result); result?; @@ -780,7 +775,7 @@ impl InstanceRunner { /// This routine is safe to call even if the instance's zone was never /// started. It is also safe to call multiple times on a single instance. async fn terminate_inner(&mut self) { - let zname = propolis_zone_name(self.propolis_id()); + let zname = propolis_zone_name(&self.propolis_id); // First fetch the running state. // @@ -948,8 +943,10 @@ impl InstanceRunner { } } -/// A reference to a single instance running a running Propolis server. +/// Describes a single Propolis server that incarnates a specific instance. pub struct Instance { + id: InstanceUuid, + tx: mpsc::Sender, #[allow(dead_code)] @@ -1091,7 +1088,7 @@ impl Instance { dhcp_config, requested_disks: hardware.disks, cloud_init_bytes: hardware.cloud_init_bytes, - state: InstanceStates::new(vmm_runtime, propolis_id, migration_id), + state: InstanceStates::new(vmm_runtime, migration_id), running_state: None, nexus_client, storage, @@ -1104,7 +1101,11 @@ impl Instance { let runner_handle = tokio::task::spawn(async move { runner.run().await }); - Ok(Instance { tx, runner_handle }) + Ok(Instance { id, tx, runner_handle }) + } + + pub fn id(&self) -> InstanceUuid { + self.id } /// Create bundle from an instance zone. @@ -1130,7 +1131,7 @@ impl Instance { Ok(rx.await?) 
} - pub async fn current_state(&self) -> Result { + pub async fn current_state(&self) -> Result { let (tx, rx) = oneshot::channel(); self.tx .send(InstanceRequest::CurrentState { tx }) @@ -1152,8 +1153,8 @@ impl Instance { /// Rebooting to Running to Stopping to Stopped. pub async fn put_state( &self, - tx: oneshot::Sender>, - state: InstanceStateRequested, + tx: oneshot::Sender>, + state: VmmStateRequested, ) -> Result<(), Error> { self.tx .send(InstanceRequest::PutState { state, tx }) @@ -1166,7 +1167,7 @@ impl Instance { /// immediately transitions the instance to the Destroyed state. pub async fn terminate( &self, - tx: oneshot::Sender>, + tx: oneshot::Sender>, mark_failed: bool, ) -> Result<(), Error> { self.tx @@ -1224,7 +1225,7 @@ impl InstanceRunner { async fn request_zone_bundle( &self, ) -> Result { - let name = propolis_zone_name(self.propolis_id()); + let name = propolis_zone_name(&self.propolis_id); match &self.running_state { None => Err(BundleError::Unavailable { name }), Some(RunningState { ref running_zone, .. }) => { @@ -1242,7 +1243,7 @@ impl InstanceRunner { run_state.running_zone.root_zpool().map(|p| p.clone()) } - fn current_state(&self) -> SledInstanceState { + fn current_state(&self) -> SledVmmState { self.state.sled_instance_state() } @@ -1300,19 +1301,19 @@ impl InstanceRunner { async fn put_state( &mut self, - state: InstanceStateRequested, - ) -> Result { + state: VmmStateRequested, + ) -> Result { use propolis_client::types::InstanceStateRequested as PropolisRequest; let (propolis_state, next_published) = match state { - InstanceStateRequested::MigrationTarget(migration_params) => { + VmmStateRequested::MigrationTarget(migration_params) => { self.propolis_ensure(Some(migration_params)).await?; (None, None) } - InstanceStateRequested::Running => { + VmmStateRequested::Running => { self.propolis_ensure(None).await?; (Some(PropolisRequest::Run), None) } - InstanceStateRequested::Stopped => { + VmmStateRequested::Stopped => { // If the instance has not started yet, unregister it // immediately. Since there is no Propolis to push updates when // this happens, generate an instance record bearing the @@ -1328,9 +1329,9 @@ impl InstanceRunner { ) } } - InstanceStateRequested::Reboot => { + VmmStateRequested::Reboot => { if self.running_state.is_none() { - return Err(Error::InstanceNotRunning(self.id())); + return Err(Error::VmNotRunning(self.propolis_id)); } ( Some(PropolisRequest::Reboot), @@ -1379,7 +1380,7 @@ impl InstanceRunner { // Create a zone for the propolis instance, using the previously // configured VNICs. 
- let zname = propolis_zone_name(self.propolis_id()); + let zname = propolis_zone_name(&self.propolis_id); let mut rng = rand::rngs::StdRng::from_entropy(); let latest_disks = self .storage @@ -1399,7 +1400,7 @@ impl InstanceRunner { .with_zone_root_path(root) .with_zone_image_paths(&["/opt/oxide".into()]) .with_zone_type("propolis-server") - .with_unique_name(self.propolis_id().into_untyped_uuid()) + .with_unique_name(self.propolis_id.into_untyped_uuid()) .with_datasets(&[]) .with_filesystems(&[]) .with_data_links(&[]) @@ -1483,7 +1484,7 @@ impl InstanceRunner { Ok(PropolisSetup { client, running_zone }) } - async fn terminate(&mut self, mark_failed: bool) -> SledInstanceState { + async fn terminate(&mut self, mark_failed: bool) -> SledVmmState { self.terminate_inner().await; self.state.terminate_rudely(mark_failed); @@ -1508,9 +1509,7 @@ impl InstanceRunner { Ok(()) } else { - Err(Error::InstanceNotRunning(InstanceUuid::from_untyped_uuid( - self.properties.id, - ))) + Err(Error::VmNotRunning(self.propolis_id)) } } @@ -1604,7 +1603,7 @@ mod tests { enum ReceivedInstanceState { #[default] None, - InstancePut(SledInstanceState), + InstancePut(SledVmmState), } struct NexusServer { @@ -1614,8 +1613,8 @@ mod tests { impl FakeNexusServer for NexusServer { fn cpapi_instances_put( &self, - _instance_id: Uuid, - new_runtime_state: SledInstanceState, + _propolis_id: PropolisUuid, + new_runtime_state: SledVmmState, ) -> Result<(), omicron_common::api::external::Error> { self.observed_runtime_state .send(ReceivedInstanceState::InstancePut(new_runtime_state)) @@ -1760,7 +1759,7 @@ mod tests { let id = InstanceUuid::new_v4(); let propolis_id = PropolisUuid::from_untyped_uuid(PROPOLIS_ID); - let ticket = InstanceTicket::new_without_manager_for_test(id); + let ticket = InstanceTicket::new_without_manager_for_test(propolis_id); let initial_state = fake_instance_initial_state(propolis_addr); @@ -1917,7 +1916,7 @@ mod tests { // pretending we're InstanceManager::ensure_state, start our "instance" // (backed by fakes and propolis_mock_server) - inst.put_state(put_tx, InstanceStateRequested::Running) + inst.put_state(put_tx, VmmStateRequested::Running) .await .expect("failed to send Instance::put_state"); @@ -2011,7 +2010,7 @@ mod tests { // pretending we're InstanceManager::ensure_state, try in vain to start // our "instance", but no propolis server is running - inst.put_state(put_tx, InstanceStateRequested::Running) + inst.put_state(put_tx, VmmStateRequested::Running) .await .expect("failed to send Instance::put_state"); @@ -2025,7 +2024,7 @@ mod tests { .await .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); - if let ReceivedInstanceState::InstancePut(SledInstanceState { + if let ReceivedInstanceState::InstancePut(SledVmmState { vmm_state: VmmRuntimeState { state: VmmState::Running, .. }, .. 
}) = state_rx.borrow().to_owned() @@ -2118,7 +2117,7 @@ mod tests { // pretending we're InstanceManager::ensure_state, try in vain to start // our "instance", but the zone never finishes installing - inst.put_state(put_tx, InstanceStateRequested::Running) + inst.put_state(put_tx, VmmStateRequested::Running) .await .expect("failed to send Instance::put_state"); @@ -2133,7 +2132,7 @@ mod tests { .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); debug!(log, "Zone-boot timeout awaited"); - if let ReceivedInstanceState::InstancePut(SledInstanceState { + if let ReceivedInstanceState::InstancePut(SledVmmState { vmm_state: VmmRuntimeState { state: VmmState::Running, .. }, .. }) = state_rx.borrow().to_owned() @@ -2256,7 +2255,7 @@ mod tests { .await .unwrap(); - mgr.ensure_state(instance_id, InstanceStateRequested::Running) + mgr.ensure_state(propolis_id, VmmStateRequested::Running) .await .unwrap(); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 63164ed290..24be8be89f 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -4,13 +4,13 @@ //! API for controlling multiple instances on a sled. -use crate::instance::propolis_zone_name; use crate::instance::Instance; use crate::metrics::MetricsRequestQueue; use crate::nexus::NexusClient; use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; +use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; use omicron_common::api::external::ByteCount; use anyhow::anyhow; @@ -20,7 +20,7 @@ use illumos_utils::opte::PortManager; use illumos_utils::running_zone::ZoneBuilderFactory; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::InstanceRuntimeState; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_uuid_kinds::InstanceUuid; @@ -44,8 +44,8 @@ pub enum Error { #[error("Instance error: {0}")] Instance(#[from] crate::instance::Error), - #[error("No such instance ID: {0}")] - NoSuchInstance(InstanceUuid), + #[error("VMM with ID {0} not found")] + NoSuchVmm(PropolisUuid), #[error("OPTE port management error: {0}")] Opte(#[from] illumos_utils::opte::Error), @@ -117,7 +117,7 @@ impl InstanceManager { terminate_tx, terminate_rx, nexus_client, - instances: BTreeMap::new(), + jobs: BTreeMap::new(), vnic_allocator: VnicAllocator::new("Instance", etherstub), port_manager, storage_generation: None, @@ -150,7 +150,7 @@ impl InstanceManager { propolis_addr: SocketAddr, sled_identifiers: SledIdentifiers, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx @@ -172,13 +172,13 @@ impl InstanceManager { pub async fn ensure_unregistered( &self, - instance_id: InstanceUuid, - ) -> Result { + propolis_id: PropolisUuid, + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::EnsureUnregistered { - instance_id, + propolis_id, tx, }) .await @@ -188,14 +188,14 @@ impl InstanceManager { pub async fn ensure_state( &self, - instance_id: InstanceUuid, - target: InstanceStateRequested, - ) -> Result { + propolis_id: PropolisUuid, + target: VmmStateRequested, + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::EnsureState { - instance_id, + 
propolis_id, target, tx, }) @@ -206,31 +206,32 @@ impl InstanceManager { // these may involve a long-running zone creation, so avoid HTTP // request timeouts by decoupling the response // (see InstanceRunner::put_state) - InstanceStateRequested::MigrationTarget(_) - | InstanceStateRequested::Running => { + VmmStateRequested::MigrationTarget(_) + | VmmStateRequested::Running => { // We don't want the sending side of the channel to see an // error if we drop rx without awaiting it. // Since we don't care about the response here, we spawn rx // into a task which will await it for us in the background. tokio::spawn(rx); - Ok(InstancePutStateResponse { updated_runtime: None }) + Ok(VmmPutStateResponse { updated_runtime: None }) + } + VmmStateRequested::Stopped | VmmStateRequested::Reboot => { + rx.await? } - InstanceStateRequested::Stopped - | InstanceStateRequested::Reboot => rx.await?, } } - pub async fn instance_issue_disk_snapshot_request( + pub async fn issue_disk_snapshot_request( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::InstanceIssueDiskSnapshot { - instance_id, + .send(InstanceManagerRequest::IssueDiskSnapshot { + propolis_id, disk_id, snapshot_id, tx, @@ -259,14 +260,14 @@ impl InstanceManager { pub async fn add_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::InstanceAddExternalIp { - instance_id, + .send(InstanceManagerRequest::AddExternalIp { + propolis_id, ip: *ip, tx, }) @@ -277,14 +278,14 @@ impl InstanceManager { pub async fn delete_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::InstanceDeleteExternalIp { - instance_id, + .send(InstanceManagerRequest::DeleteExternalIp { + propolis_id, ip: *ip, tx, }) @@ -300,12 +301,12 @@ impl InstanceManager { pub async fn get_instance_state( &self, - instance_id: InstanceUuid, - ) -> Result { + propolis_id: PropolisUuid, + ) -> Result { let (tx, rx) = oneshot::channel(); self.inner .tx - .send(InstanceManagerRequest::GetState { instance_id, tx }) + .send(InstanceManagerRequest::GetState { propolis_id, tx }) .await .map_err(|_| Error::FailedSendInstanceManagerClosed)?; rx.await? @@ -351,20 +352,20 @@ enum InstanceManagerRequest { // reasonable choice... 
sled_identifiers: Box, metadata: InstanceMetadata, - tx: oneshot::Sender>, + tx: oneshot::Sender>, }, EnsureUnregistered { - instance_id: InstanceUuid, - tx: oneshot::Sender>, + propolis_id: PropolisUuid, + tx: oneshot::Sender>, }, EnsureState { - instance_id: InstanceUuid, - target: InstanceStateRequested, - tx: oneshot::Sender>, + propolis_id: PropolisUuid, + target: VmmStateRequested, + tx: oneshot::Sender>, }, - InstanceIssueDiskSnapshot { - instance_id: InstanceUuid, + IssueDiskSnapshot { + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, tx: oneshot::Sender>, @@ -373,19 +374,19 @@ enum InstanceManagerRequest { name: String, tx: oneshot::Sender>, }, - InstanceAddExternalIp { - instance_id: InstanceUuid, + AddExternalIp { + propolis_id: PropolisUuid, ip: InstanceExternalIpBody, tx: oneshot::Sender>, }, - InstanceDeleteExternalIp { - instance_id: InstanceUuid, + DeleteExternalIp { + propolis_id: PropolisUuid, ip: InstanceExternalIpBody, tx: oneshot::Sender>, }, GetState { - instance_id: InstanceUuid, - tx: oneshot::Sender>, + propolis_id: PropolisUuid, + tx: oneshot::Sender>, }, OnlyUseDisks { disks: AllDisks, @@ -396,7 +397,7 @@ enum InstanceManagerRequest { // Requests that the instance manager stop processing information about a // particular instance. struct InstanceDeregisterRequest { - id: InstanceUuid, + id: PropolisUuid, } struct InstanceManagerRunner { @@ -422,8 +423,8 @@ struct InstanceManagerRunner { // TODO: If we held an object representing an enum of "Created OR Running" // instance, we could avoid the methods within "instance.rs" that panic // if the Propolis client hasn't been initialized. - /// A mapping from a Sled Agent "Instance ID" to ("Propolis ID", [Instance]). - instances: BTreeMap, + /// A mapping from a Propolis ID to the [Instance] that Propolis incarnates. 
+ jobs: BTreeMap, vnic_allocator: VnicAllocator, port_manager: PortManager, @@ -451,7 +452,7 @@ impl InstanceManagerRunner { request = self.terminate_rx.recv() => { match request { Some(request) => { - self.instances.remove(&request.id); + self.jobs.remove(&request.id); }, None => { warn!(self.log, "InstanceManager's 'instance terminate' channel closed; shutting down"); @@ -484,31 +485,31 @@ impl InstanceManagerRunner { metadata ).await).map_err(|_| Error::FailedSendClientClosed) }, - Some(EnsureUnregistered { instance_id, tx }) => { - self.ensure_unregistered(tx, instance_id).await + Some(EnsureUnregistered { propolis_id, tx }) => { + self.ensure_unregistered(tx, propolis_id).await }, - Some(EnsureState { instance_id, target, tx }) => { - self.ensure_state(tx, instance_id, target).await + Some(EnsureState { propolis_id, target, tx }) => { + self.ensure_state(tx, propolis_id, target).await }, - Some(InstanceIssueDiskSnapshot { instance_id, disk_id, snapshot_id, tx }) => { - self.instance_issue_disk_snapshot_request(tx, instance_id, disk_id, snapshot_id).await + Some(IssueDiskSnapshot { propolis_id, disk_id, snapshot_id, tx }) => { + self.issue_disk_snapshot_request(tx, propolis_id, disk_id, snapshot_id).await }, Some(CreateZoneBundle { name, tx }) => { self.create_zone_bundle(tx, &name).await.map_err(Error::from) }, - Some(InstanceAddExternalIp { instance_id, ip, tx }) => { - self.add_external_ip(tx, instance_id, &ip).await + Some(AddExternalIp { propolis_id, ip, tx }) => { + self.add_external_ip(tx, propolis_id, &ip).await }, - Some(InstanceDeleteExternalIp { instance_id, ip, tx }) => { - self.delete_external_ip(tx, instance_id, &ip).await + Some(DeleteExternalIp { propolis_id, ip, tx }) => { + self.delete_external_ip(tx, propolis_id, &ip).await }, - Some(GetState { instance_id, tx }) => { + Some(GetState { propolis_id, tx }) => { // TODO(eliza): it could potentially be nice to // refactor this to use `tokio::sync::watch`, rather // than having to force `GetState` requests to // serialize with the requests that actually update // the state... 
- self.get_instance_state(tx, instance_id).await + self.get_instance_state(tx, propolis_id).await }, Some(OnlyUseDisks { disks, tx } ) => { self.use_only_these_disks(disks).await; @@ -533,8 +534,8 @@ impl InstanceManagerRunner { } } - fn get_instance(&self, instance_id: InstanceUuid) -> Option<&Instance> { - self.instances.get(&instance_id).map(|(_id, v)| v) + fn get_propolis(&self, propolis_id: PropolisUuid) -> Option<&Instance> { + self.jobs.get(&propolis_id) } /// Ensures that the instance manager contains a registered instance with @@ -565,7 +566,7 @@ impl InstanceManagerRunner { propolis_addr: SocketAddr, sled_identifiers: SledIdentifiers, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { info!( &self.log, "ensuring instance is registered"; @@ -579,17 +580,16 @@ impl InstanceManagerRunner { ); let instance = { - if let Some((existing_propolis_id, existing_instance)) = - self.instances.get(&instance_id) - { - if propolis_id != *existing_propolis_id { + if let Some(existing_instance) = self.jobs.get(&propolis_id) { + if instance_id != existing_instance.id() { info!(&self.log, - "instance already registered with another Propolis ID"; - "instance_id" => %instance_id, - "existing_propolis_id" => %*existing_propolis_id); + "Propolis ID already used by another instance"; + "propolis_id" => %propolis_id, + "existing_instance_id" => %existing_instance.id()); + return Err(Error::Instance( - crate::instance::Error::InstanceAlreadyRegistered( - *existing_propolis_id, + crate::instance::Error::PropolisAlreadyRegistered( + propolis_id, ), )); } else { @@ -602,11 +602,16 @@ impl InstanceManagerRunner { } else { info!(&self.log, "registering new instance"; - "instance_id" => ?instance_id); - let instance_log = - self.log.new(o!("instance_id" => format!("{instance_id}"))); + "instance_id" => %instance_id, + "propolis_id" => %propolis_id); + + let instance_log = self.log.new(o!( + "instance_id" => instance_id.to_string(), + "propolis_id" => propolis_id.to_string(), + )); + let ticket = - InstanceTicket::new(instance_id, self.terminate_tx.clone()); + InstanceTicket::new(propolis_id, self.terminate_tx.clone()); let services = InstanceManagerServices { nexus_client: self.nexus_client.clone(), @@ -635,27 +640,26 @@ impl InstanceManagerRunner { sled_identifiers, metadata, )?; - let _old = - self.instances.insert(instance_id, (propolis_id, instance)); + let _old = self.jobs.insert(propolis_id, instance); assert!(_old.is_none()); - &self.instances.get(&instance_id).unwrap().1 + &self.jobs.get(&propolis_id).unwrap() } }; Ok(instance.current_state().await?) } - /// Idempotently ensures the instance is not registered with this instance - /// manager. If the instance exists and has a running Propolis, that - /// Propolis is rudely terminated. + /// Idempotently ensures this VM is not registered with this instance + /// manager. If this Propolis job is registered and has a running zone, the + /// zone is rudely terminated. async fn ensure_unregistered( &mut self, - tx: oneshot::Sender>, - instance_id: InstanceUuid, + tx: oneshot::Sender>, + propolis_id: PropolisUuid, ) -> Result<(), Error> { // If the instance does not exist, we respond immediately.
- let Some(instance) = self.get_instance(instance_id) else { - tx.send(Ok(InstanceUnregisterResponse { updated_runtime: None })) + let Some(instance) = self.get_propolis(propolis_id) else { + tx.send(Ok(VmmUnregisterResponse { updated_runtime: None })) .map_err(|_| Error::FailedSendClientClosed)?; return Ok(()); }; @@ -667,15 +671,15 @@ impl InstanceManagerRunner { Ok(()) } - /// Idempotently attempts to drive the supplied instance into the supplied + /// Idempotently attempts to drive the supplied Propolis into the supplied /// runtime state. async fn ensure_state( &mut self, - tx: oneshot::Sender>, - instance_id: InstanceUuid, - target: InstanceStateRequested, + tx: oneshot::Sender>, + propolis_id: PropolisUuid, + target: VmmStateRequested, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { + let Some(instance) = self.get_propolis(propolis_id) else { match target { // If the instance isn't registered, then by definition it // isn't running here. Allow requests to stop or destroy the @@ -685,14 +689,12 @@ impl InstanceManagerRunner { // Propolis handled it, sled agent unregistered the // instance, and only then did a second stop request // arrive. - InstanceStateRequested::Stopped => { - tx.send(Ok(InstancePutStateResponse { - updated_runtime: None, - })) - .map_err(|_| Error::FailedSendClientClosed)?; + VmmStateRequested::Stopped => { + tx.send(Ok(VmmPutStateResponse { updated_runtime: None })) + .map_err(|_| Error::FailedSendClientClosed)?; } _ => { - tx.send(Err(Error::NoSuchInstance(instance_id))) + tx.send(Err(Error::NoSuchVmm(propolis_id))) .map_err(|_| Error::FailedSendClientClosed)?; } } @@ -702,20 +704,15 @@ impl InstanceManagerRunner { Ok(()) } - async fn instance_issue_disk_snapshot_request( + async fn issue_disk_snapshot_request( &self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { - let instance = { - let (_, instance) = self - .instances - .get(&instance_id) - .ok_or(Error::NoSuchInstance(instance_id))?; - instance - }; + let instance = + self.jobs.get(&propolis_id).ok_or(Error::NoSuchVmm(propolis_id))?; instance .issue_snapshot_request(tx, disk_id, snapshot_id) @@ -729,11 +726,19 @@ impl InstanceManagerRunner { tx: oneshot::Sender>, name: &str, ) -> Result<(), BundleError> { - let Some((_propolis_id, instance)) = - self.instances.values().find(|(propolis_id, _instance)| { - name == propolis_zone_name(propolis_id) - }) - else { + // A well-formed Propolis zone name must consist of + // `PROPOLIS_ZONE_PREFIX` and the Propolis ID. If the prefix is not + // present or the Propolis ID portion of the supplied zone name isn't + // parseable as a UUID, there is no Propolis zone with the specified + // name to capture into a bundle, so return a `NoSuchZone` error. 
+ let vmm_id: PropolisUuid = name + .strip_prefix(PROPOLIS_ZONE_PREFIX) + .and_then(|uuid_str| uuid_str.parse::().ok()) + .ok_or_else(|| BundleError::NoSuchZone { + name: name.to_string(), + })?; + + let Some(instance) = self.jobs.get(&vmm_id) else { return Err(BundleError::NoSuchZone { name: name.to_string() }); }; instance.request_zone_bundle(tx).await @@ -742,11 +747,11 @@ impl InstanceManagerRunner { async fn add_external_ip( &self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { - return Err(Error::NoSuchInstance(instance_id)); + let Some(instance) = self.get_propolis(propolis_id) else { + return Err(Error::NoSuchVmm(propolis_id)); }; instance.add_external_ip(tx, ip).await?; Ok(()) @@ -755,11 +760,11 @@ impl InstanceManagerRunner { async fn delete_external_ip( &self, tx: oneshot::Sender>, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, ip: &InstanceExternalIpBody, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { - return Err(Error::NoSuchInstance(instance_id)); + let Some(instance) = self.get_propolis(propolis_id) else { + return Err(Error::NoSuchVmm(propolis_id)); }; instance.delete_external_ip(tx, ip).await?; @@ -768,12 +773,12 @@ impl InstanceManagerRunner { async fn get_instance_state( &self, - tx: oneshot::Sender>, - instance_id: InstanceUuid, + tx: oneshot::Sender>, + propolis_id: PropolisUuid, ) -> Result<(), Error> { - let Some(instance) = self.get_instance(instance_id) else { + let Some(instance) = self.get_propolis(propolis_id) else { return tx - .send(Err(Error::NoSuchInstance(instance_id))) + .send(Err(Error::NoSuchVmm(propolis_id))) .map_err(|_| Error::FailedSendClientClosed); }; @@ -801,7 +806,7 @@ impl InstanceManagerRunner { let u2_set: HashSet<_> = disks.all_u2_zpools().into_iter().collect(); let mut to_remove = vec![]; - for (id, (_, instance)) in self.instances.iter() { + for (id, instance) in self.jobs.iter() { // If we can read the filesystem pool, consider it. Otherwise, move // on, to prevent blocking the cleanup of other instances. let Ok(Some(filesystem_pool)) = @@ -817,7 +822,7 @@ impl InstanceManagerRunner { for id in to_remove { info!(self.log, "use_only_these_disks: Removing instance"; "instance_id" => ?id); - if let Some((_, instance)) = self.instances.remove(&id) { + if let Some(instance) = self.jobs.remove(&id) { let (tx, rx) = oneshot::channel(); let mark_failed = true; if let Err(e) = instance.terminate(tx, mark_failed).await { @@ -835,22 +840,22 @@ impl InstanceManagerRunner { /// Represents membership of an instance in the [`InstanceManager`]. pub struct InstanceTicket { - id: InstanceUuid, + id: PropolisUuid, terminate_tx: Option>, } impl InstanceTicket { - // Creates a new instance ticket for instance "id" to be removed - // from the manger on destruction. + // Creates a new instance ticket for the Propolis job with the supplied `id` + // to be removed from the manager on destruction. 
fn new( - id: InstanceUuid, + id: PropolisUuid, terminate_tx: mpsc::UnboundedSender, ) -> Self { InstanceTicket { id, terminate_tx: Some(terminate_tx) } } #[cfg(all(test, target_os = "illumos"))] - pub(crate) fn new_without_manager_for_test(id: InstanceUuid) -> Self { + pub(crate) fn new_without_manager_for_test(id: PropolisUuid) -> Self { Self { id, terminate_tx: None } } diff --git a/sled-agent/src/rack_setup/mod.rs b/sled-agent/src/rack_setup/mod.rs index 0ec14138fc..e1b12d6b2b 100644 --- a/sled-agent/src/rack_setup/mod.rs +++ b/sled-agent/src/rack_setup/mod.rs @@ -9,3 +9,8 @@ mod plan; pub mod service; pub use plan::service::SledConfig; +pub use plan::service::{ + from_ipaddr_to_external_floating_ip, + from_sockaddr_to_external_floating_addr, + from_source_nat_config_to_external_snat_ip, +}; diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 055d2cdca2..e0959b0219 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -10,7 +10,13 @@ use illumos_utils::zpool::ZpoolName; use internal_dns::config::{Host, Zone}; use internal_dns::ServiceName; use nexus_sled_agent_shared::inventory::{ - Inventory, OmicronZoneConfig, OmicronZoneDataset, OmicronZoneType, SledRole, + Inventory, OmicronZoneDataset, SledRole, +}; +use nexus_types::deployment::{ + blueprint_zone_type, BlueprintPhysicalDisksConfig, BlueprintZoneConfig, + BlueprintZoneDisposition, BlueprintZoneType, + OmicronZoneExternalFloatingAddr, OmicronZoneExternalFloatingIp, + OmicronZoneExternalSnatIp, }; use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, ReservedRackSubnet, @@ -33,7 +39,9 @@ use omicron_common::policy::{ BOUNDARY_NTP_REDUNDANCY, COCKROACHDB_REDUNDANCY, DNS_REDUNDANCY, MAX_DNS_REDUNDANCY, NEXUS_REDUNDANCY, }; -use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid, SledUuid, ZpoolUuid}; +use omicron_uuid_kinds::{ + ExternalIpUuid, GenericUuid, OmicronZoneUuid, SledUuid, ZpoolUuid, +}; use rand::prelude::SliceRandom; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -120,10 +128,10 @@ pub enum PlanError { #[derive(Clone, Debug, Default, Serialize, Deserialize, JsonSchema)] pub struct SledConfig { /// Control plane disks configured for this sled - pub disks: OmicronPhysicalDisksConfig, + pub disks: BlueprintPhysicalDisksConfig, /// zones configured for this sled - pub zones: Vec, + pub zones: Vec, } #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] @@ -140,7 +148,53 @@ impl Ledgerable for Plan { } const RSS_SERVICE_PLAN_V1_FILENAME: &str = "rss-service-plan.json"; const RSS_SERVICE_PLAN_V2_FILENAME: &str = "rss-service-plan-v2.json"; -const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan-v3.json"; +const RSS_SERVICE_PLAN_V3_FILENAME: &str = "rss-service-plan-v3.json"; +const RSS_SERVICE_PLAN_FILENAME: &str = "rss-service-plan-v4.json"; + +pub fn from_sockaddr_to_external_floating_addr( + addr: SocketAddr, +) -> OmicronZoneExternalFloatingAddr { + // This is pretty weird: IP IDs don't exist yet, so it's fine for us + // to make them up (Nexus will record them as a part of the + // handoff). We could pass `None` here for some zone types, but it's + // a little simpler to just always pass a new ID, which will only be + // used if the zone type has an external IP. 
+ // + // This should all go away once RSS starts using blueprints more + // directly (instead of this conversion after the fact): + // https://github.com/oxidecomputer/omicron/issues/5272 + OmicronZoneExternalFloatingAddr { id: ExternalIpUuid::new_v4(), addr } +} + +pub fn from_ipaddr_to_external_floating_ip( + ip: IpAddr, +) -> OmicronZoneExternalFloatingIp { + // This is pretty weird: IP IDs don't exist yet, so it's fine for us + // to make them up (Nexus will record them as a part of the + // handoff). We could pass `None` here for some zone types, but it's + // a little simpler to just always pass a new ID, which will only be + // used if the zone type has an external IP. + // + // This should all go away once RSS starts using blueprints more + // directly (instead of this conversion after the fact): + // https://github.com/oxidecomputer/omicron/issues/5272 + OmicronZoneExternalFloatingIp { id: ExternalIpUuid::new_v4(), ip } +} + +pub fn from_source_nat_config_to_external_snat_ip( + snat_cfg: SourceNatConfig, +) -> OmicronZoneExternalSnatIp { + // This is pretty weird: IP IDs don't exist yet, so it's fine for us + // to make them up (Nexus will record them as a part of the + // handoff). We could pass `None` here for some zone types, but it's + // a little simpler to just always pass a new ID, which will only be + // used if the zone type has an external IP. + // + // This should all go away once RSS starts using blueprints more + // directly (instead of this conversion after the fact): + // https://github.com/oxidecomputer/omicron/issues/5272 + OmicronZoneExternalSnatIp { id: ExternalIpUuid::new_v4(), snat_cfg } +} impl Plan { pub async fn load( @@ -200,6 +254,14 @@ impl Plan { } })? { Err(PlanError::FoundV2) + } else if Self::has_v3(storage_manager).await.map_err(|err| { + // Same as the comment above, but for version 3. + PlanError::Io { + message: String::from("looking for v3 RSS plan"), + err, + } + })? { + Err(PlanError::FoundV2) } else { Ok(None) } @@ -243,6 +305,25 @@ impl Plan { Ok(false) } + async fn has_v3( + storage_manager: &StorageHandle, + ) -> Result { + let paths = storage_manager + .get_latest_disks() + .await + .all_m2_mountpoints(CONFIG_DATASET) + .into_iter() + .map(|p| p.join(RSS_SERVICE_PLAN_V3_FILENAME)); + + for p in paths { + if p.try_exists()? 
{ + return Ok(true); + } + } + + Ok(false) + } + async fn is_sled_scrimlet( log: &Logger, address: SocketAddrV6, @@ -419,20 +500,22 @@ impl Plan { sled.alloc_dataset_from_u2s(DatasetType::InternalDns)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::InternalDns { - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), - }, - http_address, - dns_address, - gz_address: dns_subnet.gz_address(), - gz_address_index: i.try_into().expect("Giant indices?"), - }, filesystem_pool, + zone_type: BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, + http_address, + dns_address, + gz_address: dns_subnet.gz_address(), + gz_address_index: i.try_into().expect("Giant indices?"), + }, + ), }); } @@ -458,16 +541,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::CockroachDb)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::CockroachDb { - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - address, - }, + ), filesystem_pool, }); } @@ -499,23 +584,27 @@ impl Plan { ) .unwrap(); let dns_port = omicron_common::address::DNS_PORT; - let dns_address = SocketAddr::new(external_ip, dns_port); + let dns_address = from_sockaddr_to_external_floating_addr( + SocketAddr::new(external_ip, dns_port), + ); let dataset_kind = DatasetType::ExternalDns; let dataset_name = sled.alloc_dataset_from_u2s(dataset_kind)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: *http_address.ip(), - zone_type: OmicronZoneType::ExternalDns { - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, + http_address, + dns_address, + nic, }, - http_address, - dns_address, - nic, - }, + ), filesystem_pool, }); } @@ -539,28 +628,32 @@ impl Plan { .unwrap(); let (nic, external_ip) = svc_port_builder.next_nexus(id)?; let filesystem_pool = Some(sled.alloc_zpool_from_u2s()?); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, - zone_type: OmicronZoneType::Nexus { - internal_address: SocketAddrV6::new( - address, - omicron_common::address::NEXUS_INTERNAL_PORT, - 0, - 0, - ), - external_ip, - nic, - // Tell Nexus to use TLS if and only if the 
caller - // provided TLS certificates. This effectively - // determines the status of TLS for the lifetime of - // the rack. In production-like deployments, we'd - // always expect TLS to be enabled. It's only in - // development that it might not be. - external_tls: !config.external_certificates.is_empty(), - external_dns_servers: config.dns_servers.clone(), - }, + zone_type: BlueprintZoneType::Nexus( + blueprint_zone_type::Nexus { + internal_address: SocketAddrV6::new( + address, + omicron_common::address::NEXUS_INTERNAL_PORT, + 0, + 0, + ), + external_ip: from_ipaddr_to_external_floating_ip( + external_ip, + ), + nic, + // Tell Nexus to use TLS if and only if the caller + // provided TLS certificates. This effectively + // determines the status of TLS for the lifetime of + // the rack. In production-like deployments, we'd + // always expect TLS to be enabled. It's only in + // development that it might not be. + external_tls: !config.external_certificates.is_empty(), + external_dns_servers: config.dns_servers.clone(), + }, + ), filesystem_pool, }); } @@ -584,18 +677,20 @@ impl Plan { ) .unwrap(); let filesystem_pool = Some(sled.alloc_zpool_from_u2s()?); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, - zone_type: OmicronZoneType::Oximeter { - address: SocketAddrV6::new( - address, - omicron_common::address::OXIMETER_PORT, - 0, - 0, - ), - }, + zone_type: BlueprintZoneType::Oximeter( + blueprint_zone_type::Oximeter { + address: SocketAddrV6::new( + address, + omicron_common::address::OXIMETER_PORT, + 0, + 0, + ), + }, + ), filesystem_pool, }) } @@ -623,16 +718,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::Clickhouse)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::Clickhouse { - address, - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::Clickhouse( + blueprint_zone_type::Clickhouse { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - }, + ), filesystem_pool, }); } @@ -664,16 +761,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::ClickhouseServer)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::ClickhouseServer { - address, - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::ClickhouseServer( + blueprint_zone_type::ClickhouseServer { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - }, + ), filesystem_pool, }); } @@ -703,16 +802,18 @@ impl Plan { let dataset_name = sled.alloc_dataset_from_u2s(DatasetType::ClickhouseKeeper)?; let filesystem_pool = Some(dataset_name.pool().clone()); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - 
id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::ClickhouseKeeper { - address, - dataset: OmicronZoneDataset { - pool_name: dataset_name.pool().clone(), + zone_type: BlueprintZoneType::ClickhouseKeeper( + blueprint_zone_type::ClickhouseKeeper { + address, + dataset: OmicronZoneDataset { + pool_name: dataset_name.pool().clone(), + }, }, - }, + ), filesystem_pool, }); } @@ -737,13 +838,15 @@ impl Plan { port, ) .unwrap(); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, - zone_type: OmicronZoneType::CruciblePantry { - address: SocketAddrV6::new(address, port, 0, 0), - }, + zone_type: BlueprintZoneType::CruciblePantry( + blueprint_zone_type::CruciblePantry { + address: SocketAddrV6::new(address, port, 0, 0), + }, + ), filesystem_pool, }); } @@ -765,14 +868,18 @@ impl Plan { ) .unwrap(); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: ip, - zone_type: OmicronZoneType::Crucible { - address, - dataset: OmicronZoneDataset { pool_name: pool.clone() }, - }, + zone_type: BlueprintZoneType::Crucible( + blueprint_zone_type::Crucible { + address, + dataset: OmicronZoneDataset { + pool_name: pool.clone(), + }, + }, + ), filesystem_pool: Some(pool.clone()), }); } @@ -793,24 +900,31 @@ impl Plan { .push(Host::for_zone(Zone::Other(id)).fqdn()); let (nic, snat_cfg) = svc_port_builder.next_snat(id)?; ( - OmicronZoneType::BoundaryNtp { - address: ntp_address, - ntp_servers: config.ntp_servers.clone(), - dns_servers: config.dns_servers.clone(), - domain: None, - nic, - snat_cfg, - }, + BlueprintZoneType::BoundaryNtp( + blueprint_zone_type::BoundaryNtp { + address: ntp_address, + ntp_servers: config.ntp_servers.clone(), + dns_servers: config.dns_servers.clone(), + domain: None, + nic, + external_ip: + from_source_nat_config_to_external_snat_ip( + snat_cfg, + ), + }, + ), ServiceName::BoundaryNtp, ) } else { ( - OmicronZoneType::InternalNtp { - address: ntp_address, - ntp_servers: boundary_ntp_servers.clone(), - dns_servers: rack_dns_servers.clone(), - domain: None, - }, + BlueprintZoneType::InternalNtp( + blueprint_zone_type::InternalNtp { + address: ntp_address, + ntp_servers: boundary_ntp_servers.clone(), + dns_servers: rack_dns_servers.clone(), + domain: None, + }, + ), ServiceName::InternalNtp, ) }; @@ -819,9 +933,9 @@ impl Plan { .host_zone_with_one_backend(id, address, svcname, NTP_PORT) .unwrap(); - sled.request.zones.push(OmicronZoneConfig { - // TODO-cleanup use TypedUuid everywhere - id: id.into_untyped_uuid(), + sled.request.zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id, underlay_address: address, zone_type, filesystem_pool, @@ -1379,10 +1493,10 @@ mod tests { } #[test] - fn test_rss_service_plan_v3_schema() { + fn test_rss_service_plan_v4_schema() { let schema = schemars::schema_for!(Plan); expectorate::assert_contents( - "../schema/rss-service-plan-v3.json", + "../schema/rss-service-plan-v4.json", &serde_json::to_string_pretty(&schema).unwrap(), ); } diff --git a/sled-agent/src/rack_setup/service.rs 
b/sled-agent/src/rack_setup/service.rs index 20cd5646c0..3f73e55d0f 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -71,7 +71,6 @@ use crate::bootstrap::early_networking::{ }; use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::nexus::d2n_params; -use crate::params::OmicronZoneTypeExt; use crate::rack_setup::plan::service::{ Plan as ServicePlan, PlanError as ServicePlanError, }; @@ -91,9 +90,8 @@ use nexus_sled_agent_shared::inventory::{ OmicronZoneConfig, OmicronZoneType, OmicronZonesConfig, }; use nexus_types::deployment::{ - Blueprint, BlueprintPhysicalDisksConfig, BlueprintZoneConfig, - BlueprintZoneDisposition, BlueprintZonesConfig, - CockroachDbPreserveDowngrade, InvalidOmicronZoneType, + blueprint_zone_type, Blueprint, BlueprintZoneType, BlueprintZonesConfig, + CockroachDbPreserveDowngrade, }; use nexus_types::external_api::views::SledState; use omicron_common::address::get_sled_address; @@ -108,8 +106,8 @@ use omicron_common::disk::{ }; use omicron_common::ledger::{self, Ledger, Ledgerable}; use omicron_ddm_admin_client::{Client as DdmAdminClient, DdmError}; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledUuid; -use omicron_uuid_kinds::{ExternalIpUuid, GenericUuid}; use serde::{Deserialize, Serialize}; use sled_agent_client::{ types as SledAgentTypes, Client as SledAgentClient, Error as SledAgentError, @@ -533,7 +531,7 @@ impl ServiceInner { .iter() .filter_map(|zone_config| { match &zone_config.zone_type { - OmicronZoneType::InternalDns { http_address, .. } + BlueprintZoneType::InternalDns(blueprint_zone_type::InternalDns{ http_address, .. }) => { Some(*http_address) }, @@ -719,15 +717,17 @@ impl ServiceInner { let mut datasets: Vec = vec![]; for sled_config in service_plan.services.values() { for zone in &sled_config.zones { - if let Some((dataset_name, dataset_address)) = - zone.dataset_name_and_address() - { + if let Some(dataset) = zone.zone_type.durable_dataset() { datasets.push(NexusTypes::DatasetCreateRequest { - zpool_id: dataset_name.pool().id().into_untyped_uuid(), - dataset_id: zone.id, + zpool_id: dataset + .dataset + .pool_name + .id() + .into_untyped_uuid(), + dataset_id: zone.id.into_untyped_uuid(), request: NexusTypes::DatasetPutRequest { - address: dataset_address.to_string(), - kind: dataset_name.dataset().kind(), + address: dataset.address.to_string(), + kind: dataset.kind, }, }) } @@ -981,7 +981,7 @@ impl ServiceInner { if sled_config.zones.iter().any(|zone_config| { matches!( &zone_config.zone_type, - OmicronZoneType::CockroachDb { .. } + BlueprintZoneType::CockroachDb(_) ) }) { Some(sled_address) @@ -1398,7 +1398,7 @@ fn build_initial_blueprint_from_plan( let blueprint = build_initial_blueprint_from_sled_configs( sled_configs_by_id, internal_dns_version, - )?; + ); Ok(blueprint) } @@ -1406,47 +1406,11 @@ fn build_initial_blueprint_from_plan( pub(crate) fn build_initial_blueprint_from_sled_configs( sled_configs_by_id: &BTreeMap, internal_dns_version: Generation, -) -> Result { - // Helper to convert an `OmicronZoneConfig` into a `BlueprintZoneConfig`. - // This is separate primarily so rustfmt doesn't lose its mind. - let to_bp_zone_config = |z: &OmicronZoneConfig| { - // All initial zones are in-service. 
- let disposition = BlueprintZoneDisposition::InService; - BlueprintZoneConfig::from_omicron_zone_config( - z.clone(), - disposition, - // This is pretty weird: IP IDs don't exist yet, so it's fine for us - // to make them up (Nexus will record them as a part of the - // handoff). We could pass `None` here for some zone types, but it's - // a little simpler to just always pass a new ID, which will only be - // used if the zone type has an external IP. - // - // This should all go away once RSS starts using blueprints more - // directly (instead of this conversion after the fact): - // https://github.com/oxidecomputer/omicron/issues/5272 - Some(ExternalIpUuid::new_v4()), - ) - }; - - let mut blueprint_disks = BTreeMap::new(); - for (sled_id, sled_config) in sled_configs_by_id { - blueprint_disks.insert( - *sled_id, - BlueprintPhysicalDisksConfig { - generation: sled_config.disks.generation, - disks: sled_config - .disks - .disks - .iter() - .map(|d| OmicronPhysicalDiskConfig { - identity: d.identity.clone(), - id: d.id, - pool_id: d.pool_id, - }) - .collect(), - }, - ); - } +) -> Blueprint { + let blueprint_disks: BTreeMap<_, _> = sled_configs_by_id + .iter() + .map(|(sled_id, sled_config)| (*sled_id, sled_config.disks.clone())) + .collect(); let mut blueprint_zones = BTreeMap::new(); let mut sled_state = BTreeMap::new(); @@ -1463,18 +1427,14 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( // value, we will need to revisit storing this in the serialized // RSS plan. generation: DeployStepVersion::V5_EVERYTHING, - zones: sled_config - .zones - .iter() - .map(to_bp_zone_config) - .collect::>()?, + zones: sled_config.zones.clone(), }; blueprint_zones.insert(*sled_id, zones_config); sled_state.insert(*sled_id, SledState::Active); } - Ok(Blueprint { + Blueprint { id: Uuid::new_v4(), blueprint_zones, blueprint_disks, @@ -1492,7 +1452,7 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( time_created: Utc::now(), creator: "RSS".to_string(), comment: "initial blueprint from rack setup".to_string(), - }) + } } /// Facilitates creating a sequence of OmicronZonesConfig objects for each sled @@ -1570,11 +1530,14 @@ impl<'a> OmicronZonesConfigGenerator<'a> { sled_config .zones .iter() + .cloned() + .map(|bp_zone_config| { + OmicronZoneConfig::from(bp_zone_config) + }) .filter(|z| { !zones_already.contains(&z.id) && zone_filter(&z.zone_type) - }) - .cloned(), + }), ); let config = OmicronZonesConfig { generation: version, zones }; diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 6057d03f70..d75081f1e4 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -364,35 +364,6 @@ impl SimCollection { pub async fn contains_key(self: &Arc, id: &Uuid) -> bool { self.objects.lock().await.contains_key(id) } - - /// Iterates over all of the existing objects in the collection and, for any - /// that meet `condition`, asks to transition them into the supplied target - /// state. - /// - /// If any such transition fails, this routine short-circuits and does not - /// attempt to transition any other objects. - // - // TODO: It's likely more idiomatic to have an `iter_mut` routine that - // returns a struct that impls Iterator and yields &mut S references. The - // tricky bit is that the struct must hold the objects lock during the - // iteration. Figure out if there's a better way to arrange all this. 
- pub async fn sim_ensure_for_each_where( - self: &Arc, - condition: C, - target: &S::RequestedState, - ) -> Result<(), Error> - where - C: Fn(&S) -> bool, - { - let mut objects = self.objects.lock().await; - for o in objects.values_mut() { - if condition(&o.object) { - o.transition(target.clone())?; - } - } - - Ok(()) - } } impl SimCollection { @@ -421,30 +392,24 @@ mod test { use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::DiskRuntimeState; - use omicron_common::api::internal::nexus::SledInstanceState; + use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::nexus::VmmState; use omicron_test_utils::dev::test_setup_log; - use omicron_uuid_kinds::PropolisUuid; use sled_agent_types::disk::DiskStateRequested; - use sled_agent_types::instance::InstanceStateRequested; + use sled_agent_types::instance::VmmStateRequested; fn make_instance( logctx: &LogContext, ) -> (SimObject, Receiver<()>) { - let propolis_id = PropolisUuid::new_v4(); let vmm_state = VmmRuntimeState { state: VmmState::Starting, gen: Generation::new(), time_updated: Utc::now(), }; - let state = SledInstanceState { - vmm_state, - propolis_id, - migration_in: None, - migration_out: None, - }; + let state = + SledVmmState { vmm_state, migration_in: None, migration_out: None }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) } @@ -488,8 +453,7 @@ mod test { // Stopping an instance that was never started synchronously destroys // its VMM. let rprev = r1; - let dropped = - instance.transition(InstanceStateRequested::Stopped).unwrap(); + let dropped = instance.transition(VmmStateRequested::Stopped).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); let rnext = instance.object.current(); @@ -529,8 +493,7 @@ mod test { // simulated instance's state, but it does queue up a transition. let mut rprev = r1; assert!(rx.try_next().is_err()); - let dropped = - instance.transition(InstanceStateRequested::Running).unwrap(); + let dropped = instance.transition(VmmStateRequested::Running).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_some()); assert!(rx.try_next().is_err()); @@ -562,8 +525,7 @@ mod test { // If we transition again to "Running", the process should complete // immediately. - let dropped = - instance.transition(InstanceStateRequested::Running).unwrap(); + let dropped = instance.transition(VmmStateRequested::Running).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); assert!(rx.try_next().is_err()); @@ -576,8 +538,7 @@ mod test { // If we go back to any stopped state, we go through the async process // again. assert!(rx.try_next().is_err()); - let dropped = - instance.transition(InstanceStateRequested::Stopped).unwrap(); + let dropped = instance.transition(VmmStateRequested::Stopped).unwrap(); assert!(dropped.is_none()); assert!(instance.object.desired().is_some()); let rnext = instance.object.current(); @@ -634,7 +595,7 @@ mod test { assert_eq!(r1.vmm_state.state, VmmState::Starting); assert_eq!(r1.vmm_state.gen, Generation::new()); assert!(instance - .transition(InstanceStateRequested::Running) + .transition(VmmStateRequested::Running) .unwrap() .is_none()); instance.transition_finish(); @@ -650,7 +611,7 @@ mod test { // Now reboot the instance. This is dispatched to Propolis, which will // move to the Rebooting state and then back to Running. 
assert!(instance - .transition(InstanceStateRequested::Reboot) + .transition(VmmStateRequested::Reboot) .unwrap() .is_none()); let (rprev, rnext) = (rnext, instance.object.current()); diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index e93bebad98..aead47658f 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -23,7 +23,7 @@ use dropshot::TypedBody; use nexus_sled_agent_shared::inventory::SledRole; use nexus_sled_agent_shared::inventory::{Inventory, OmicronZonesConfig}; use omicron_common::api::internal::nexus::DiskRuntimeState; -use omicron_common::api::internal::nexus::SledInstanceState; +use omicron_common::api::internal::nexus::SledVmmState; use omicron_common::api::internal::nexus::UpdateArtifactId; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; @@ -32,7 +32,6 @@ use omicron_common::api::internal::shared::{ }; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use sled_agent_api::*; use sled_agent_types::boot_disk::BootDiskOsWriteStatus; use sled_agent_types::boot_disk::BootDiskPathParams; @@ -44,9 +43,9 @@ use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::InstanceEnsureBody; use sled_agent_types::instance::InstanceExternalIpBody; -use sled_agent_types::instance::InstancePutStateBody; -use sled_agent_types::instance::InstancePutStateResponse; -use sled_agent_types::instance::InstanceUnregisterResponse; +use sled_agent_types::instance::VmmPutStateBody; +use sled_agent_types::instance::VmmPutStateResponse; +use sled_agent_types::instance::VmmUnregisterResponse; use sled_agent_types::sled::AddSledRequest; use sled_agent_types::time_sync::TimeSync; use sled_agent_types::zone_bundle::BundleUtilization; @@ -83,18 +82,18 @@ enum SledAgentSimImpl {} impl SledAgentApi for SledAgentSimImpl { type Context = Arc; - async fn instance_register( + async fn vmm_register( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let propolis_id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); Ok(HttpResponseOk( sa.instance_register( - instance_id, - body_args.propolis_id, + body_args.instance_id, + propolis_id, body_args.hardware, body_args.instance_runtime, body_args.vmm_runtime, @@ -104,58 +103,56 @@ impl SledAgentApi for SledAgentSimImpl { )) } - async fn instance_unregister( + async fn vmm_unregister( rqctx: RequestContext, - path_params: Path, - ) -> Result, HttpError> { + path_params: Path, + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_unregister(instance_id).await?)) + let id = path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_unregister(id).await?)) } - async fn instance_put_state( + async fn vmm_put_state( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result, HttpError> { + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let 
body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_ensure_state(instance_id, body_args.state).await?, - )) + Ok(HttpResponseOk(sa.instance_ensure_state(id, body_args.state).await?)) } - async fn instance_get_state( + async fn vmm_get_state( rqctx: RequestContext, - path_params: Path, - ) -> Result, HttpError> { + path_params: Path, + ) -> Result, HttpError> { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) + let id = path_params.into_inner().propolis_id; + Ok(HttpResponseOk(sa.instance_get_state(id).await?)) } - async fn instance_put_external_ip( + async fn vmm_put_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_put_external_ip(instance_id, &body_args).await?; + sa.instance_put_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } - async fn instance_delete_external_ip( + async fn vmm_delete_external_ip( rqctx: RequestContext, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; + let id = path_params.into_inner().propolis_id; let body_args = body.into_inner(); - sa.instance_delete_external_ip(instance_id, &body_args).await?; + sa.instance_delete_external_ip(id, &body_args).await?; Ok(HttpResponseUpdatedNoContent()) } @@ -192,27 +189,25 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseUpdatedNoContent()) } - async fn instance_issue_disk_snapshot_request( + async fn vmm_issue_disk_snapshot_request( rqctx: RequestContext, - path_params: Path, - body: TypedBody, - ) -> Result< - HttpResponseOk, - HttpError, - > { + path_params: Path, + body: TypedBody, + ) -> Result, HttpError> + { let sa = rqctx.context(); let path_params = path_params.into_inner(); let body = body.into_inner(); sa.instance_issue_disk_snapshot_request( - InstanceUuid::from_untyped_uuid(path_params.instance_id), + path_params.propolis_id, path_params.disk_id, body.snapshot_id, ) .await .map_err(|e| HttpError::for_internal_error(e.to_string()))?; - Ok(HttpResponseOk(InstanceIssueDiskSnapshotRequestResponse { + Ok(HttpResponseOk(VmmIssueDiskSnapshotRequestResponse { snapshot_id: body.snapshot_id, })) } @@ -512,45 +507,44 @@ fn method_unimplemented() -> Result { #[endpoint { method = POST, - path = "/instances/{instance_id}/poke", + path = "/vmms/{propolis_id}/poke", }] async fn instance_poke_post( rqctx: RequestContext>, - path_params: Path, + path_params: Path, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id, PokeMode::Drain).await; + let id = path_params.into_inner().propolis_id; + sa.vmm_poke(id, PokeMode::Drain).await; Ok(HttpResponseUpdatedNoContent()) } #[endpoint { method = POST, - path = "/instances/{instance_id}/poke-single-step", + path = "/vmms/{propolis_id}/poke-single-step", }] async fn instance_poke_single_step_post( rqctx: RequestContext>, - path_params: Path, + path_params: Path, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id, PokeMode::SingleStep).await; + let id = path_params.into_inner().propolis_id; + sa.vmm_poke(id, PokeMode::SingleStep).await; 
Ok(HttpResponseUpdatedNoContent()) } #[endpoint { method = POST, - path = "/instances/{instance_id}/sim-migration-source", + path = "/vmms/{propolis_id}/sim-migration-source", }] async fn instance_post_sim_migration_source( rqctx: RequestContext>, - path_params: Path, + path_params: Path, body: TypedBody, ) -> Result { let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - sa.instance_simulate_migration_source(instance_id, body.into_inner()) - .await?; + let id = path_params.into_inner().propolis_id; + sa.instance_simulate_migration_source(id, body.into_inner()).await?; Ok(HttpResponseUpdatedNoContent()) } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 33bc1c40c1..eb7ea0ca79 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -14,13 +14,14 @@ use nexus_client; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::{SledInstanceState, VmmState}; +use omicron_common::api::internal::nexus::{SledVmmState, VmmState}; +use omicron_uuid_kinds::{GenericUuid, PropolisUuid}; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateResponse, InstanceMigrationStatus as PropolisMigrationStatus, InstanceState as PropolisInstanceState, InstanceStateMonitorResponse, }; -use sled_agent_types::instance::InstanceStateRequested; +use sled_agent_types::instance::VmmStateRequested; use std::collections::VecDeque; use std::sync::Arc; use std::sync::Mutex; @@ -170,13 +171,13 @@ impl SimInstanceInner { /// returning an action for the caller to simulate. fn request_transition( &mut self, - target: &InstanceStateRequested, + target: &VmmStateRequested, ) -> Result, Error> { match target { // When Nexus intends to migrate into a VMM, it should create that // VMM in the Migrating state and shouldn't request anything else // from it before asking to migrate in. - InstanceStateRequested::MigrationTarget(_) => { + VmmStateRequested::MigrationTarget(_) => { if !self.queue.is_empty() { return Err(Error::invalid_request(&format!( "can't request migration in with a non-empty state @@ -207,7 +208,7 @@ impl SimInstanceInner { SimulatedMigrationResult::Success, ); } - InstanceStateRequested::Running => { + VmmStateRequested::Running => { match self.next_resting_state() { VmmState::Starting => { self.queue_propolis_state( @@ -234,7 +235,7 @@ impl SimInstanceInner { } } } - InstanceStateRequested::Stopped => { + VmmStateRequested::Stopped => { match self.next_resting_state() { VmmState::Starting => { let mark_failed = false; @@ -256,7 +257,7 @@ impl SimInstanceInner { } } } - InstanceStateRequested::Reboot => match self.next_resting_state() { + VmmStateRequested::Reboot => match self.next_resting_state() { VmmState::Running => { // Further requests to reboot are ignored if the instance // is currently rebooting or about to reboot. @@ -315,7 +316,7 @@ impl SimInstanceInner { /// If the state change queue contains at least once instance state change, /// returns the requested instance state associated with the last instance /// state on the queue. Returns None otherwise. 
- fn desired(&self) -> Option { + fn desired(&self) -> Option { self.last_queued_instance_state().map(|terminal| match terminal { // State change requests may queue these states as intermediate // states, but the simulation (and the tests that rely on it) is @@ -331,13 +332,11 @@ impl SimInstanceInner { "pending resting state {:?} doesn't map to a requested state", terminal ), - PropolisInstanceState::Running => InstanceStateRequested::Running, + PropolisInstanceState::Running => VmmStateRequested::Running, PropolisInstanceState::Stopping | PropolisInstanceState::Stopped - | PropolisInstanceState::Destroyed => { - InstanceStateRequested::Stopped - } - PropolisInstanceState::Rebooting => InstanceStateRequested::Reboot, + | PropolisInstanceState::Destroyed => VmmStateRequested::Stopped, + PropolisInstanceState::Rebooting => VmmStateRequested::Reboot, }) } @@ -388,7 +387,7 @@ impl SimInstanceInner { /// Simulates rude termination by moving the instance to the Destroyed state /// immediately and clearing the queue of pending state transitions. - fn terminate(&mut self) -> SledInstanceState { + fn terminate(&mut self) -> SledVmmState { let mark_failed = false; self.state.terminate_rudely(mark_failed); self.queue.clear(); @@ -418,7 +417,7 @@ pub struct SimInstance { } impl SimInstance { - pub fn terminate(&self) -> SledInstanceState { + pub fn terminate(&self) -> SledVmmState { self.inner.lock().unwrap().terminate() } @@ -435,12 +434,12 @@ impl SimInstance { #[async_trait] impl Simulatable for SimInstance { - type CurrentState = SledInstanceState; - type RequestedState = InstanceStateRequested; + type CurrentState = SledVmmState; + type RequestedState = VmmStateRequested; type ProducerArgs = (); type Action = InstanceAction; - fn new(current: SledInstanceState) -> Self { + fn new(current: SledVmmState) -> Self { assert!(matches!( current.vmm_state.state, VmmState::Starting | VmmState::Migrating), @@ -453,7 +452,6 @@ impl Simulatable for SimInstance { inner: Arc::new(Mutex::new(SimInstanceInner { state: InstanceStates::new( current.vmm_state, - current.propolis_id, current.migration_in.map(|m| m.migration_id), ), last_response: InstanceStateMonitorResponse { @@ -480,7 +478,7 @@ impl Simulatable for SimInstance { fn request_transition( &mut self, - target: &InstanceStateRequested, + target: &VmmStateRequested, ) -> Result, Error> { self.inner.lock().unwrap().request_transition(target) } @@ -512,8 +510,8 @@ impl Simulatable for SimInstance { ) -> Result<(), Error> { nexus_client .cpapi_instances_put( - id, - &nexus_client::types::SledInstanceState::from(current), + &PropolisUuid::from_untyped_uuid(*id), + &nexus_client::types::SledVmmState::from(current), ) .await .map(|_| ()) diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index 189f775adb..b546025654 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -12,6 +12,10 @@ use crate::nexus::d2n_params; use crate::nexus::NexusClient; use crate::rack_setup::service::build_initial_blueprint_from_sled_configs; use crate::rack_setup::SledConfig; +use crate::rack_setup::{ + from_ipaddr_to_external_floating_ip, + from_sockaddr_to_external_floating_addr, +}; use anyhow::anyhow; use crucible_agent_client::types::State as RegionState; use illumos_utils::zpool::ZpoolName; @@ -19,9 +23,11 @@ use internal_dns::ServiceName; use nexus_client::types as NexusTypes; use nexus_client::types::{IpRange, Ipv4Range, Ipv6Range}; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; -use 
nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; -use nexus_sled_agent_shared::inventory::OmicronZoneType; +use nexus_types::deployment::blueprint_zone_type; +use nexus_types::deployment::{ + BlueprintZoneConfig, BlueprintZoneDisposition, BlueprintZoneType, +}; use nexus_types::inventory::NetworkInterfaceKind; use omicron_common::address::DNS_OPTE_IPV4_SUBNET; use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; @@ -36,6 +42,7 @@ use omicron_common::backoff::{ use omicron_common::disk::DiskIdentity; use omicron_common::FileKv; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; use oxnet::Ipv6Net; @@ -375,19 +382,22 @@ pub async fn run_standalone_server( SocketAddr::V6(a) => a, }; let pool_name = ZpoolName::new_external(ZpoolUuid::new_v4()); - let mut zones = vec![OmicronZoneConfig { - id: Uuid::new_v4(), + let mut zones = vec![BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, + id: OmicronZoneUuid::new_v4(), underlay_address: *http_bound.ip(), - zone_type: OmicronZoneType::InternalDns { - dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, - http_address: http_bound, - dns_address: match dns.dns_server.local_address() { - SocketAddr::V4(_) => panic!("did not expect v4 address"), - SocketAddr::V6(a) => a, + zone_type: BlueprintZoneType::InternalDns( + blueprint_zone_type::InternalDns { + dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, + http_address: http_bound, + dns_address: match dns.dns_server.local_address() { + SocketAddr::V4(_) => panic!("did not expect v4 address"), + SocketAddr::V6(a) => a, + }, + gz_address: Ipv6Addr::LOCALHOST, + gz_address_index: 0, }, - gz_address: Ipv6Addr::LOCALHOST, - gz_address_index: 0, - }, + ), // Co-locate the filesystem pool with the dataset filesystem_pool: Some(pool_name), }]; @@ -396,23 +406,26 @@ pub async fn run_standalone_server( let mut macs = MacAddr::iter_system(); if let Some(nexus_external_addr) = rss_args.nexus_external_addr { let ip = nexus_external_addr.ip(); - let id = Uuid::new_v4(); + let id = OmicronZoneUuid::new_v4(); - zones.push(OmicronZoneConfig { + zones.push(BlueprintZoneConfig { + disposition: BlueprintZoneDisposition::InService, id, underlay_address: match ip { IpAddr::V4(_) => panic!("did not expect v4 address"), IpAddr::V6(a) => a, }, - zone_type: OmicronZoneType::Nexus { + zone_type: BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { internal_address: match config.nexus_address { SocketAddr::V4(_) => panic!("did not expect v4 address"), SocketAddr::V6(a) => a, }, - external_ip: ip, + external_ip: from_ipaddr_to_external_floating_ip(ip), nic: nexus_types::inventory::NetworkInterface { id: Uuid::new_v4(), - kind: NetworkInterfaceKind::Service { id }, + kind: NetworkInterfaceKind::Service { + id: id.into_untyped_uuid(), + }, name: "nexus".parse().unwrap(), ip: NEXUS_OPTE_IPV4_SUBNET .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1) @@ -427,7 +440,7 @@ pub async fn run_standalone_server( }, external_tls: false, external_dns_servers: vec![], - }, + }), filesystem_pool: Some(get_random_zpool()), }); @@ -445,31 +458,40 @@ pub async fn run_standalone_server( rss_args.external_dns_internal_addr { let ip = *external_dns_internal_addr.ip(); - let id = Uuid::new_v4(); + let id = OmicronZoneUuid::new_v4(); let pool_name = ZpoolName::new_external(ZpoolUuid::new_v4()); - zones.push(OmicronZoneConfig { + zones.push(BlueprintZoneConfig 
{ + disposition: BlueprintZoneDisposition::InService, id, underlay_address: ip, - zone_type: OmicronZoneType::ExternalDns { - dataset: OmicronZoneDataset { pool_name: pool_name.clone() }, - http_address: external_dns_internal_addr, - dns_address: SocketAddr::V6(external_dns_internal_addr), - nic: nexus_types::inventory::NetworkInterface { - id: Uuid::new_v4(), - kind: NetworkInterfaceKind::Service { id }, - name: "external-dns".parse().unwrap(), - ip: DNS_OPTE_IPV4_SUBNET - .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1) - .unwrap() - .into(), - mac: macs.next().unwrap(), - subnet: (*DNS_OPTE_IPV4_SUBNET).into(), - vni: Vni::SERVICES_VNI, - primary: true, - slot: 0, - transit_ips: vec![], + zone_type: BlueprintZoneType::ExternalDns( + blueprint_zone_type::ExternalDns { + dataset: OmicronZoneDataset { + pool_name: pool_name.clone(), + }, + http_address: external_dns_internal_addr, + dns_address: from_sockaddr_to_external_floating_addr( + SocketAddr::V6(external_dns_internal_addr), + ), + nic: nexus_types::inventory::NetworkInterface { + id: Uuid::new_v4(), + kind: NetworkInterfaceKind::Service { + id: id.into_untyped_uuid(), + }, + name: "external-dns".parse().unwrap(), + ip: DNS_OPTE_IPV4_SUBNET + .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1) + .unwrap() + .into(), + mac: macs.next().unwrap(), + subnet: (*DNS_OPTE_IPV4_SUBNET).into(), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + transit_ips: vec![], + }, }, - }, + ), // Co-locate the filesystem pool with the dataset filesystem_pool: Some(pool_name), }); @@ -530,8 +552,7 @@ pub async fn run_standalone_server( blueprint: build_initial_blueprint_from_sled_configs( &sled_configs, internal_dns_version, - ) - .expect("failed to construct initial blueprint"), + ), physical_disks, zpools, datasets, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 10536c8c80..7292b3dee1 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -24,7 +24,7 @@ use omicron_common::api::external::{ ByteCount, DiskState, Error, Generation, ResourceType, }; use omicron_common::api::internal::nexus::{ - DiskRuntimeState, MigrationRuntimeState, MigrationState, SledInstanceState, + DiskRuntimeState, MigrationRuntimeState, MigrationState, SledVmmState, }; use omicron_common::api::internal::nexus::{ InstanceRuntimeState, VmmRuntimeState, @@ -50,8 +50,7 @@ use sled_agent_types::early_networking::{ }; use sled_agent_types::instance::{ InstanceExternalIpBody, InstanceHardware, InstanceMetadata, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, + VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use slog::Logger; use std::collections::{HashMap, HashSet, VecDeque}; @@ -71,8 +70,8 @@ use uuid::Uuid; pub struct SledAgent { pub id: Uuid, pub ip: IpAddr, - /// collection of simulated instances, indexed by instance uuid - instances: Arc>, + /// collection of simulated VMMs, indexed by Propolis uuid + vmms: Arc>, /// collection of simulated disks, indexed by disk uuid disks: Arc>, storage: Mutex, @@ -84,7 +83,8 @@ pub struct SledAgent { mock_propolis: Mutex>, PropolisClient)>>, /// lists of external IPs assigned to instances - pub external_ips: Mutex>>, + pub external_ips: + Mutex>>, pub vpc_routes: Mutex>, config: Config, fake_zones: Mutex, @@ -170,7 +170,7 @@ impl SledAgent { Arc::new(SledAgent { id, ip: config.dropshot.bind_address.ip(), - instances: Arc::new(SimCollection::new( + vmms: Arc::new(SimCollection::new( Arc::clone(&nexus_client), instance_log, sim_mode, 
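The hunk above re-keys the simulator's state (the `vmms` collection and the `external_ips` map) by Propolis ID instead of instance ID, while the underlying `SimCollection` stays indexed by a plain `Uuid`; the hunks that follow therefore convert the typed ID at each call site. A minimal sketch of that conversion, assuming the workspace's `omicron_uuid_kinds` and `uuid` crates as already imported elsewhere in this diff (illustrative only, not part of the change):

use omicron_uuid_kinds::{GenericUuid, PropolisUuid};
use uuid::Uuid;

// Hypothetical helper, not in this change: the lookup key for the simulated
// `vmms` collection is the Propolis ID downgraded to an untyped `Uuid`,
// mirroring calls such as
// `self.vmms.sim_get_cloned_object(&propolis_id.into_untyped_uuid())`.
fn sim_vmm_key(propolis_id: PropolisUuid) -> Uuid {
    propolis_id.into_untyped_uuid()
}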
@@ -269,7 +269,7 @@ impl SledAgent { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { // respond with a fake 500 level failure if asked to ensure an instance // with more than 16 CPUs. let ncpus: i64 = (&hardware.properties.ncpus).into(); @@ -317,11 +317,7 @@ impl SledAgent { // point to the correct address. let mock_lock = self.mock_propolis.lock().await; if let Some((_srv, client)) = mock_lock.as_ref() { - if !self - .instances - .contains_key(&instance_id.into_untyped_uuid()) - .await - { + if !self.vmms.contains_key(&instance_id.into_untyped_uuid()).await { let metadata = propolis_client::types::InstanceMetadata { project_id: metadata.project_id, silo_id: metadata.silo_id, @@ -379,12 +375,11 @@ impl SledAgent { }); let instance_run_time_state = self - .instances + .vmms .sim_ensure( - &instance_id.into_untyped_uuid(), - SledInstanceState { + &propolis_id.into_untyped_uuid(), + SledVmmState { vmm_state: vmm_runtime, - propolis_id, migration_in, migration_out: None, }, @@ -417,56 +412,53 @@ impl SledAgent { /// not notified. pub async fn instance_unregister( self: &Arc, - instance_id: InstanceUuid, - ) -> Result { + propolis_id: PropolisUuid, + ) -> Result { let instance = match self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await { Ok(instance) => instance, Err(Error::ObjectNotFound { .. }) => { - return Ok(InstanceUnregisterResponse { updated_runtime: None }) + return Ok(VmmUnregisterResponse { updated_runtime: None }) } Err(e) => return Err(e), }; - self.detach_disks_from_instance(instance_id).await?; - let response = InstanceUnregisterResponse { + let response = VmmUnregisterResponse { updated_runtime: Some(instance.terminate()), }; - self.instances.sim_force_remove(instance_id.into_untyped_uuid()).await; + self.vmms.sim_force_remove(propolis_id.into_untyped_uuid()).await; Ok(response) } /// Asks the supplied instance to transition to the requested state. 
pub async fn instance_ensure_state( self: &Arc, - instance_id: InstanceUuid, - state: InstanceStateRequested, - ) -> Result { + propolis_id: PropolisUuid, + state: VmmStateRequested, + ) -> Result { if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref() { return Err(e.clone()); } let current = match self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await { Ok(i) => i.current().clone(), Err(_) => match state { - InstanceStateRequested::Stopped => { - return Ok(InstancePutStateResponse { - updated_runtime: None, - }); + VmmStateRequested::Stopped => { + return Ok(VmmPutStateResponse { updated_runtime: None }); } _ => { return Err(Error::invalid_request(&format!( - "instance {} not registered on sled", - instance_id, + "Propolis {} not registered on sled", + propolis_id, ))); } }, @@ -475,43 +467,41 @@ impl SledAgent { let mock_lock = self.mock_propolis.lock().await; if let Some((_srv, client)) = mock_lock.as_ref() { let body = match state { - InstanceStateRequested::MigrationTarget(_) => { + VmmStateRequested::MigrationTarget(_) => { return Err(Error::internal_error( "migration not implemented for mock Propolis", )); } - InstanceStateRequested::Running => { - let instances = self.instances.clone(); + VmmStateRequested::Running => { + let vmms = self.vmms.clone(); let log = self.log.new( o!("component" => "SledAgent-insure_instance_state"), ); tokio::spawn(async move { tokio::time::sleep(Duration::from_secs(10)).await; - match instances + match vmms .sim_ensure( - &instance_id.into_untyped_uuid(), + &propolis_id.into_untyped_uuid(), current, Some(state), ) .await { Ok(state) => { - let instance_state: nexus_client::types::SledInstanceState = state.into(); - info!(log, "sim_ensure success"; "instance_state" => #?instance_state); + let vmm_state: nexus_client::types::SledVmmState = state.into(); + info!(log, "sim_ensure success"; "vmm_state" => #?vmm_state); } Err(instance_put_error) => { error!(log, "sim_ensure failure"; "error" => #?instance_put_error); } } }); - return Ok(InstancePutStateResponse { - updated_runtime: None, - }); + return Ok(VmmPutStateResponse { updated_runtime: None }); } - InstanceStateRequested::Stopped => { + VmmStateRequested::Stopped => { propolis_client::types::InstanceStateRequested::Stop } - InstanceStateRequested::Reboot => { + VmmStateRequested::Reboot => { propolis_client::types::InstanceStateRequested::Reboot } }; @@ -521,30 +511,24 @@ impl SledAgent { } let new_state = self - .instances - .sim_ensure(&instance_id.into_untyped_uuid(), current, Some(state)) + .vmms + .sim_ensure(&propolis_id.into_untyped_uuid(), current, Some(state)) .await?; - // If this request will shut down the simulated instance, look for any - // disks that are attached to it and drive them to the Detached state. 
- if matches!(state, InstanceStateRequested::Stopped) { - self.detach_disks_from_instance(instance_id).await?; - } - - Ok(InstancePutStateResponse { updated_runtime: Some(new_state) }) + Ok(VmmPutStateResponse { updated_runtime: Some(new_state) }) } pub async fn instance_get_state( &self, - instance_id: InstanceUuid, - ) -> Result { + propolis_id: PropolisUuid, + ) -> Result { let instance = self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await .map_err(|_| { crate::sled_agent::Error::Instance( - crate::instance_manager::Error::NoSuchInstance(instance_id), + crate::instance_manager::Error::NoSuchVmm(propolis_id), ) })?; Ok(instance.current()) @@ -552,16 +536,16 @@ impl SledAgent { pub async fn instance_simulate_migration_source( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, migration: instance::SimulateMigrationSource, ) -> Result<(), HttpError> { let instance = self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .vmms + .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await .map_err(|_| { crate::sled_agent::Error::Instance( - crate::instance_manager::Error::NoSuchInstance(instance_id), + crate::instance_manager::Error::NoSuchVmm(propolis_id), ) })?; instance.set_simulated_migration_source(migration); @@ -572,25 +556,6 @@ impl SledAgent { *self.instance_ensure_state_error.lock().await = error; } - async fn detach_disks_from_instance( - &self, - instance_id: InstanceUuid, - ) -> Result<(), Error> { - self.disks - .sim_ensure_for_each_where( - |disk| match disk.current().disk_state { - DiskState::Attached(id) | DiskState::Attaching(id) => { - id == instance_id.into_untyped_uuid() - } - _ => false, - }, - &DiskStateRequested::Detached, - ) - .await?; - - Ok(()) - } - /// Idempotently ensures that the given API Disk (described by `api_disk`) /// is attached (or not) as specified. This simulates disk attach and /// detach, similar to instance boot and halt. @@ -607,16 +572,16 @@ impl SledAgent { &self.updates } - pub async fn instance_count(&self) -> usize { - self.instances.size().await + pub async fn vmm_count(&self) -> usize { + self.vmms.size().await } pub async fn disk_count(&self) -> usize { self.disks.size().await } - pub async fn instance_poke(&self, id: InstanceUuid, mode: PokeMode) { - self.instances.sim_poke(id.into_untyped_uuid(), mode).await; + pub async fn vmm_poke(&self, id: PropolisUuid, mode: PokeMode) { + self.vmms.sim_poke(id.into_untyped_uuid(), mode).await; } pub async fn disk_poke(&self, id: Uuid) { @@ -699,7 +664,7 @@ impl SledAgent { /// snapshot here. 
pub async fn instance_issue_disk_snapshot_request( &self, - _instance_id: InstanceUuid, + _propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { @@ -760,18 +725,17 @@ impl SledAgent { pub async fn instance_put_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, body_args: &InstanceExternalIpBody, ) -> Result<(), Error> { - if !self.instances.contains_key(&instance_id.into_untyped_uuid()).await - { + if !self.vmms.contains_key(&propolis_id.into_untyped_uuid()).await { return Err(Error::internal_error( - "can't alter IP state for nonexistent instance", + "can't alter IP state for VMM that's not registered", )); } let mut eips = self.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id).or_default(); // High-level behaviour: this should always succeed UNLESS // trying to add a double ephemeral. @@ -794,18 +758,17 @@ impl SledAgent { pub async fn instance_delete_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, body_args: &InstanceExternalIpBody, ) -> Result<(), Error> { - if !self.instances.contains_key(&instance_id.into_untyped_uuid()).await - { + if !self.vmms.contains_key(&propolis_id.into_untyped_uuid()).await { return Err(Error::internal_error( - "can't alter IP state for nonexistent instance", + "can't alter IP state for VMM that's not registered", )); } let mut eips = self.external_ips.lock().await; - let my_eips = eips.entry(instance_id.into_untyped_uuid()).or_default(); + let my_eips = eips.entry(propolis_id).or_default(); my_eips.remove(&body_args); diff --git a/sled-agent/src/sim/storage.rs b/sled-agent/src/sim/storage.rs index 556388ce93..ac8f80069b 100644 --- a/sled-agent/src/sim/storage.rs +++ b/sled-agent/src/sim/storage.rs @@ -24,8 +24,8 @@ use omicron_common::disk::DiskVariant; use omicron_common::disk::DisksManagementResult; use omicron_common::disk::OmicronPhysicalDisksConfig; use omicron_uuid_kinds::GenericUuid; -use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::ZpoolUuid; use propolis_client::types::VolumeConstructionRequest; use slog::Logger; @@ -869,7 +869,7 @@ impl Pantry { self.sled_agent .instance_issue_disk_snapshot_request( - InstanceUuid::new_v4(), // instance id, not used by function + PropolisUuid::new_v4(), // instance id, not used by function volume_id.parse().unwrap(), snapshot_id.parse().unwrap(), ) diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 50e5611027..d69ccedb7d 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -38,9 +38,7 @@ use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, SLED_PREFIX, }; use omicron_common::api::external::{ByteCount, ByteCountRangeError, Vni}; -use omicron_common::api::internal::nexus::{ - SledInstanceState, VmmRuntimeState, -}; +use omicron_common::api::internal::nexus::{SledVmmState, VmmRuntimeState}; use omicron_common::api::internal::shared::{ HostPortConfig, RackNetworkConfig, ResolvedVpcFirewallRule, ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, @@ -61,8 +59,7 @@ use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfig; use sled_agent_types::instance::{ InstanceExternalIpBody, InstanceHardware, InstanceMetadata, - InstancePutStateResponse, InstanceStateRequested, - InstanceUnregisterResponse, + 
VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use sled_agent_types::sled::{BaseboardId, StartSledAgentRequest}; use sled_agent_types::time_sync::TimeSync; @@ -227,7 +224,7 @@ impl From for dropshot::HttpError { } } Error::Instance( - e @ crate::instance_manager::Error::NoSuchInstance(_), + e @ crate::instance_manager::Error::NoSuchVmm(_), ) => HttpError::for_not_found( Some(NO_SUCH_INSTANCE.to_string()), e.to_string(), @@ -966,7 +963,7 @@ impl SledAgent { vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, metadata: InstanceMetadata, - ) -> Result { + ) -> Result { self.inner .instances .ensure_registered( @@ -990,11 +987,11 @@ impl SledAgent { /// rudely terminates the instance. pub async fn instance_ensure_unregistered( &self, - instance_id: InstanceUuid, - ) -> Result { + propolis_id: PropolisUuid, + ) -> Result { self.inner .instances - .ensure_unregistered(instance_id) + .ensure_unregistered(propolis_id) .await .map_err(|e| Error::Instance(e)) } @@ -1003,12 +1000,12 @@ impl SledAgent { /// state. pub async fn instance_ensure_state( &self, - instance_id: InstanceUuid, - target: InstanceStateRequested, - ) -> Result { + propolis_id: PropolisUuid, + target: VmmStateRequested, + ) -> Result { self.inner .instances - .ensure_state(instance_id, target) + .ensure_state(propolis_id, target) .await .map_err(|e| Error::Instance(e)) } @@ -1020,12 +1017,12 @@ impl SledAgent { /// does not match the current ephemeral IP. pub async fn instance_put_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, external_ip: &InstanceExternalIpBody, ) -> Result<(), Error> { self.inner .instances - .add_external_ip(instance_id, external_ip) + .add_external_ip(propolis_id, external_ip) .await .map_err(|e| Error::Instance(e)) } @@ -1034,12 +1031,12 @@ impl SledAgent { /// specified external IP address in either its ephemeral or floating IP set. pub async fn instance_delete_external_ip( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, external_ip: &InstanceExternalIpBody, ) -> Result<(), Error> { self.inner .instances - .delete_external_ip(instance_id, external_ip) + .delete_external_ip(propolis_id, external_ip) .await .map_err(|e| Error::Instance(e)) } @@ -1047,11 +1044,11 @@ impl SledAgent { /// Returns the state of the instance with the provided ID. 
pub async fn instance_get_state( &self, - instance_id: InstanceUuid, - ) -> Result { + propolis_id: PropolisUuid, + ) -> Result { self.inner .instances - .get_instance_state(instance_id) + .get_instance_state(propolis_id) .await .map_err(|e| Error::Instance(e)) } @@ -1082,19 +1079,15 @@ impl SledAgent { } /// Issue a snapshot request for a Crucible disk attached to an instance - pub async fn instance_issue_disk_snapshot_request( + pub async fn vmm_issue_disk_snapshot_request( &self, - instance_id: InstanceUuid, + propolis_id: PropolisUuid, disk_id: Uuid, snapshot_id: Uuid, ) -> Result<(), Error> { self.inner .instances - .instance_issue_disk_snapshot_request( - instance_id, - disk_id, - snapshot_id, - ) + .issue_disk_snapshot_request(propolis_id, disk_id, snapshot_id) .await .map_err(Error::from) } diff --git a/sled-agent/types/src/instance.rs b/sled-agent/types/src/instance.rs index 0753e273dc..a39fae414b 100644 --- a/sled-agent/types/src/instance.rs +++ b/sled-agent/types/src/instance.rs @@ -11,14 +11,14 @@ use std::{ use omicron_common::api::internal::{ nexus::{ - InstanceProperties, InstanceRuntimeState, SledInstanceState, - VmmRuntimeState, + InstanceProperties, InstanceRuntimeState, SledVmmState, VmmRuntimeState, }, shared::{ DhcpConfig, NetworkInterface, ResolvedVpcFirewallRule, SourceNatConfig, }, }; -use omicron_uuid_kinds::PropolisUuid; +use omicron_common::NoDebug; +use omicron_uuid_kinds::InstanceUuid; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -37,10 +37,8 @@ pub struct InstanceEnsureBody { /// The initial VMM runtime state for the VMM being registered. pub vmm_runtime: VmmRuntimeState, - /// The ID of the VMM being registered. This may not be the active VMM ID in - /// the instance runtime state (e.g. if the new VMM is going to be a - /// migration target). - pub propolis_id: PropolisUuid, + /// The ID of the instance for which this VMM is being created. + pub instance_id: InstanceUuid, /// The address at which this VMM should serve a Propolis server API. pub propolis_addr: SocketAddr, @@ -63,7 +61,7 @@ pub struct InstanceHardware { pub dhcp_config: DhcpConfig, // TODO: replace `propolis_client::*` with locally-modeled request type pub disks: Vec, - pub cloud_init_bytes: Option, + pub cloud_init_bytes: Option>, } /// Metadata used to track statistics about an instance. @@ -80,19 +78,19 @@ pub struct InstanceMetadata { /// The body of a request to move a previously-ensured instance into a specific /// runtime state. #[derive(Serialize, Deserialize, JsonSchema)] -pub struct InstancePutStateBody { +pub struct VmmPutStateBody { /// The state into which the instance should be driven. - pub state: InstanceStateRequested, + pub state: VmmStateRequested, } /// The response sent from a request to move an instance into a specific runtime /// state. #[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct InstancePutStateResponse { +pub struct VmmPutStateResponse { /// The current runtime state of the instance after handling the request to /// change its state. If the instance's state did not change, this field is /// `None`. - pub updated_runtime: Option, + pub updated_runtime: Option, } /// Requestable running state of an Instance. @@ -100,7 +98,7 @@ pub struct InstancePutStateResponse { /// A subset of [`omicron_common::api::external::InstanceState`]. 
 #[derive(Copy, Clone, Debug, Deserialize, Serialize, JsonSchema)]
 #[serde(rename_all = "snake_case", tag = "type", content = "value")]
-pub enum InstanceStateRequested {
+pub enum VmmStateRequested {
     /// Run this instance by migrating in from a previous running incarnation of
     /// the instance.
     MigrationTarget(InstanceMigrationTargetParams),
@@ -113,40 +111,40 @@ pub enum InstanceStateRequested {
     Reboot,
 }

-impl fmt::Display for InstanceStateRequested {
+impl fmt::Display for VmmStateRequested {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "{}", self.label())
     }
 }

-impl InstanceStateRequested {
+impl VmmStateRequested {
     fn label(&self) -> &str {
         match self {
-            InstanceStateRequested::MigrationTarget(_) => "migrating in",
-            InstanceStateRequested::Running => "running",
-            InstanceStateRequested::Stopped => "stopped",
-            InstanceStateRequested::Reboot => "reboot",
+            VmmStateRequested::MigrationTarget(_) => "migrating in",
+            VmmStateRequested::Running => "running",
+            VmmStateRequested::Stopped => "stopped",
+            VmmStateRequested::Reboot => "reboot",
         }
     }

     /// Returns true if the state represents a stopped Instance.
     pub fn is_stopped(&self) -> bool {
         match self {
-            InstanceStateRequested::MigrationTarget(_) => false,
-            InstanceStateRequested::Running => false,
-            InstanceStateRequested::Stopped => true,
-            InstanceStateRequested::Reboot => false,
+            VmmStateRequested::MigrationTarget(_) => false,
+            VmmStateRequested::Running => false,
+            VmmStateRequested::Stopped => true,
+            VmmStateRequested::Reboot => false,
         }
     }
 }

 /// The response sent from a request to unregister an instance.
 #[derive(Serialize, Deserialize, JsonSchema)]
-pub struct InstanceUnregisterResponse {
+pub struct VmmUnregisterResponse {
     /// The current state of the instance after handling the request to
     /// unregister it. If the instance's state did not change, this field is
     /// `None`.
-    pub updated_runtime: Option,
+    pub updated_runtime: Option,
 }

 /// Parameters used when directing Propolis to initialize itself via live
diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml
index 2e3a8fe578..f0f40d282e 100644
--- a/smf/nexus/multi-sled/config-partial.toml
+++ b/smf/nexus/multi-sled/config-partial.toml
@@ -67,6 +67,7 @@ lookup_region_port.period_secs = 60
 instance_updater.period_secs = 30
 region_snapshot_replacement_start.period_secs = 30
 region_snapshot_replacement_garbage_collection.period_secs = 30
+region_snapshot_replacement_step.period_secs = 30

 [default_region_allocation_strategy]
 # by default, allocate across 3 distinct sleds
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml
index dbd61e953d..23340b3c36 100644
--- a/smf/nexus/single-sled/config-partial.toml
+++ b/smf/nexus/single-sled/config-partial.toml
@@ -67,6 +67,7 @@ lookup_region_port.period_secs = 60
 instance_updater.period_secs = 30
 region_snapshot_replacement_start.period_secs = 30
 region_snapshot_replacement_garbage_collection.period_secs = 30
+region_snapshot_replacement_step.period_secs = 30

 [default_region_allocation_strategy]
 # by default, allocate without requirement for distinct sleds.
diff --git a/tools/console_version b/tools/console_version
index ef59f6e40c..b2fc99daf3 100644
--- a/tools/console_version
+++ b/tools/console_version
@@ -1,2 +1,2 @@
-COMMIT="8dcddcef62b8d10dfcd3adb470439212b23b3d5e"
-SHA2="30a5ecc4d7b82dfc8bbd5ea59d5d92b8414d0362425c1ce1011da8c722a8ec4c"
+COMMIT="771276573549dd255c6749980636aa7140e8bab8"
+SHA2="4d441de0784bb0d775e0a7f4067758fd6c37fbf050ed76b744cd37d6e81af3d3"
diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version
index 2d0f4d4887..a9e13c083a 100755
--- a/tools/dendrite_openapi_version
+++ b/tools/dendrite_openapi_version
@@ -1,2 +1,2 @@
-COMMIT="21b16567f28e103f145cd18d53fac6958429c4ff"
+COMMIT="76c735d472e3badaeca08982e22496fccb1ce210"
 SHA2="3a54305ab4b1270c9a5fb0603f481fce199f3767c174a03559ff642f7f44687e"
diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums
index e3d16d779c..075ead4752 100644
--- a/tools/dendrite_stub_checksums
+++ b/tools/dendrite_stub_checksums
@@ -1,3 +1,3 @@
-CIDL_SHA256_ILLUMOS="3771671f0069b33143774e560eb258db99253dba9b78fa3ca974f02a8e1145b4"
-CIDL_SHA256_LINUX_DPD="6aa070ab0590aca7458f2555012acc5571e61b3b1523de862d4bbb04b9d34135"
+CIDL_SHA256_ILLUMOS="3ee6cfe770da2855b4eb44c048637d56f8d72de45c8c396186dfe7232d8548fa"
+CIDL_SHA256_LINUX_DPD="5c70318c6feb7595bdbf41d8b33827100d28fcdf34ad738a5af10e0411463f64"
 CIDL_SHA256_LINUX_SWADM="e1e35784538a4fdd76dc257cc636ac3f43f7ef2842dabfe981f17f8ce6b8e1a2"
diff --git a/tools/opte_version b/tools/opte_version
index dfbb589f24..0e2023666f 100644
--- a/tools/opte_version
+++ b/tools/opte_version
@@ -1 +1 @@
-0.33.277
+0.33.293
diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs
index 55b4d61c9a..3f460f1e37 100644
--- a/wicketd/src/http_entrypoints.rs
+++ b/wicketd/src/http_entrypoints.rs
@@ -82,6 +82,7 @@ impl WicketdApi for WicketdApiImpl {
         config.update_with_inventory_and_bootstrap_peers(
             &inventory,
             &ctx.bootstrap_peers,
+            &ctx.log,
         );

         Ok(HttpResponseOk((&*config).into()))
@@ -101,6 +102,7 @@ impl WicketdApi for WicketdApiImpl {
         config.update_with_inventory_and_bootstrap_peers(
             &inventory,
             &ctx.bootstrap_peers,
+            &ctx.log,
         );
         config
             .update(body.into_inner(), ctx.baseboard.as_ref())
diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs
index 56e83fcd41..46ede25eaa 100644
--- a/wicketd/src/rss_config.rs
+++ b/wicketd/src/rss_config.rs
@@ -26,6 +26,7 @@ use omicron_common::api::external::AllowedSourceIps;
 use omicron_common::api::external::SwitchLocation;
 use once_cell::sync::Lazy;
 use sled_hardware_types::Baseboard;
+use slog::debug;
 use slog::warn;
 use std::collections::btree_map;
 use std::collections::BTreeMap;
@@ -115,6 +116,7 @@ impl CurrentRssConfig {
         &mut self,
         inventory: &RackV1Inventory,
         bootstrap_peers: &BootstrapPeers,
+        log: &slog::Logger,
     ) {
         let bootstrap_sleds = bootstrap_peers.sleds();

@@ -126,7 +128,15 @@ impl CurrentRssConfig {
                 return None;
             }

-            let state = sp.state.as_ref()?;
+            let Some(state) = sp.state.as_ref() else {
+                debug!(
+                    log,
+                    "in update_with_inventory_and_bootstrap_peers, \
+                     filtering out SP with no state";
+                    "sp" => ?sp,
+                );
+                return None;
+            };
             let baseboard = Baseboard::new_gimlet(
                 state.serial_number.clone(),
                 state.model.clone(),
diff --git a/wicketd/tests/integration_tests/inventory.rs b/wicketd/tests/integration_tests/inventory.rs
index ed5ad22d5d..c7057e3adc 100644
--- a/wicketd/tests/integration_tests/inventory.rs
+++ b/wicketd/tests/integration_tests/inventory.rs
@@ -10,6 +10,7 @@ use super::setup::WicketdTestContext;
 use gateway_messages::SpPort;
 use gateway_test_utils::setup as gateway_setup;
 use sled_hardware_types::Baseboard;
+use slog::{info, warn};
 use wicket::OutputKind;
 use wicket_common::inventory::{SpIdentifier, SpType};
 use wicket_common::rack_setup::BootstrapSledDescription;
@@ -32,13 +33,29 @@ async fn test_inventory() {
             .into_inner();
         match response {
             GetInventoryResponse::Response { inventory, .. } => {
-                break inventory
-            }
-            GetInventoryResponse::Unavailable => {
-                // Keep polling wicketd until it receives its first results from MGS.
-                tokio::time::sleep(Duration::from_millis(100)).await;
+                // Ensure that the SP state is populated -- if it's not,
+                // then the `configured-bootstrap-sleds` command below
+                // might return an empty list.
+                let sp_state_none: Vec<_> = inventory
+                    .sps
+                    .iter()
+                    .filter(|sp| sp.state.is_none())
+                    .collect();
+                if sp_state_none.is_empty() {
+                    break inventory;
+                }
+
+                warn!(
+                    wicketd_testctx.log(),
+                    "SP state not yet populated for some SPs, retrying";
+                    "sps" => ?sp_state_none
+                )
             }
+            GetInventoryResponse::Unavailable => {}
         }
+
+        // Keep polling wicketd until it receives its first results from MGS.
+        tokio::time::sleep(Duration::from_millis(100)).await;
     }
 };
 let inventory =
@@ -46,6 +63,8 @@ async fn test_inventory() {
         .await
         .expect("get_inventory completed within 10 seconds");

+    info!(wicketd_testctx.log(), "inventory returned"; "inventory" => ?inventory);
+
     // 4 SPs attached to the inventory.
     assert_eq!(inventory.sps.len(), 4);

@@ -70,17 +89,17 @@ async fn test_inventory() {
         serde_json::from_slice(&stdout).expect("stdout is valid JSON");

     // This only tests the case that we get sleds back with no current
-    // bootstrap IP. This does provide svalue: it check that the command
-    // exists, accesses data within wicket, and returns it in the schema we
-    // expect. But it does not test the case where a sled does have a
-    // bootstrap IP.
+    // bootstrap IP. This does provide some value: it checks that the
+    // command exists, accesses data within wicket, and returns it in the
+    // schema we expect. But it does not test the case where a sled does
+    // have a bootstrap IP.
     //
     // Unfortunately, that's a difficult thing to test today. Wicket gets
     // that information by enumerating the IPs on the bootstrap network and
    // reaching out to the bootstrap_agent on them directly to ask them who
     // they are. Our testing setup does not have a way to provide such an
     // IP, or run a bootstrap_agent on an IP to respond. We should update
-    // this test when we do have that capabilitiy.
+    // this test when we do have that capability.
     assert_eq!(
         response,
         vec![
diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml
index a39daa5735..746a0bd3ab 100644
--- a/workspace-hack/Cargo.toml
+++ b/workspace-hack/Cargo.toml
@@ -102,15 +102,14 @@ sha2 = { version = "0.10.8", features = ["oid"] }
 similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] }
 slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] }
 smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] }
-socket2 = { version = "0.5.7", default-features = false, features = ["all"] }
 spin = { version = "0.9.8" }
 string_cache = { version = "0.8.7" }
 subtle = { version = "2.5.0" }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] }
-tokio = { version = "1.38.1", features = ["full", "test-util"] }
+tokio = { version = "1.39.3", features = ["full", "test-util"] }
 tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] }
-tokio-stream = { version = "0.1.15", features = ["net"] }
+tokio-stream = { version = "0.1.15", features = ["net", "sync"] }
 tokio-util = { version = "0.7.11", features = ["codec", "io-util"] }
 toml = { version = "0.7.8" }
 toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] }
@@ -211,7 +210,6 @@ sha2 = { version = "0.10.8", features = ["oid"] }
 similar = { version = "2.6.0", features = ["bytes", "inline", "unicode"] }
 slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug", "release_max_level_trace"] }
 smallvec = { version = "1.13.2", default-features = false, features = ["const_new"] }
-socket2 = { version = "0.5.7", default-features = false, features = ["all"] }
 spin = { version = "0.9.8" }
 string_cache = { version = "0.8.7" }
 subtle = { version = "2.5.0" }
@@ -219,9 +217,9 @@ syn-dff4ba8e3ae991db = { package = "syn", version = "1.0.109", features = ["extr
 syn-f595c2ba2a3f28df = { package = "syn", version = "2.0.74", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time = { version = "0.3.36", features = ["formatting", "local-offset", "macros", "parsing"] }
 time-macros = { version = "0.2.18", default-features = false, features = ["formatting", "parsing"] }
-tokio = { version = "1.38.1", features = ["full", "test-util"] }
+tokio = { version = "1.39.3", features = ["full", "test-util"] }
 tokio-postgres = { version = "0.7.11", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] }
-tokio-stream = { version = "0.1.15", features = ["net"] }
+tokio-stream = { version = "0.1.15", features = ["net", "sync"] }
 tokio-util = { version = "0.7.11", features = ["codec", "io-util"] }
 toml = { version = "0.7.8" }
 toml_datetime = { version = "0.6.8", default-features = false, features = ["serde"] }
@@ -239,7 +237,7 @@ zeroize = { version = "1.7.0", features = ["std", "zeroize_derive"] }
 [target.x86_64-unknown-linux-gnu.dependencies]
 dof = { version = "0.3.0", default-features = false, features = ["des"] }
 linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] }
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }
@@ -248,35 +246,35 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su
 [target.x86_64-unknown-linux-gnu.build-dependencies]
 dof = { version = "0.3.0", default-features = false, features = ["des"] }
 linux-raw-sys = { version = "0.4.13", default-features = false, features = ["elf", "errno", "general", "ioctl", "no_std", "std", "system"] }
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }
 signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] }

 [target.x86_64-apple-darwin.dependencies]
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }
 signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] }

 [target.x86_64-apple-darwin.build-dependencies]
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }
 signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] }

 [target.aarch64-apple-darwin.dependencies]
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }
 signal-hook-mio = { version = "0.2.4", default-features = false, features = ["support-v0_8", "support-v1_0"] }

 [target.aarch64-apple-darwin.build-dependencies]
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }
@@ -284,7 +282,7 @@ signal-hook-mio = { version = "0.2.4", default-features = false, features = ["su

 [target.x86_64-unknown-illumos.dependencies]
 dof = { version = "0.3.0", default-features = false, features = ["des"] }
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }
@@ -293,7 +291,7 @@ toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19.15", featu

 [target.x86_64-unknown-illumos.build-dependencies]
 dof = { version = "0.3.0", default-features = false, features = ["des"] }
-mio = { version = "0.8.11", features = ["net", "os-ext"] }
+mio = { version = "1.0.2", features = ["net", "os-ext"] }
 nix = { version = "0.28.0", features = ["feature", "fs", "ioctl", "poll", "signal", "term", "uio"] }
 once_cell = { version = "1.19.0" }
 rustix = { version = "0.38.34", features = ["fs", "stdio", "system", "termios"] }