From b4b7d7aa8eab976b0b995f4db0c971914583462b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 12 Apr 2024 13:20:16 -0700 Subject: [PATCH 01/69] initial plumbing for looking up VMMs by sled agent --- nexus/db-model/src/schema.rs | 2 + nexus/db-queries/src/db/datastore/vmm.rs | 43 ++++++++++++++++++++++ nexus/src/app/background/instance_state.rs | 32 ++++++++++++++++ nexus/src/app/background/mod.rs | 1 + 4 files changed, 78 insertions(+) create mode 100644 nexus/src/app/background/instance_state.rs diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index fa03aca4fb..a684b097c9 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1716,3 +1716,5 @@ allow_tables_to_appear_in_same_query!(volume, virtual_provisioning_resource); allow_tables_to_appear_in_same_query!(ssh_key, instance_ssh_key, instance); joinable!(instance_ssh_key -> ssh_key (ssh_key_id)); joinable!(instance_ssh_key -> instance (instance_id)); + +allow_tables_to_appear_in_same_query!(inv_sled_agent, vmm); diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index a837d1289b..d110045b96 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -9,6 +9,7 @@ use crate::authz; use crate::context::OpContext; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; +use crate::db::model::InvSledAgent; use crate::db::model::Vmm; use crate::db::model::VmmRuntimeState; use crate::db::schema::vmm::dsl; @@ -19,6 +20,7 @@ use chrono::Utc; use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; +use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; @@ -164,4 +166,45 @@ impl DataStore { Ok(vmm) } + + pub async fn vmm_list_by_sled_agent( + &self, + opctx: &OpContext, + ) -> ListResultVec<(InvSledAgent, Vmm)> { + // TODO(eliza): should probably paginate this? + use crate::db::schema::inv_sled_agent; + opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; + let conn = self.pool_connection_authorized(opctx).await?; + + // Get the latest inventory collection ID. + let collection_id = { + use crate::db::schema::inv_collection::dsl; + dsl::inv_collection + .select(dsl::id) + .order_by(dsl::time_started.desc()) + .first_async::(&*conn) + .await + .optional() + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })? + }; + let Some(collection_id) = collection_id else { + return Ok(Vec::new()); + }; + + let result = inv_sled_agent::dsl::inv_sled_agent + // Only list sled agents from the latest collection. + .filter(inv_sled_agent::dsl::inv_collection_id.eq(collection_id)) + .inner_join( + dsl::vmm.on(dsl::sled_id.eq(inv_sled_agent::dsl::sled_id)), + ) + .filter(dsl::time_deleted.is_null()) + .select((InvSledAgent::as_select(), Vmm::as_select())) + .load_async::<(InvSledAgent, Vmm)>(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(result) + } } diff --git a/nexus/src/app/background/instance_state.rs b/nexus/src/app/background/instance_state.rs new file mode 100644 index 0000000000..89ab0378ed --- /dev/null +++ b/nexus/src/app/background/instance_state.rs @@ -0,0 +1,32 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. 
If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for pulling instance state from sled-agents. + +use super::common::BackgroundTask; +use futures::{future::BoxFuture, FutureExt}; +use nexus_db_queries::{context::OpContext, db::DataStore}; +use serde::Serialize; +use serde_json::json; +use std::sync::Arc; + +/// Background task that periodically checks instance states. +pub struct InstanceWatcher { + datastore: Arc, +} + +impl BackgroundTask for InstanceWatcher { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + let sleds_and_vmms = + self.datastore.vmm_list_by_sled_agent(opctx).await; + + todo!() + } + .boxed() + } +} diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 0e3b162404..56d2acb375 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -13,6 +13,7 @@ mod dns_propagation; mod dns_servers; mod external_endpoints; mod init; +mod instance_state; mod inventory_collection; mod metrics_producer_gc; mod nat_cleanup; From d011d6ab44afa4b612e742eddb490a97ea49e03c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 12 Apr 2024 15:08:08 -0700 Subject: [PATCH 02/69] stub sled agent plumbing --- sled-agent/src/http_entrypoints.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index b457047ad6..4f356db872 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -1004,3 +1004,15 @@ async fn bootstore_status( .into(); Ok(HttpResponseOk(status)) } + +/// Get the status of a VMM +#[endpoint { + method = GET, + path = "/vmm/{id}/sled-instance-state" +}] +async fn vmm_get_sled_instance_state( + request_context: RequestContext, +) -> Result, HttpError> { + let sa = request_context.context(); + todo!("eliza") +} From 8e5f758a8ca88d49d5bedce5b2a2995666e3b5c7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 17 Apr 2024 14:33:46 -0700 Subject: [PATCH 03/69] more wip --- sled-agent/src/http_entrypoints.rs | 21 +++++++++++++++++++++ sled-agent/src/sled_agent.rs | 8 ++++++++ 2 files changed, 29 insertions(+) diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 4f356db872..4c4d75aa8a 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -473,6 +473,26 @@ async fn instance_put_state( )) } +#[endpoint { + method = GET, + path = "/instances/{instance_id}/state", +}] +async fn instance_get_state( + rqctx: RequestContext, + path_params: Path, +) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + Ok(HttpResponseOk( + sa.instance_get_state(instance_id).await?.ok_or_else(|| { + HttpError::for_not_found( + None, + format!("instance {} not found", instance_id), + ) + })? 
+ )) +} + #[endpoint { method = PUT, path = "/instances/{instance_id}/migration-ids", @@ -1014,5 +1034,6 @@ async fn vmm_get_sled_instance_state( request_context: RequestContext, ) -> Result, HttpError> { let sa = request_context.context(); + let id = reques todo!("eliza") } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 4216cd4b6a..cf4bfd727a 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -982,6 +982,14 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + /// Returns the state of the instance with the provided ID. + pub async fn instance_get_state( + &self, + instance_id: Uuid, + ) -> Result, ()> { + todo!("eliza do this") + } + /// Idempotently ensures that the given virtual disk is attached (or not) as /// specified. /// From c4f9654056a1fc13f5d72ae9e865c866a5bc4975 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Apr 2024 12:10:30 -0700 Subject: [PATCH 04/69] wip --- nexus/db-model/src/lib.rs | 2 +- nexus/db-model/src/schema.rs | 2 +- nexus/db-model/src/typed_uuid.rs | 7 +-- .../db-queries/src/db/datastore/inventory.rs | 35 +++++++------ .../src/db/datastore/sled_instance.rs | 38 ++++++++++++++ nexus/db-queries/src/db/datastore/vmm.rs | 42 ---------------- nexus/src/app/background/instance_state.rs | 50 +++++++++++++++++-- sled-agent/src/http_entrypoints.rs | 23 +-------- sled-agent/src/instance.rs | 4 +- sled-agent/src/instance_manager.rs | 41 +++++++++++++++ sled-agent/src/sled_agent.rs | 8 ++- uuid-kinds/src/lib.rs | 1 + 12 files changed, 160 insertions(+), 93 deletions(-) diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 6495a0c960..cda22f42f8 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -182,7 +182,7 @@ pub use switch::*; pub use switch_interface::*; pub use switch_port::*; pub use tuf_repo::*; -pub use typed_uuid::to_db_typed_uuid; +pub use typed_uuid::*; pub use upstairs_repair::*; pub use user_builtin::*; pub use utilization::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index a684b097c9..2173ff556e 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1717,4 +1717,4 @@ allow_tables_to_appear_in_same_query!(ssh_key, instance_ssh_key, instance); joinable!(instance_ssh_key -> ssh_key (ssh_key_id)); joinable!(instance_ssh_key -> instance (instance_id)); -allow_tables_to_appear_in_same_query!(inv_sled_agent, vmm); +allow_tables_to_appear_in_same_query!(inv_sled_agent, sled_instance); diff --git a/nexus/db-model/src/typed_uuid.rs b/nexus/db-model/src/typed_uuid.rs index 1e54e242f3..7a172703c7 100644 --- a/nexus/db-model/src/typed_uuid.rs +++ b/nexus/db-model/src/typed_uuid.rs @@ -19,18 +19,13 @@ use uuid::Uuid; /// Returns the corresponding `DbTypedUuid` for this `TypedUuid`. /// /// Code external to the `db-model` crate sometimes needs a way to convert a -/// `TypedUuid` to a `DbTypedUuid`. We don't want `DbTypedUuid` to be used -/// anywhere, so we don't make it public. Instead, we expose this function. +/// `TypedUuid` to a `DbTypedUuid`. #[inline] pub fn to_db_typed_uuid(id: TypedUuid) -> DbTypedUuid { DbTypedUuid(id) } /// A UUID with information about the kind of type it is. -/// -/// Despite the fact that this is marked `pub`, this is *private* to the -/// `db-model` crate (this type is not exported at the top level). External -/// users must use omicron-common's `TypedUuid`. 
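// `DbTypedUuid` is now re-exported from `lib.rs` (`pub use typed_uuid::*`),
// so code outside the crate can name it directly; the datastore's
// `sled_instance_list_by_sled_agent` relies on that for its
// `DataPageParams<'_, DbTypedUuid<SledKind>>` pagination parameter.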
#[derive_where(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] #[derive(AsExpression, FromSqlRow, Serialize, Deserialize, JsonSchema)] #[diesel(sql_type = sql_types::Uuid)] diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 6faa8ea251..bba96c76f9 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -1312,28 +1312,33 @@ impl DataStore { &self, opctx: &OpContext, ) -> Result, Error> { + let Some(id) = self.inventory_get_latest_collection_id(opctx).await? + else { + return Ok(None); + }; + Ok(Some(self.inventory_collection_read(opctx, id).await?)) + } + + /// Returns the ID of the latest collection, if any collections exist. + /// + /// If there aren't any collections, returns `Ok(None)`. + pub async fn inventory_get_latest_collection_id( + &self, + opctx: &OpContext, + ) -> Result, Error> { + use db::schema::inv_collection::dsl; + opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; let conn = self.pool_connection_authorized(opctx).await?; - use db::schema::inv_collection::dsl; - let collection_id = dsl::inv_collection + let id = dsl::inv_collection .select(dsl::id) .order_by(dsl::time_started.desc()) .first_async::(&*conn) .await .optional() - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - let Some(collection_id) = collection_id else { - return Ok(None); - }; - - Ok(Some( - self.inventory_collection_read( - opctx, - CollectionUuid::from_untyped_uuid(collection_id), - ) - .await?, - )) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? + .map(CollectionUuid::from_untyped_uuid); + Ok(id) } /// Attempt to read the current collection diff --git a/nexus/db-queries/src/db/datastore/sled_instance.rs b/nexus/db-queries/src/db/datastore/sled_instance.rs index dbdd696d70..f97f43569a 100644 --- a/nexus/db-queries/src/db/datastore/sled_instance.rs +++ b/nexus/db-queries/src/db/datastore/sled_instance.rs @@ -5,12 +5,17 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; +use crate::db::model::InvSledAgent; use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use nexus_db_model::DbTypedUuid; use nexus_db_model::SledInstance; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::ListResultVec; +use omicron_uuid_kinds::CollectionUuid; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::SledKind; use uuid::Uuid; impl DataStore { @@ -31,4 +36,37 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + pub async fn sled_instance_list_by_sled_agent( + &self, + opctx: &OpContext, + collection: CollectionUuid, + pagparams: &DataPageParams<'_, DbTypedUuid>, + ) -> ListResultVec<(InvSledAgent, SledInstance)> { + // TODO(eliza): should probably paginate this? + use crate::db::schema::{inv_sled_agent, sled_instance::dsl}; + opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; + let conn = self.pool_connection_authorized(opctx).await?; + + let result = paginated( + inv_sled_agent::dsl::inv_sled_agent, + inv_sled_agent::dsl::sled_id, + &pagparams, + ) + // Only list sled agents from the latest collection. 
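        // Shape of the query: page through `inv_sled_agent` rows ordered by
        // sled ID, keep only the rows for the requested collection, join each
        // one against `sled_instance` on that sled ID, and return
        // (sled agent, instance) pairs. Keying pagination on the sled ID means
        // instances on the same sled come back contiguously, which is what
        // lets the caller reuse one sled-agent client per sled.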
+ .filter( + inv_sled_agent::dsl::inv_collection_id + .eq(collection.into_untyped_uuid()), + ) + .inner_join( + dsl::sled_instance + .on(dsl::active_sled_id.eq(inv_sled_agent::dsl::sled_id)), + ) + .select((InvSledAgent::as_select(), SledInstance::as_select())) + .load_async::<(InvSledAgent, SledInstance)>(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(result) + } } diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index d110045b96..d1a2446356 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -9,7 +9,6 @@ use crate::authz; use crate::context::OpContext; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::model::InvSledAgent; use crate::db::model::Vmm; use crate::db::model::VmmRuntimeState; use crate::db::schema::vmm::dsl; @@ -166,45 +165,4 @@ impl DataStore { Ok(vmm) } - - pub async fn vmm_list_by_sled_agent( - &self, - opctx: &OpContext, - ) -> ListResultVec<(InvSledAgent, Vmm)> { - // TODO(eliza): should probably paginate this? - use crate::db::schema::inv_sled_agent; - opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; - let conn = self.pool_connection_authorized(opctx).await?; - - // Get the latest inventory collection ID. - let collection_id = { - use crate::db::schema::inv_collection::dsl; - dsl::inv_collection - .select(dsl::id) - .order_by(dsl::time_started.desc()) - .first_async::(&*conn) - .await - .optional() - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })? - }; - let Some(collection_id) = collection_id else { - return Ok(Vec::new()); - }; - - let result = inv_sled_agent::dsl::inv_sled_agent - // Only list sled agents from the latest collection. - .filter(inv_sled_agent::dsl::inv_collection_id.eq(collection_id)) - .inner_join( - dsl::vmm.on(dsl::sled_id.eq(inv_sled_agent::dsl::sled_id)), - ) - .filter(dsl::time_deleted.is_null()) - .select((InvSledAgent::as_select(), Vmm::as_select())) - .load_async::<(InvSledAgent, Vmm)>(&*conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - Ok(result) - } } diff --git a/nexus/src/app/background/instance_state.rs b/nexus/src/app/background/instance_state.rs index 89ab0378ed..d0763166f2 100644 --- a/nexus/src/app/background/instance_state.rs +++ b/nexus/src/app/background/instance_state.rs @@ -6,9 +6,12 @@ use super::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; -use nexus_db_queries::{context::OpContext, db::DataStore}; +use nexus_db_queries::{ + context::OpContext, db::pagination::Paginator, db::DataStore, +}; use serde::Serialize; use serde_json::json; +use std::num::NonZeroU32; use std::sync::Arc; /// Background task that periodically checks instance states. @@ -16,14 +19,55 @@ pub struct InstanceWatcher { datastore: Arc, } +const MAX_SLED_AGENTS: NonZeroU32 = unsafe { + // Safety: last time I checked, 100 was greater than zero. 
+ NonZeroU32::new_unchecked(100) +}; + impl BackgroundTask for InstanceWatcher { fn activate<'a>( &'a mut self, opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async { - let sleds_and_vmms = - self.datastore.vmm_list_by_sled_agent(opctx).await; + let latest_collection = { + let maybe_id = self + .datastore + .inventory_get_latest_collection_id(opctx) + .await; + match maybe_id { + Ok(Some(collection)) => collection, + Ok(None) => { + slog::debug!(opctx.log, "no inventory collection exists, not querying sled agents."); + return json!({}); + } + Err(e) => { + slog::warn!(opctx.log, "failed to get latest collection ID: {e}"); + return json!({}); + } + } + }; + + let mut requests = tokio::task::JoinSet::new(); + let mut paginator = Paginator::new(MAX_SLED_AGENTS); + while let Some(p) = paginator.next() { + let maybe_batch = self.datastore.sled_instance_list_by_sled_agent( + opctx, + latest_collection, + &p.current_pagparams(), + ).await; + let batch = match maybe_batch { + Ok(batch) => batch, + Err(e) => { + slog::warn!(opctx.log, "sled instances by sled agent query failed: {e}"); + break; + } + }; + paginator = p.found_batch(&batch, &|(sled_agent, _)| *sled_agent.sled_id); + for (sled_agent, sled_instance) in batch { + todo!() + } + } todo!() } diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 4c4d75aa8a..99c7725fe3 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -51,6 +51,7 @@ pub fn api() -> SledApiDescription { api.register(instance_issue_disk_snapshot_request)?; api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; + api.register(instance_get_state)?; api.register(instance_put_external_ip)?; api.register(instance_delete_external_ip)?; api.register(instance_register)?; @@ -483,14 +484,7 @@ async fn instance_get_state( ) -> Result, HttpError> { let sa = rqctx.context(); let instance_id = path_params.into_inner().instance_id; - Ok(HttpResponseOk( - sa.instance_get_state(instance_id).await?.ok_or_else(|| { - HttpError::for_not_found( - None, - format!("instance {} not found", instance_id), - ) - })? 
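// (The not-found case is now reported by the instance manager itself, as
// `Error::NoSuchInstance`, and flows back through the usual sled-agent error
// conversion rather than being mapped to an HTTP error by hand here.)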
- )) + Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) } #[endpoint { @@ -1024,16 +1018,3 @@ async fn bootstore_status( .into(); Ok(HttpResponseOk(status)) } - -/// Get the status of a VMM -#[endpoint { - method = GET, - path = "/vmm/{id}/sled-instance-state" -}] -async fn vmm_get_sled_instance_state( - request_context: RequestContext, -) -> Result, HttpError> { - let sa = request_context.context(); - let id = reques - todo!("eliza") -} diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 94ad8522c7..271eceb556 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -405,7 +405,7 @@ impl InstanceRunner { .map_err(|_| Error::FailedSendClientClosed) }, Some(CurrentState{ tx }) => { - tx.send(self.current_state().await) + tx.send(self.current_state()) .map_err(|_| Error::FailedSendClientClosed) }, Some(PutState{ state, tx }) => { @@ -1176,7 +1176,7 @@ impl InstanceRunner { } } - async fn current_state(&self) -> SledInstanceState { + fn current_state(&self) -> SledInstanceState { self.state.sled_instance_state() } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index cf6563b117..ee1425f0d7 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -310,6 +310,19 @@ impl InstanceManager { pub fn reservoir_size(&self) -> ByteCount { self.inner.vmm_reservoir_manager.reservoir_size() } + + pub async fn get_instance_state( + &self, + instance_id: Uuid, + ) -> Result { + let (tx, rx) = oneshot::channel(); + self.inner + .tx + .send(InstanceManagerRequest::GetState { instance_id, tx }) + .await + .map_err(|_| Error::FailedSendInstanceManagerClosed)?; + rx.await? + } } // Most requests that can be sent to the "InstanceManagerRunner" task. @@ -365,6 +378,10 @@ enum InstanceManagerRequest { ip: InstanceExternalIpBody, tx: oneshot::Sender>, }, + GetState { + instance_id: Uuid, + tx: oneshot::Sender>, + }, } // Requests that the instance manager stop processing information about a @@ -467,6 +484,14 @@ impl InstanceManagerRunner { Some(InstanceDeleteExternalIp { instance_id, ip, tx }) => { self.delete_external_ip(tx, instance_id, &ip).await }, + Some(GetState { instance_id, tx }) => { + // TODO(eliza): it could potentially be nice to + // refactor this to use `tokio::sync::watch`, rather + // than having to force `GetState` requests to + // serialize with the requests that actually update + // the state... + self.get_instance_state(tx, instance_id).await + }, None => { warn!(self.log, "InstanceManager's request channel closed; shutting down"); break; @@ -732,6 +757,22 @@ impl InstanceManagerRunner { instance.delete_external_ip(tx, ip).await?; Ok(()) } + + async fn get_instance_state( + &self, + tx: oneshot::Sender>, + instance_id: Uuid, + ) -> Result<(), Error> { + let Some(instance) = self.get_instance(instance_id) else { + return tx + .send(Err(Error::NoSuchInstance(instance_id))) + .map_err(|_| Error::FailedSendClientClosed); + }; + + let state = instance.current_state().await?; + tx.send(Ok(state)).map_err(|_| Error::FailedSendClientClosed)?; + Ok(()) + } } /// Represents membership of an instance in the [`InstanceManager`]. 
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index cf4bfd727a..d987c6fa1b 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -986,8 +986,12 @@ impl SledAgent { pub async fn instance_get_state( &self, instance_id: Uuid, - ) -> Result, ()> { - todo!("eliza do this") + ) -> Result { + self.inner + .instances + .get_instance_state(instance_id) + .await + .map_err(|e| Error::Instance(e)) } /// Idempotently ensures that the given virtual disk is attached (or not) as diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 41d1bfc1f6..489e0da365 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -53,6 +53,7 @@ impl_typed_uuid_kind! { Downstairs => "downstairs", DownstairsRegion => "downstairs_region", ExternalIp => "external_ip", + Instance => "instance", LoopbackAddress => "loopback_address", OmicronZone => "service", PhysicalDisk => "physical_disk", From a36b8774009da06d35b625f1ffd90c2b04a9cb88 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Apr 2024 12:43:03 -0700 Subject: [PATCH 05/69] wip --- nexus/db-model/src/sled_instance.rs | 6 ++ .../src/db/datastore/sled_instance.rs | 2 +- nexus/src/app/background/instance_state.rs | 73 ++++++++++++++++++- openapi/sled-agent.json | 32 ++++++++ 4 files changed, 109 insertions(+), 4 deletions(-) diff --git a/nexus/db-model/src/sled_instance.rs b/nexus/db-model/src/sled_instance.rs index e3a901264d..bbc92ddf18 100644 --- a/nexus/db-model/src/sled_instance.rs +++ b/nexus/db-model/src/sled_instance.rs @@ -41,3 +41,9 @@ impl From for views::SledInstance { } } } + +impl SledInstance { + pub fn instance_id(&self) -> Uuid { + self.identity.id + } +} diff --git a/nexus/db-queries/src/db/datastore/sled_instance.rs b/nexus/db-queries/src/db/datastore/sled_instance.rs index f97f43569a..58b227b357 100644 --- a/nexus/db-queries/src/db/datastore/sled_instance.rs +++ b/nexus/db-queries/src/db/datastore/sled_instance.rs @@ -51,7 +51,7 @@ impl DataStore { let result = paginated( inv_sled_agent::dsl::inv_sled_agent, inv_sled_agent::dsl::sled_id, - &pagparams, + pagparams, ) // Only list sled agents from the latest collection. .filter( diff --git a/nexus/src/app/background/instance_state.rs b/nexus/src/app/background/instance_state.rs index d0763166f2..9514110ad6 100644 --- a/nexus/src/app/background/instance_state.rs +++ b/nexus/src/app/background/instance_state.rs @@ -6,11 +6,13 @@ use super::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; +use nexus_db_model::{InvSledAgent, SledInstance}; use nexus_db_queries::{ context::OpContext, db::pagination::Paginator, db::DataStore, }; use serde::Serialize; use serde_json::json; +use sled_agent_client::{types::SledInstanceState, Client as SledAgentClient}; use std::num::NonZeroU32; use std::sync::Arc; @@ -63,14 +65,79 @@ impl BackgroundTask for InstanceWatcher { break; } }; - paginator = p.found_batch(&batch, &|(sled_agent, _)| *sled_agent.sled_id); - for (sled_agent, sled_instance) in batch { - todo!() + paginator = p.found_batch(&batch, &|(sled_agent, _)| sled_agent.sled_id); + let mut batch = batch.into_iter(); + + if let Some((mut curr_sled_agent, sled_instance)) = batch.next() { + let mut client = mk_sled_agent_client(&opctx.log, &curr_sled_agent); + + for (sled_agent, sled_instance) in batch { + // We're now talking to a new sled agent; update the client. 
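                        // The batch is ordered by sled ID (that's the
                        // pagination key), so instances on the same sled are
                        // adjacent; the client only needs to be rebuilt when
                        // the sled ID actually changes.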
+ if sled_agent.sled_id != curr_sled_agent.sled_id { + client = mk_sled_agent_client(&opctx.log, &sled_agent); + curr_sled_agent = sled_agent; + } + spawn_get_state(&client, &mut requests, sled_instance); + } } } + // All requests fired off, let's wait for them to come back. + while let Some(result) = requests.join_next().await { + let (instance, state) = match result { + Err(_) => unreachable!( + "a `JoinError` is returned if a spawned task \ + panics, or if the task is aborted. we never abort \ + tasks on this `JoinSet`, and nexus is compiled with \ + `panic=\"abort\"`, so neither of these cases should \ + ever occur." + ), + Ok(Ok(rsp)) => rsp, + Ok(Err(e)) => { + // Here is where it gets interesting. This is where we + // might learn that the sled-agent we were trying to + // talk to is dead. + todo!("eliza: implement the interesting parts!"); + } + }; + } + todo!() } .boxed() } } + +type ClientError = sled_agent_client::Error; + +fn spawn_get_state( + client: &SledAgentClient, + tasks: &mut tokio::task::JoinSet< + Result<(SledInstance, SledInstanceState), ClientError>, + >, + instance: SledInstance, +) { + let client = client.clone(); + tasks.spawn(async move { + let state = client + .instance_get_state(&instance.instance_id()) + .await? + .into_inner(); + Ok((instance, state)) + }); +} + +fn mk_sled_agent_client( + log: &slog::Logger, + InvSledAgent { + ref sled_id, ref sled_agent_ip, ref sled_agent_port, .. + }: &InvSledAgent, +) -> SledAgentClient { + // Ipv6Addr's `fmt::Debug` impl is the same as its Display impl, so we + // should get the RFC 5952 textual representation here even though the DB + // `Ipv6Addr` type doesn't expose `Display`. + let url = format!("http://{sled_agent_ip:?}:{sled_agent_port}"); + let log = + log.new(o!("sled_id" => sled_id.to_string(), "url" => url.clone())); + SledAgentClient::new(&url, log) +} diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 0875d0d53a..c12217381e 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -468,6 +468,38 @@ } }, "/instances/{instance_id}/state": { + "get": { + "operationId": "instance_get_state", + "parameters": [ + { + "in": "path", + "name": "instance_id", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SledInstanceState" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, "put": { "operationId": "instance_put_state", "parameters": [ From 7422676ee34a64713217491c2b83b2e21ba002a4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 10:20:53 -0700 Subject: [PATCH 06/69] wip --- nexus/src/app/background/instance_state.rs | 6 +- nexus/src/app/instance_network.rs | 119 +++++++++++++++++++++ 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/instance_state.rs b/nexus/src/app/background/instance_state.rs index 9514110ad6..2149167334 100644 --- a/nexus/src/app/background/instance_state.rs +++ b/nexus/src/app/background/instance_state.rs @@ -84,7 +84,7 @@ impl BackgroundTask for InstanceWatcher { // All requests fired off, let's wait for them to come back. 
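            // `JoinSet::join_next` yields results in completion order rather
            // than spawn order, so one slow sled-agent doesn't block handling
            // the responses that have already come back.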
while let Some(result) = requests.join_next().await { - let (instance, state) = match result { + match result { Err(_) => unreachable!( "a `JoinError` is returned if a spawned task \ panics, or if the task is aborted. we never abort \ @@ -92,7 +92,9 @@ impl BackgroundTask for InstanceWatcher { `panic=\"abort\"`, so neither of these cases should \ ever occur." ), - Ok(Ok(rsp)) => rsp, + Ok(Ok(rsp)) => { + todo!("eliza"); + } Ok(Err(e)) => { // Here is where it gets interesting. This is where we // might learn that the sled-agent we were trying to diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 2c258c8064..89b4dd2626 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -1314,3 +1314,122 @@ async fn ensure_nat_entry( } } } + +/// Ensure that the necessary v2p mappings for an instance are deleted +pub(crate) async fn delete_instance_v2p_mappings( + datastore: &DataStore, + log: &slog::Logger, + opctx: &OpContext, + opctx_alloc: &OpContext, + instance_id: Uuid, +) -> Result<(), Error> { + // For every sled that isn't the sled this instance was allocated to, delete + // the virtual to physical mapping for each of this instance's NICs. If + // there isn't a V2P mapping, del_v2p should be a no-op. + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance_id) + .lookup_for(authz::Action::Read) + .await?; + + let instance_nics = datastore + .derive_guest_network_interface_info(&opctx, &authz_instance) + .await?; + + let mut last_sled_id: Option = None; + + loop { + let pagparams = DataPageParams { + marker: last_sled_id.as_ref(), + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new(10).unwrap(), + }; + + let sleds_page = datastore.sled_list(&opctx_alloc, &pagparams).await?; + let mut join_handles = + Vec::with_capacity(sleds_page.len() * instance_nics.len()); + + for sled in &sleds_page { + for nic in &instance_nics { + let client = nexus_networking::sled_client( + &datastore, + &opctx_alloc, + sled.id(), + &log, + ) + .await?; + let nic_id = nic.id; + let mapping = DeleteVirtualNetworkInterfaceHost { + virtual_ip: nic.ip, + vni: nic.vni, + }; + + let log = log.clone(); + + // This function is idempotent: calling the set_v2p ioctl with + // the same information is a no-op. + join_handles.push(tokio::spawn(futures::future::lazy( + move |_ctx| async move { + retry_until_known_result(&log, || async { + client.del_v2p(&nic_id, &mapping).await + }) + .await + }, + ))); + } + } + + // Concurrently run each future to completion, but return the last + // error seen. + let mut error = None; + for join_handle in join_handles { + let result = join_handle + .await + .map_err(|e| Error::internal_error(&e.to_string()))? + .await; + + if result.is_err() { + error!(log, "{:?}", result); + error = Some(result); + } + } + if let Some(e) = error { + return e.map(|_| ()).map_err(|e| e.into()); + } + + if sleds_page.len() < 10 { + break; + } + + if let Some(last) = sleds_page.last() { + last_sled_id = Some(last.id()); + } + } + + Ok(()) +} + +/// Soft-delete an individual external IP from the NAT RPW, without +/// triggering a Dendrite notification. 
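// Callers batch the switch notification: `instance_delete_dpd_config` and
// `instance_ensure_dpd_config` both call `notify_dendrite_nat_state` once
// after touching all of the relevant entries, instead of notifying Dendrite
// per address.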
+async fn external_ip_delete_dpd_config_inner( + datastore: &DataStore, + log: &slog::Logger, + opctx: &OpContext, + external_ip: &ExternalIp, +) -> Result<(), Error> { + // Soft delete the NAT entry + match datastore.ipv4_nat_delete_by_external_ip(&opctx, external_ip).await { + Ok(_) => Ok(()), + Err(err) => match err { + Error::ObjectNotFound { .. } => { + warn!(log, "no matching nat entries to soft delete"); + Ok(()) + } + _ => { + let message = + format!("failed to delete nat entry due to error: {err:?}"); + error!(log, "{}", message); + Err(Error::internal_error(&message)) + } + }, + } +} From 464c4c0cfd66df1cc40f3820868fc9f3ae137bec Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 11:28:03 -0700 Subject: [PATCH 07/69] tear apart the entire `instance_network` module --- nexus/src/app/instance_network.rs | 628 ++++++++++++++++++++++++++++++ 1 file changed, 628 insertions(+) diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 89b4dd2626..372e31af15 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -1313,6 +1313,489 @@ async fn ensure_nat_entry( }); } } + + // If the instance still has a migration in progress, don't change + // any networking state until an update arrives that retires that + // migration. + // + // This is needed to avoid the following race: + // + // 1. Migration from S to T completes. + // 2. Migration source sends an update that changes the instance's + // active VMM but leaves the migration ID in place. + // 3. Meanwhile, migration target sends an update that changes the + // instance's active VMM and clears the migration ID. + // 4. The migration target's call updates networking state and commits + // the new instance record. + // 5. The instance migrates from T to T' and Nexus applies networking + // configuration reflecting that the instance is on T'. + // 6. The update in step 2 applies configuration saying the instance + // is on sled T. + if new_instance_state.migration_id.is_some() { + debug!(log, + "instance still has a migration in progress, won't touch \ + network config"; + "instance_id" => %instance_id, + "migration_id" => ?new_instance_state.migration_id); + + return Ok(()); + } + + let new_propolis_id = new_instance_state.propolis_id.unwrap(); + + // Updates that end live migration need to push OPTE V2P state even if + // the instance's active sled did not change (see below). + let migration_retired = prev_instance_state.migration_id.is_some() + && new_instance_state.migration_id.is_none(); + + if (prev_instance_state.propolis_id == new_instance_state.propolis_id) + && !migration_retired + { + debug!(log, "instance didn't move, won't touch network config"; + "instance_id" => %instance_id); + + return Ok(()); + } + + // Either the instance moved from one sled to another, or it attempted + // to migrate and failed. Ensure the correct networking configuration + // exists for its current home. + // + // TODO(#3107) This is necessary even if the instance didn't move, + // because registering a migration target on a sled creates OPTE ports + // for its VNICs, and that creates new V2P mappings on that sled that + // place the relevant virtual IPs on the local sled. Once OPTE stops + // creating these mappings, this path only needs to be taken if an + // instance has changed sleds. 
+ let new_sled_id = match datastore + .vmm_fetch(&opctx, authz_instance, &new_propolis_id) + .await + { + Ok(vmm) => vmm.sled_id, + + // A VMM in the active position should never be destroyed. If the + // sled sending this message is the owner of the instance's last + // active VMM and is destroying it, it should also have retired that + // VMM. + Err(Error::ObjectNotFound { .. }) => { + error!(log, "instance's active vmm unexpectedly not found"; + "instance_id" => %instance_id, + "propolis_id" => %new_propolis_id); + + return Ok(()); + } + + Err(e) => return Err(e), + }; + + create_instance_v2p_mappings( + datastore, + log, + opctx, + opctx_alloc, + instance_id, + new_sled_id, + ) + .await?; + + let (.., sled) = + LookupPath::new(opctx, datastore).sled_id(new_sled_id).fetch().await?; + + instance_ensure_dpd_config( + datastore, + log, + resolver, + opctx, + opctx_alloc, + instance_id, + &sled.address(), + None, + ) + .await?; + + Ok(()) +} + +/// Ensures that the Dendrite configuration for the supplied instance is +/// up-to-date. +/// +/// Returns a list of live NAT RPW table entries from this call. Generally +/// these should only be needed for specific unwind operations, like in +/// the IP attach saga. +/// +/// # Parameters +/// +/// - `datastore`: The datastore to use for lookups and updates. +/// - `opctx`: An operation context that grants read and list-children +/// permissions on the identified instance. +/// - `instance_id`: The ID of the instance to act on. +/// - `sled_ip_address`: The internal IP address assigned to the sled's +/// sled agent. +/// - `ip_filter`: An optional filter on the index into the instance's +/// external IP array. +/// - If this is `Some(id)`, this routine configures DPD state for only the +/// external IP with `id` in the collection returned from CRDB. This will +/// proceed even when the target IP is 'attaching'. +/// - If this is `None`, this routine configures DPD for all external +/// IPs and *will back out* if any IPs are not yet fully attached to +/// the instance. +pub(crate) async fn instance_ensure_dpd_config( + datastore: &DataStore, + log: &slog::Logger, + resolver: &internal_dns::resolver::Resolver, + opctx: &OpContext, + opctx_alloc: &OpContext, + instance_id: Uuid, + sled_ip_address: &std::net::SocketAddrV6, + ip_filter: Option, +) -> Result, Error> { + info!(log, "looking up instance's primary network interface"; + "instance_id" => %instance_id); + + let (.., authz_instance) = LookupPath::new(opctx, datastore) + .instance_id(instance_id) + .lookup_for(authz::Action::ListChildren) + .await?; + + // XXX: Need to abstract over v6 and v4 entries here. + let mut nat_entries = vec![]; + + // All external IPs map to the primary network interface, so find that + // interface. If there is no such interface, there's no way to route + // traffic destined to those IPs, so there's nothing to configure and + // it's safe to return early. + let network_interface = match datastore + .derive_guest_network_interface_info(&opctx, &authz_instance) + .await? 
+ .into_iter() + .find(|interface| interface.primary) + { + Some(interface) => interface, + None => { + info!(log, "Instance has no primary network interface"; + "instance_id" => %instance_id); + return Ok(nat_entries); + } + }; + + let mac_address = + macaddr::MacAddr6::from_str(&network_interface.mac.to_string()) + .map_err(|e| { + Error::internal_error(&format!( + "failed to convert mac address: {e}" + )) + })?; + + info!(log, "looking up instance's external IPs"; + "instance_id" => %instance_id); + + let ips = + datastore.instance_lookup_external_ips(&opctx, instance_id).await?; + + let (ips_of_interest, must_all_be_attached) = if let Some(wanted_id) = + ip_filter + { + if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { + (std::slice::from_ref(ip), false) + } else { + return Err(Error::internal_error(&format!( + "failed to find external ip address with id: {wanted_id}, saw {ips:?}", + ))); + } + } else { + (&ips[..], true) + }; + + // This is performed so that an IP attach/detach will block the + // instance_start saga. Return service unavailable to indicate + // the request is retryable. + if must_all_be_attached + && ips_of_interest.iter().any(|ip| ip.state != IpAttachState::Attached) + { + return Err(Error::unavail( + "cannot push all DPD state: IP attach/detach in progress", + )); + } + + let sled_address = + Ipv6Net(Ipv6Network::new(*sled_ip_address.ip(), 128).unwrap()); + + // If all of our IPs are attached or are guaranteed to be owned + // by the saga calling this fn, then we need to disregard and + // remove conflicting rows. No other instance/service should be + // using these as its own, and we are dealing with detritus, e.g., + // the case where we have a concurrent stop -> detach followed + // by an attach to another instance, or other ongoing attach saga + // cleanup. + let mut err_and_limit = None; + for (i, external_ip) in ips_of_interest.iter().enumerate() { + // For each external ip, add a nat entry to the database + if let Ok(id) = ensure_nat_entry( + datastore, + external_ip, + sled_address, + &network_interface, + mac_address, + opctx, + ) + .await + { + nat_entries.push(id); + continue; + } + + // We seem to be blocked by a bad row -- take it out and retry. + // This will return Ok() for a non-existent row. + if let Err(e) = external_ip_delete_dpd_config_inner( + datastore, + log, + opctx, + external_ip, + ) + .await + { + err_and_limit = Some((e, i)); + break; + }; + + match ensure_nat_entry( + datastore, + external_ip, + sled_address, + &network_interface, + mac_address, + opctx, + ) + .await + { + Ok(id) => nat_entries.push(id), + Err(e) => { + err_and_limit = Some((e, i)); + break; + } + } + } + + // In the event of an unresolvable failure, we need to remove + // the entries we just added because the undo won't call into + // `instance_delete_dpd_config`. These entries won't stop a + // future caller, but it's better not to pollute switch state. + if let Some((e, max)) = err_and_limit { + for external_ip in &ips_of_interest[..max] { + let _ = external_ip_delete_dpd_config_inner( + datastore, + log, + opctx, + external_ip, + ) + .await; + } + return Err(e); + } + + notify_dendrite_nat_state( + datastore, + log, + resolver, + opctx_alloc, + Some(instance_id), + true, + ) + .await?; + + Ok(nat_entries) +} + +/// Deletes an instance's OPTE V2P mappings and the boundary switch NAT +/// entries for its external IPs. 
+/// +/// This routine returns immediately upon encountering any errors (and will +/// not try to destroy any more objects after the point of failure). +async fn clear_instance_networking_state( + datastore: &DataStore, + log: &slog::Logger, + + resolver: &internal_dns::resolver::Resolver, + opctx: &OpContext, + opctx_alloc: &OpContext, + authz_instance: &authz::Instance, +) -> Result<(), Error> { + delete_instance_v2p_mappings( + datastore, + log, + opctx, + opctx_alloc, + authz_instance.id(), + ) + .await?; + + instance_delete_dpd_config( + datastore, + log, + resolver, + opctx, + opctx_alloc, + authz_instance, + ) + .await?; + + notify_dendrite_nat_state( + datastore, + log, + resolver, + opctx_alloc, + Some(authz_instance.id()), + true, + ) + .await +} + +/// Ensures that V2P mappings exist that indicate that the instance with ID +/// `instance_id` is resident on the sled with ID `sled_id`. +pub(crate) async fn create_instance_v2p_mappings( + datastore: &DataStore, + log: &slog::Logger, + opctx: &OpContext, + opctx_alloc: &OpContext, + instance_id: Uuid, + sled_id: Uuid, +) -> Result<(), Error> { + info!(log, "creating V2P mappings for instance"; + "instance_id" => %instance_id, + "sled_id" => %sled_id); + + // For every sled that isn't the sled this instance was allocated to, create + // a virtual to physical mapping for each of this instance's NICs. + // + // For the mappings to be correct, a few invariants must hold: + // + // - mappings must be set whenever an instance's sled changes (eg. + // during instance creation, migration, stop + start) + // + // - an instances' sled must not change while its corresponding mappings + // are being created + // + // - the same mapping creation must be broadcast to all sleds + // + // A more targeted approach would be to see what other instances share + // the VPC this instance is in (or more generally, what instances should + // have connectivity to this one), see what sleds those are allocated + // to, and only create V2P mappings for those sleds. + // + // There's additional work with this approach: + // + // - it means that delete calls are required as well as set calls, + // meaning that now the ordering of those matters (this may also + // necessitate a generation number for V2P mappings) + // + // - V2P mappings have to be bidirectional in order for both instances's + // packets to make a round trip. This isn't a problem with the + // broadcast approach because one of the sides will exist already, but + // it is something to orchestrate with a more targeted approach. + // + // TODO-correctness Default firewall rules currently will block + // instances in different VPCs from connecting to each other. If it ever + // stops doing this, the broadcast approach will create V2P mappings + // that shouldn't exist. + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance_id) + .lookup_for(authz::Action::Read) + .await?; + + let instance_nics = datastore + .derive_guest_network_interface_info(&opctx, &authz_instance) + .await?; + + // Look up the supplied sled's physical host IP. + let physical_host_ip = + nexus_networking::sled_lookup(&datastore, &opctx_alloc, sled_id)? + .fetch() + .await? 
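            // `fetch()` returns the authz object paired with the database
            // record; `.1` below is the `Sled` model, and its `ip` is the
            // underlay address used as the mapping's physical host IP.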
+ .1 + .ip + .into(); + + let mut last_sled_id: Option = None; + loop { + let pagparams = DataPageParams { + marker: last_sled_id.as_ref(), + direction: dropshot::PaginationOrder::Ascending, + limit: std::num::NonZeroU32::new(10).unwrap(), + }; + + let sleds_page = datastore.sled_list(&opctx_alloc, &pagparams).await?; + let mut join_handles = + Vec::with_capacity(sleds_page.len() * instance_nics.len()); + + for sled in &sleds_page { + // set_v2p not required for sled instance was allocated to, OPTE + // currently does that automatically + // + // TODO(#3107): Remove this when XDE stops creating mappings + // implicitly. + if sled.id() == sled_id { + continue; + } + + for nic in &instance_nics { + let client = nexus_networking::sled_client( + datastore, + opctx_alloc, + sled.id(), + log, + ) + .await?; + let nic_id = nic.id; + let mapping = SetVirtualNetworkInterfaceHost { + virtual_ip: nic.ip, + virtual_mac: nic.mac, + physical_host_ip, + vni: nic.vni, + }; + + let log = log.clone(); + + // This function is idempotent: calling the set_v2p ioctl with + // the same information is a no-op. + join_handles.push(tokio::spawn(futures::future::lazy( + move |_ctx| async move { + retry_until_known_result(&log, || async { + client.set_v2p(&nic_id, &mapping).await + }) + .await + }, + ))); + } + } + + // Concurrently run each future to completion, but return the last + // error seen. + let mut error = None; + for join_handle in join_handles { + let result = join_handle + .await + .map_err(|e| Error::internal_error(&e.to_string()))? + .await; + + if result.is_err() { + error!(log, "{:?}", result); + error = Some(result); + } + } + if let Some(e) = error { + return e.map(|_| ()).map_err(|e| e.into()); + } + + if sleds_page.len() < 10 { + break; + } + + if let Some(last) = sleds_page.last() { + last_sled_id = Some(last.id()); + } + } + + Ok(()) } /// Ensure that the necessary v2p mappings for an instance are deleted @@ -1408,6 +1891,60 @@ pub(crate) async fn delete_instance_v2p_mappings( Ok(()) } +/// Attempts to delete all of the Dendrite NAT configuration for the +/// instance identified by `authz_instance`. +/// +/// Unlike `instance_ensure_dpd_config`, this function will disregard the +/// attachment states of any external IPs because likely callers (instance +/// delete) cannot be piecewise undone. +/// +/// # Return value +/// +/// - `Ok(())` if all NAT entries were successfully deleted. +/// - If an operation fails before this routine begins to walk and delete +/// individual NAT entries, this routine returns `Err` and reports that +/// error. +/// - If an operation fails while this routine is walking NAT entries, it +/// will continue trying to delete subsequent entries but will return the +/// first error it encountered. +/// - `ip_filter`: An optional filter on the index into the instance's +/// external IP array. +/// - If this is `Some(id)`, this routine configures DPD state for only the +/// external IP with `id` in the collection returned from CRDB. +/// - If this is `None`, this routine configures DPD for all external +/// IPs. 
+pub(crate) async fn instance_delete_dpd_config( + datastore: &DataStore, + log: &slog::Logger, + resolver: &internal_dns::resolver::Resolver, + opctx: &OpContext, + opctx_alloc: &OpContext, + authz_instance: &authz::Instance, +) -> Result<(), Error> { + let instance_id = authz_instance.id(); + + info!(log, "deleting instance dpd configuration"; + "instance_id" => %instance_id); + + let external_ips = + datastore.instance_lookup_external_ips(opctx, instance_id).await?; + + for entry in external_ips { + external_ip_delete_dpd_config_inner(&datastore, &log, opctx, &entry) + .await?; + } + + notify_dendrite_nat_state( + datastore, + log, + resolver, + opctx_alloc, + Some(instance_id), + false, + ) + .await +} + /// Soft-delete an individual external IP from the NAT RPW, without /// triggering a Dendrite notification. async fn external_ip_delete_dpd_config_inner( @@ -1433,3 +1970,94 @@ async fn external_ip_delete_dpd_config_inner( }, } } + +/// Informs all available boundary switches that the set of NAT entries +/// has changed. +/// +/// When `fail_fast` is set, this function will return on any error when +/// acquiring a handle to a DPD client. Otherwise, it will attempt to notify +/// all clients and then finally return the first error. +async fn notify_dendrite_nat_state( + datastore: &DataStore, + log: &slog::Logger, + resolver: &internal_dns::resolver::Resolver, + opctx_alloc: &OpContext, + instance_id: Option, + fail_fast: bool, +) -> Result<(), Error> { + // Querying boundary switches also requires fleet access and the use of the + // instance allocator context. + let boundary_switches = boundary_switches(datastore, opctx_alloc).await?; + + let mut errors = vec![]; + for switch in &boundary_switches { + debug!(log, "notifying dendrite of updates"; + "instance_id" => ?instance_id, + "switch" => switch.to_string()); + + let clients = super::dpd_clients(resolver, log).await.map_err(|e| { + Error::internal_error(&format!("failed to get dpd clients: {e}")) + })?; + let client_result = clients.get(switch).ok_or_else(|| { + Error::internal_error(&format!( + "unable to find dendrite client for {switch}" + )) + }); + + let dpd_client = match client_result { + Ok(client) => client, + Err(new_error) => { + errors.push(new_error); + if fail_fast { + break; + } else { + continue; + } + } + }; + + // Notify dendrite that there are changes for it to reconcile. + // In the event of a failure to notify dendrite, we'll log an error + // and rely on dendrite's RPW timer to catch it up. + if let Err(e) = dpd_client.ipv4_nat_trigger_update().await { + error!(log, "failed to notify dendrite of nat updates"; "error" => ?e); + }; + } + + if let Some(e) = errors.into_iter().next() { + return Err(e); + } + + Ok(()) +} + +async fn ensure_nat_entry( + datastore: &DataStore, + target_ip: &nexus_db_model::ExternalIp, + sled_address: Ipv6Net, + network_interface: &NetworkInterface, + mac_address: macaddr::MacAddr6, + opctx: &OpContext, +) -> Result { + match target_ip.ip { + IpNetwork::V4(v4net) => { + let nat_entry = Ipv4NatValues { + external_address: Ipv4Net(v4net).into(), + first_port: target_ip.first_port, + last_port: target_ip.last_port, + sled_address: sled_address.into(), + vni: DbVni(network_interface.vni), + mac: nexus_db_model::MacAddr( + omicron_common::api::external::MacAddr(mac_address), + ), + }; + Ok(datastore.ensure_ipv4_nat_entry(opctx, nat_entry).await?) + } + IpNetwork::V6(_v6net) => { + // TODO: implement handling of v6 nat. 
+ return Err(Error::InternalError { + internal_message: "ipv6 nat is not yet implemented".into(), + }); + } + } +} From 4823392dbeced71d244226ba92b8001cfd2303a5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 11:58:40 -0700 Subject: [PATCH 08/69] plumbing --- nexus/src/app/background/init.rs | 21 ++++++++++++ nexus/src/app/background/instance_state.rs | 37 ++++++++++++++-------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 9997953921..3364d40366 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -12,6 +12,7 @@ use super::dns_config; use super::dns_propagation; use super::dns_servers; use super::external_endpoints; +use super::instance_state; use super::inventory_collection; use super::metrics_producer_gc; use super::nat_cleanup; @@ -90,6 +91,9 @@ pub struct BackgroundTasks { /// task handle for the task that detects if regions need replacement and /// begins the process pub task_region_replacement: common::TaskHandle, + + /// task handle for the task that polls sled agents for instance states. + pub task_instance_watcher: common::TaskHandle, } impl BackgroundTasks { @@ -341,6 +345,22 @@ impl BackgroundTasks { task }; + let task_instance_watcher = { + let watcher = instance_state::InstanceWatcher::new( + datastore.clone(), + resolver.clone(), + opctx.child(BTreeMap::new()), + ); + driver.register( + "instance_watcher".to_string(), + "periodically checks instance states".to_string(), + config.instance_watcher.period_secs, + Box::new(watcher), + opctx.child(BTreeMap::new()), + vec![], + ) + }; + BackgroundTasks { driver, task_internal_dns_config, @@ -360,6 +380,7 @@ impl BackgroundTasks { task_service_zone_nat_tracker, task_switch_port_settings_manager, task_region_replacement, + task_instance_watcher, } } diff --git a/nexus/src/app/background/instance_state.rs b/nexus/src/app/background/instance_state.rs index 2149167334..d8904e19b7 100644 --- a/nexus/src/app/background/instance_state.rs +++ b/nexus/src/app/background/instance_state.rs @@ -10,7 +10,6 @@ use nexus_db_model::{InvSledAgent, SledInstance}; use nexus_db_queries::{ context::OpContext, db::pagination::Paginator, db::DataStore, }; -use serde::Serialize; use serde_json::json; use sled_agent_client::{types::SledInstanceState, Client as SledAgentClient}; use std::num::NonZeroU32; @@ -19,6 +18,8 @@ use std::sync::Arc; /// Background task that periodically checks instance states. pub struct InstanceWatcher { datastore: Arc, + resolver: internal_dns::resolver::Resolver, + opctx_alloc: OpContext, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -70,6 +71,7 @@ impl BackgroundTask for InstanceWatcher { if let Some((mut curr_sled_agent, sled_instance)) = batch.next() { let mut client = mk_sled_agent_client(&opctx.log, &curr_sled_agent); + spawn_get_state(&client, &mut requests, sled_instance); for (sled_agent, sled_instance) in batch { // We're now talking to a new sled agent; update the client. @@ -84,7 +86,7 @@ impl BackgroundTask for InstanceWatcher { // All requests fired off, let's wait for them to come back. while let Some(result) = requests.join_next().await { - match result { + let (instance, state) = match result { Err(_) => unreachable!( "a `JoinError` is returned if a spawned task \ panics, or if the task is aborted. we never abort \ @@ -92,19 +94,27 @@ impl BackgroundTask for InstanceWatcher { `panic=\"abort\"`, so neither of these cases should \ ever occur." 
), - Ok(Ok(rsp)) => { - todo!("eliza"); + Ok((instance, Ok(state))) => { + (instance, state) } - Ok(Err(e)) => { + Ok((instance, Err(client_error))) => { // Here is where it gets interesting. This is where we // might learn that the sled-agent we were trying to // talk to is dead. todo!("eliza: implement the interesting parts!"); } }; + let log = opctx.log.new(slog::o!("instance_id" => instance.instance_id().to_string(), "state" => format!("{state:?}"))); + // TODO(eliza): it would be nice to do this in parallel as part + // of the task we spawn for each instance, but apparently we + // can't clone the `OpCtx`...so, do it here for now. + let result = crate::app::instance::notify_instance_updated(&self.datastore, &self.resolver, &self.opctx_alloc, opctx, &log, &instance.instance_id(), &state.into()).await; + match result { + Ok(_) => slog::debug!(log, "instance state updated"), + Err(e) => slog::error!(log, "failed to update instance state: {e}"), + } } - - todo!() + serde_json::json!({}) } .boxed() } @@ -114,18 +124,19 @@ type ClientError = sled_agent_client::Error; fn spawn_get_state( client: &SledAgentClient, - tasks: &mut tokio::task::JoinSet< - Result<(SledInstance, SledInstanceState), ClientError>, - >, + tasks: &mut tokio::task::JoinSet<( + SledInstance, + Result, + )>, instance: SledInstance, ) { let client = client.clone(); tasks.spawn(async move { let state = client .instance_get_state(&instance.instance_id()) - .await? - .into_inner(); - Ok((instance, state)) + .await + .map(|rsp| rsp.into_inner()); + (instance, state) }); } From ff64c0878c826d66deb2a1979a47c0b32e60608d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 12:19:20 -0700 Subject: [PATCH 09/69] oh that's how you're supposed to make child opctxs --- nexus/src/app/background/init.rs | 5 +- nexus/src/app/background/instance_state.rs | 162 ++++++++++++--------- 2 files changed, 95 insertions(+), 72 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 3364d40366..b33b6a1b96 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -329,7 +329,7 @@ impl BackgroundTasks { // process let task_region_replacement = { let detector = region_replacement::RegionReplacementDetector::new( - datastore, + datastore.clone(), saga_request.clone(), ); @@ -347,9 +347,8 @@ impl BackgroundTasks { let task_instance_watcher = { let watcher = instance_state::InstanceWatcher::new( - datastore.clone(), + datastore, resolver.clone(), - opctx.child(BTreeMap::new()), ); driver.register( "instance_watcher".to_string(), diff --git a/nexus/src/app/background/instance_state.rs b/nexus/src/app/background/instance_state.rs index d8904e19b7..f79c694250 100644 --- a/nexus/src/app/background/instance_state.rs +++ b/nexus/src/app/background/instance_state.rs @@ -7,19 +7,21 @@ use super::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; use nexus_db_model::{InvSledAgent, SledInstance}; -use nexus_db_queries::{ - context::OpContext, db::pagination::Paginator, db::DataStore, -}; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::pagination::Paginator; +use nexus_db_queries::db::DataStore; +use omicron_uuid_kinds::GenericUuid; use serde_json::json; -use sled_agent_client::{types::SledInstanceState, Client as SledAgentClient}; +use sled_agent_client::Client as SledAgentClient; +use std::future::Future; use std::num::NonZeroU32; use std::sync::Arc; /// Background task that periodically checks instance states. 
-pub struct InstanceWatcher { +#[derive(Clone)] +pub(crate) struct InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, - opctx_alloc: OpContext, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -27,6 +29,72 @@ const MAX_SLED_AGENTS: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(100) }; +impl InstanceWatcher { + pub(crate) fn new( + datastore: Arc, + resolver: internal_dns::resolver::Resolver, + ) -> Self { + Self { datastore, resolver } + } + + fn check_instance( + &self, + opctx: &OpContext, + client: &SledAgentClient, + instance: SledInstance, + ) -> impl Future + Send + 'static { + let instance_id = instance.instance_id(); + let watcher = self.clone(); + let opctx = opctx.child( + std::iter::once(( + "instance_id".to_string(), + instance_id.to_string(), + )) + .collect(), + ); + let client = client.clone(); + + async move { + let InstanceWatcher { datastore, resolver } = watcher; + slog::trace!(opctx.log, "checking on instance..."); + let rsp = client.instance_get_state(&instance.instance_id()).await; + let state = match rsp { + Ok(rsp) => rsp.into_inner(), + Err(error) => { + // Here is where it gets interesting. This is where we + // might learn that the sled-agent we were trying to + // talk to is dead. + slog::info!( + opctx.log, + "client error checking on instance: {error:?}" + ); + todo!("eliza: implement the interesting parts!"); + } + }; + slog::debug!(opctx.log, "updating instance state: {state:?}"); + let result = crate::app::instance::notify_instance_updated( + &datastore, + &resolver, + &opctx, + &opctx, + &opctx.log, + &instance_id, + &state.into(), + ) + .await; + match result { + Ok(_) => slog::debug!(opctx.log, "instance state updated"), + Err(e) => slog::error!( + opctx.log, + "failed to update instance state: {e}" + ), + } + } + } +} + +type ClientError = sled_agent_client::Error; + impl BackgroundTask for InstanceWatcher { fn activate<'a>( &'a mut self, @@ -51,7 +119,7 @@ impl BackgroundTask for InstanceWatcher { } }; - let mut requests = tokio::task::JoinSet::new(); + let mut tasks = tokio::task::JoinSet::new(); let mut paginator = Paginator::new(MAX_SLED_AGENTS); while let Some(p) = paginator.next() { let maybe_batch = self.datastore.sled_instance_list_by_sled_agent( @@ -70,87 +138,43 @@ impl BackgroundTask for InstanceWatcher { let mut batch = batch.into_iter(); if let Some((mut curr_sled_agent, sled_instance)) = batch.next() { - let mut client = mk_sled_agent_client(&opctx.log, &curr_sled_agent); - spawn_get_state(&client, &mut requests, sled_instance); + let mk_client = |&InvSledAgent { + ref sled_id, sled_agent_ip, sled_agent_port, .. + }: &InvSledAgent| { + let address = std::net::SocketAddrV6::new(sled_agent_ip.into(), sled_agent_port.into(), 0, 0); + nexus_networking::sled_client_from_address(sled_id.into_untyped_uuid(), address, &opctx.log) + }; + + let mut client = mk_client(&curr_sled_agent); + tasks.spawn(self.check_instance(opctx, &client, sled_instance)); for (sled_agent, sled_instance) in batch { // We're now talking to a new sled agent; update the client. if sled_agent.sled_id != curr_sled_agent.sled_id { - client = mk_sled_agent_client(&opctx.log, &sled_agent); + client = mk_client(&sled_agent); curr_sled_agent = sled_agent; } - spawn_get_state(&client, &mut requests, sled_instance); + tasks.spawn(self.check_instance(opctx, &client, sled_instance)); } } } // All requests fired off, let's wait for them to come back. 
- while let Some(result) = requests.join_next().await { - let (instance, state) = match result { - Err(_) => unreachable!( + while let Some(result) = tasks.join_next().await { + if let Err(e) = result { + unreachable!( "a `JoinError` is returned if a spawned task \ panics, or if the task is aborted. we never abort \ tasks on this `JoinSet`, and nexus is compiled with \ `panic=\"abort\"`, so neither of these cases should \ - ever occur." - ), - Ok((instance, Ok(state))) => { - (instance, state) - } - Ok((instance, Err(client_error))) => { - // Here is where it gets interesting. This is where we - // might learn that the sled-agent we were trying to - // talk to is dead. - todo!("eliza: implement the interesting parts!"); - } - }; - let log = opctx.log.new(slog::o!("instance_id" => instance.instance_id().to_string(), "state" => format!("{state:?}"))); - // TODO(eliza): it would be nice to do this in parallel as part - // of the task we spawn for each instance, but apparently we - // can't clone the `OpCtx`...so, do it here for now. - let result = crate::app::instance::notify_instance_updated(&self.datastore, &self.resolver, &self.opctx_alloc, opctx, &log, &instance.instance_id(), &state.into()).await; - match result { - Ok(_) => slog::debug!(log, "instance state updated"), - Err(e) => slog::error!(log, "failed to update instance state: {e}"), + ever occur: {e}", + ); } } + + slog::trace!(opctx.log, "all instance checks complete"); serde_json::json!({}) } .boxed() } } - -type ClientError = sled_agent_client::Error; - -fn spawn_get_state( - client: &SledAgentClient, - tasks: &mut tokio::task::JoinSet<( - SledInstance, - Result, - )>, - instance: SledInstance, -) { - let client = client.clone(); - tasks.spawn(async move { - let state = client - .instance_get_state(&instance.instance_id()) - .await - .map(|rsp| rsp.into_inner()); - (instance, state) - }); -} - -fn mk_sled_agent_client( - log: &slog::Logger, - InvSledAgent { - ref sled_id, ref sled_agent_ip, ref sled_agent_port, .. - }: &InvSledAgent, -) -> SledAgentClient { - // Ipv6Addr's `fmt::Debug` impl is the same as its Display impl, so we - // should get the RFC 5952 textual representation here even though the DB - // `Ipv6Addr` type doesn't expose `Display`. 
- let url = format!("http://{sled_agent_ip:?}:{sled_agent_port}"); - let log = - log.new(o!("sled_id" => sled_id.to_string(), "url" => url.clone())); - SledAgentClient::new(&url, log) -} From 78fd6fac98baa423345b35a1a17ede36f5e1ede9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 12:25:23 -0700 Subject: [PATCH 10/69] add config --- nexus-config/src/nexus_config.rs | 14 ++++++++++++++ smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 3 files changed, 16 insertions(+) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 540a347150..053e3d600b 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -375,6 +375,8 @@ pub struct BackgroundTaskConfig { pub switch_port_settings_manager: SwitchPortSettingsManagerConfig, /// configuration for region replacement task pub region_replacement: RegionReplacementConfig, + /// configuration for instance watcher task + pub instance_watcher: InstanceWatcherConfig, } #[serde_as] @@ -519,6 +521,14 @@ pub struct RegionReplacementConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct InstanceWatcherConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, +} + /// Configuration for a nexus server #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct PackageConfig { @@ -883,6 +893,9 @@ mod test { region_replacement: RegionReplacementConfig { period_secs: Duration::from_secs(30), }, + instance_watcher: InstanceWatcherConfig { + period_secs: Duration::from_secs(30), + } }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -949,6 +962,7 @@ mod test { sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 + instance_watcher.period_secs = 30 [default_region_allocation_strategy] type = "random" "##, diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 400a987786..d998a3c396 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -55,6 +55,7 @@ blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 +instance_watcher.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 524d521c89..4f61609b4e 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -55,6 +55,7 @@ blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 +instance_watcher.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. 
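For reference, the `instance_watcher.period_secs` knob added in the patch above follows the same `serde_with::DurationSeconds` pattern as the other background-task settings, so a bare integer in the partial TOML becomes a `Duration`. A minimal, self-contained sketch of that round-trip (the struct here only mirrors the real `InstanceWatcherConfig` field, assumes the `serde`, `serde_with`, and `toml` crates, and is not part of this patch series):

    use serde::Deserialize;
    use serde_with::{serde_as, DurationSeconds};
    use std::time::Duration;

    // Illustrative stand-in for the real config struct: one field, same
    // `DurationSeconds` adapter that nexus_config.rs uses.
    #[serde_as]
    #[derive(Debug, Deserialize)]
    struct InstanceWatcherConfig {
        /// period (in seconds) for periodic activations of this background task
        #[serde_as(as = "DurationSeconds<u64>")]
        period_secs: Duration,
    }

    fn main() {
        // `instance_watcher.period_secs = 30` in config-partial.toml parses to
        // a 30-second `Duration`.
        let config: InstanceWatcherConfig =
            toml::from_str("period_secs = 30").expect("config should parse");
        assert_eq!(config.period_secs, Duration::from_secs(30));
    }

The parsed value is what `driver.register(...)` in init.rs receives as the task's activation period.
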
From bedf76edbf3b185f598c01f6202f8819e6ee9c3d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 12:26:31 -0700 Subject: [PATCH 11/69] mv instance_state.rs instance_watcher.rs --- .../src/app/background/{instance_state.rs => instance_watcher.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename nexus/src/app/background/{instance_state.rs => instance_watcher.rs} (100%) diff --git a/nexus/src/app/background/instance_state.rs b/nexus/src/app/background/instance_watcher.rs similarity index 100% rename from nexus/src/app/background/instance_state.rs rename to nexus/src/app/background/instance_watcher.rs From 79d2af2e4f25a5a350d6c28bb8ead9dde2a4a284 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 12:26:31 -0700 Subject: [PATCH 12/69] mv instance_state.rs instance_watcher.rs --- nexus/src/app/background/init.rs | 4 ++-- nexus/src/app/background/mod.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index b33b6a1b96..be770b9287 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -12,7 +12,7 @@ use super::dns_config; use super::dns_propagation; use super::dns_servers; use super::external_endpoints; -use super::instance_state; +use super::instance_watcher; use super::inventory_collection; use super::metrics_producer_gc; use super::nat_cleanup; @@ -346,7 +346,7 @@ impl BackgroundTasks { }; let task_instance_watcher = { - let watcher = instance_state::InstanceWatcher::new( + let watcher = instance_watcher::InstanceWatcher::new( datastore, resolver.clone(), ); diff --git a/nexus/src/app/background/mod.rs b/nexus/src/app/background/mod.rs index 56d2acb375..809d1c4873 100644 --- a/nexus/src/app/background/mod.rs +++ b/nexus/src/app/background/mod.rs @@ -13,7 +13,7 @@ mod dns_propagation; mod dns_servers; mod external_endpoints; mod init; -mod instance_state; +mod instance_watcher; mod inventory_collection; mod metrics_producer_gc; mod nat_cleanup; From 1c6372b96b7625e0fd56a6f1d8c4b627f1e01228 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 13:22:44 -0700 Subject: [PATCH 13/69] sketch out retry stuff --- nexus-config/src/nexus_config.rs | 7 +++ nexus/src/app/background/init.rs | 1 + nexus/src/app/background/instance_watcher.rs | 62 ++++++++++++++++---- smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 5 files changed, 62 insertions(+), 10 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 053e3d600b..abd633fec2 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -27,6 +27,7 @@ use std::collections::HashMap; use std::fmt; use std::net::IpAddr; use std::net::SocketAddr; +use std::num::NonZeroU32; use std::time::Duration; use uuid::Uuid; @@ -527,6 +528,10 @@ pub struct InstanceWatcherConfig { /// period (in seconds) for periodic activations of this background task #[serde_as(as = "DurationSeconds")] pub period_secs: Duration, + + /// maximum number of retries to attempt before considering a sled-agent + /// dead. 
+ pub max_retries: NonZeroU32, } /// Configuration for a nexus server @@ -895,6 +900,7 @@ mod test { }, instance_watcher: InstanceWatcherConfig { period_secs: Duration::from_secs(30), + max_retries: NonZeroU32::new(5).unwrap(), } }, default_region_allocation_strategy: @@ -963,6 +969,7 @@ mod test { switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 instance_watcher.period_secs = 30 + instance_watcher.max_retries = 10 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index be770b9287..384b55c2d9 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -349,6 +349,7 @@ impl BackgroundTasks { let watcher = instance_watcher::InstanceWatcher::new( datastore, resolver.clone(), + config.instance_watcher.max_retries, ); driver.register( "instance_watcher".to_string(), diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index f79c694250..9decfff4e9 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -10,6 +10,7 @@ use nexus_db_model::{InvSledAgent, SledInstance}; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; +use omicron_common::backoff::{self, BackoffError}; use omicron_uuid_kinds::GenericUuid; use serde_json::json; use sled_agent_client::Client as SledAgentClient; @@ -22,6 +23,7 @@ use std::sync::Arc; pub(crate) struct InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, + max_retries: NonZeroU32, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -33,8 +35,9 @@ impl InstanceWatcher { pub(crate) fn new( datastore: Arc, resolver: internal_dns::resolver::Resolver, + max_retries: NonZeroU32, ) -> Self { - Self { datastore, resolver } + Self { datastore, resolver, max_retries } } fn check_instance( @@ -55,22 +58,61 @@ impl InstanceWatcher { let client = client.clone(); async move { - let InstanceWatcher { datastore, resolver } = watcher; + let InstanceWatcher { datastore, resolver, max_retries } = watcher; slog::trace!(opctx.log, "checking on instance..."); - let rsp = client.instance_get_state(&instance.instance_id()).await; + let backoff = backoff::retry_policy_internal_service(); + let mut retries = 0; + let rsp = backoff::retry_notify( + backoff, + || async { + let rsp = client + .instance_get_state(&instance.instance_id()) + .await; + match rsp { + Ok(rsp) => Ok(rsp.into_inner()), + Err(e) if retries == max_retries.get() => { + Err(BackoffError::Permanent(e)) + } + Err( + e @ ClientError::InvalidRequest(_) + | e @ ClientError::InvalidUpgrade(_) + | e @ ClientError::UnexpectedResponse(_) + | e @ ClientError::PreHookError(_), + ) => Err(BackoffError::Permanent(e)), + Err(e) => Err(BackoffError::transient(e)), + } + }, + |err, duration| { + slog::info!( + opctx.log, + "instance check failed; retrying: {err}"; + "duration" => ?duration, + "retries_remaining" => max_retries.get() - retries, + ); + }, + ) + .await; let state = match rsp { - Ok(rsp) => rsp.into_inner(), + Ok(state) => state, Err(error) => { - // Here is where it gets interesting. This is where we - // might learn that the sled-agent we were trying to - // talk to is dead. - slog::info!( + // TODO(eliza): here is where it gets interesting --- if the + // sled-agent is in a bad state, we need to: + // 1. figure out whether the instance's VMM is reachable directly + // 2. 
figure out whether we can recover the sled agent? + // 3. if the instances' VMMs are also gone, mark them as + // "failed" + // 4. this might mean that the whole sled is super gone, + // figure that out too. + // + // for now though, we'll just log a really big error. + slog::error!( opctx.log, - "client error checking on instance: {error:?}" + "instance seems to be in a bad state: {error}" ); - todo!("eliza: implement the interesting parts!"); + return; } }; + slog::debug!(opctx.log, "updating instance state: {state:?}"); let result = crate::app::instance::notify_instance_updated( &datastore, diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index d998a3c396..6d69f4c8fa 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 instance_watcher.period_secs = 30 +instance_watcher.max_retries = 5 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 4f61609b4e..607af64fce 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 instance_watcher.period_secs = 30 +instance_watcher.max_retries = 5 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From dfde7c3a50680719547a54262db77333d2dc755c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Apr 2024 13:32:36 -0700 Subject: [PATCH 14/69] rm unused import --- nexus/db-queries/src/db/datastore/vmm.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index d1a2446356..a837d1289b 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -19,7 +19,6 @@ use chrono::Utc; use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::Error; -use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; From de3303bd6c8baae10ae53cd54f51d6dcea34035d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 25 Apr 2024 12:59:24 -0700 Subject: [PATCH 15/69] use the `sled` table instead --- nexus/db-model/src/schema.rs | 2 +- .../src/db/datastore/sled_instance.rs | 38 +++++------ nexus/src/app/background/instance_watcher.rs | 67 +++++++++---------- 3 files changed, 48 insertions(+), 59 deletions(-) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 2173ff556e..9ff3e01de7 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -1717,4 +1717,4 @@ allow_tables_to_appear_in_same_query!(ssh_key, instance_ssh_key, instance); joinable!(instance_ssh_key -> ssh_key (ssh_key_id)); joinable!(instance_ssh_key -> instance (instance_id)); -allow_tables_to_appear_in_same_query!(inv_sled_agent, sled_instance); +allow_tables_to_appear_in_same_query!(sled, sled_instance); diff --git a/nexus/db-queries/src/db/datastore/sled_instance.rs b/nexus/db-queries/src/db/datastore/sled_instance.rs index 58b227b357..dd3aac9f8e 100644 --- 
a/nexus/db-queries/src/db/datastore/sled_instance.rs +++ b/nexus/db-queries/src/db/datastore/sled_instance.rs @@ -5,16 +5,17 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::model::InvSledAgent; +use crate::db::model::Sled; use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use nexus_db_model::ApplySledFilterExt; use nexus_db_model::DbTypedUuid; use nexus_db_model::SledInstance; +use nexus_types::deployment::SledFilter; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::ListResultVec; use omicron_uuid_kinds::CollectionUuid; -use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::SledKind; use uuid::Uuid; @@ -40,32 +41,23 @@ impl DataStore { pub async fn sled_instance_list_by_sled_agent( &self, opctx: &OpContext, - collection: CollectionUuid, pagparams: &DataPageParams<'_, DbTypedUuid>, - ) -> ListResultVec<(InvSledAgent, SledInstance)> { + ) -> ListResultVec<(Sled, SledInstance)> { // TODO(eliza): should probably paginate this? - use crate::db::schema::{inv_sled_agent, sled_instance::dsl}; + use crate::db::schema::{sled::dsl as sled_dsl, sled_instance::dsl}; opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; let conn = self.pool_connection_authorized(opctx).await?; - let result = paginated( - inv_sled_agent::dsl::inv_sled_agent, - inv_sled_agent::dsl::sled_id, - pagparams, - ) - // Only list sled agents from the latest collection. - .filter( - inv_sled_agent::dsl::inv_collection_id - .eq(collection.into_untyped_uuid()), - ) - .inner_join( - dsl::sled_instance - .on(dsl::active_sled_id.eq(inv_sled_agent::dsl::sled_id)), - ) - .select((InvSledAgent::as_select(), SledInstance::as_select())) - .load_async::<(InvSledAgent, SledInstance)>(&*conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + let result = paginated(sled_dsl::sled, sled_dsl::id, pagparams) + .filter(sled_dsl::time_deleted.is_null()) + .sled_filter(SledFilter::InService) + .inner_join( + dsl::sled_instance.on(dsl::active_sled_id.eq(sled_dsl::id)), + ) + .select((Sled::as_select(), SledInstance::as_select())) + .load_async::<(Sled, SledInstance)>(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; Ok(result) } diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 9decfff4e9..742e92c0d7 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -6,12 +6,12 @@ use super::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; -use nexus_db_model::{InvSledAgent, SledInstance}; +use nexus_db_model::SledInstance; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; +use nexus_types::identity::Asset; use omicron_common::backoff::{self, BackoffError}; -use omicron_uuid_kinds::GenericUuid; use serde_json::json; use sled_agent_client::Client as SledAgentClient; use std::future::Future; @@ -143,52 +143,45 @@ impl BackgroundTask for InstanceWatcher { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async { - let latest_collection = { - let maybe_id = self - .datastore - .inventory_get_latest_collection_id(opctx) - .await; - match maybe_id { - Ok(Some(collection)) => collection, - Ok(None) => { - slog::debug!(opctx.log, "no inventory collection exists, not querying sled 
agents."); - return json!({}); - } - Err(e) => { - slog::warn!(opctx.log, "failed to get latest collection ID: {e}"); - return json!({}); - } - } - }; - let mut tasks = tokio::task::JoinSet::new(); let mut paginator = Paginator::new(MAX_SLED_AGENTS); while let Some(p) = paginator.next() { - let maybe_batch = self.datastore.sled_instance_list_by_sled_agent( - opctx, - latest_collection, - &p.current_pagparams(), - ).await; + let maybe_batch = self + .datastore + .sled_instance_list_by_sled_agent( + opctx, + &p.current_pagparams(), + ) + .await; let batch = match maybe_batch { Ok(batch) => batch, Err(e) => { - slog::warn!(opctx.log, "sled instances by sled agent query failed: {e}"); + slog::warn!( + opctx.log, + "sled instances by sled agent query failed: {e}" + ); break; } }; - paginator = p.found_batch(&batch, &|(sled_agent, _)| sled_agent.sled_id); + paginator = p.found_batch(&batch, &|(sled, _)| sled.id()); let mut batch = batch.into_iter(); - if let Some((mut curr_sled_agent, sled_instance)) = batch.next() { - let mk_client = |&InvSledAgent { - ref sled_id, sled_agent_ip, sled_agent_port, .. - }: &InvSledAgent| { - let address = std::net::SocketAddrV6::new(sled_agent_ip.into(), sled_agent_port.into(), 0, 0); - nexus_networking::sled_client_from_address(sled_id.into_untyped_uuid(), address, &opctx.log) + if let Some((mut curr_sled_agent, sled_instance)) = batch.next() + { + let mk_client = |sled| { + nexus_networking::sled_client_from_address( + sled.id(), + sled.address(), + &opctx.log, + ) }; let mut client = mk_client(&curr_sled_agent); - tasks.spawn(self.check_instance(opctx, &client, sled_instance)); + tasks.spawn(self.check_instance( + opctx, + &client, + sled_instance, + )); for (sled_agent, sled_instance) in batch { // We're now talking to a new sled agent; update the client. @@ -196,7 +189,11 @@ impl BackgroundTask for InstanceWatcher { client = mk_client(&sled_agent); curr_sled_agent = sled_agent; } - tasks.spawn(self.check_instance(opctx, &client, sled_instance)); + tasks.spawn(self.check_instance( + opctx, + &client, + sled_instance, + )); } } } From 4c68fceeb58ef2405d0752f30ce25df00d85bb15 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 25 Apr 2024 13:07:06 -0700 Subject: [PATCH 16/69] whoopsie bad rebase --- nexus/src/app/instance_network.rs | 747 ------------------------------ 1 file changed, 747 deletions(-) diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 372e31af15..2c258c8064 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -1284,753 +1284,6 @@ async fn notify_dendrite_nat_state( Ok(()) } -async fn ensure_nat_entry( - datastore: &DataStore, - target_ip: &nexus_db_model::ExternalIp, - sled_address: Ipv6Net, - network_interface: &NetworkInterface, - mac_address: macaddr::MacAddr6, - opctx: &OpContext, -) -> Result { - match target_ip.ip { - IpNetwork::V4(v4net) => { - let nat_entry = Ipv4NatValues { - external_address: Ipv4Net(v4net).into(), - first_port: target_ip.first_port, - last_port: target_ip.last_port, - sled_address: sled_address.into(), - vni: DbVni(network_interface.vni), - mac: nexus_db_model::MacAddr( - omicron_common::api::external::MacAddr(mac_address), - ), - }; - Ok(datastore.ensure_ipv4_nat_entry(opctx, nat_entry).await?) - } - IpNetwork::V6(_v6net) => { - // TODO: implement handling of v6 nat. 
- return Err(Error::InternalError { - internal_message: "ipv6 nat is not yet implemented".into(), - }); - } - } - - // If the instance still has a migration in progress, don't change - // any networking state until an update arrives that retires that - // migration. - // - // This is needed to avoid the following race: - // - // 1. Migration from S to T completes. - // 2. Migration source sends an update that changes the instance's - // active VMM but leaves the migration ID in place. - // 3. Meanwhile, migration target sends an update that changes the - // instance's active VMM and clears the migration ID. - // 4. The migration target's call updates networking state and commits - // the new instance record. - // 5. The instance migrates from T to T' and Nexus applies networking - // configuration reflecting that the instance is on T'. - // 6. The update in step 2 applies configuration saying the instance - // is on sled T. - if new_instance_state.migration_id.is_some() { - debug!(log, - "instance still has a migration in progress, won't touch \ - network config"; - "instance_id" => %instance_id, - "migration_id" => ?new_instance_state.migration_id); - - return Ok(()); - } - - let new_propolis_id = new_instance_state.propolis_id.unwrap(); - - // Updates that end live migration need to push OPTE V2P state even if - // the instance's active sled did not change (see below). - let migration_retired = prev_instance_state.migration_id.is_some() - && new_instance_state.migration_id.is_none(); - - if (prev_instance_state.propolis_id == new_instance_state.propolis_id) - && !migration_retired - { - debug!(log, "instance didn't move, won't touch network config"; - "instance_id" => %instance_id); - - return Ok(()); - } - - // Either the instance moved from one sled to another, or it attempted - // to migrate and failed. Ensure the correct networking configuration - // exists for its current home. - // - // TODO(#3107) This is necessary even if the instance didn't move, - // because registering a migration target on a sled creates OPTE ports - // for its VNICs, and that creates new V2P mappings on that sled that - // place the relevant virtual IPs on the local sled. Once OPTE stops - // creating these mappings, this path only needs to be taken if an - // instance has changed sleds. - let new_sled_id = match datastore - .vmm_fetch(&opctx, authz_instance, &new_propolis_id) - .await - { - Ok(vmm) => vmm.sled_id, - - // A VMM in the active position should never be destroyed. If the - // sled sending this message is the owner of the instance's last - // active VMM and is destroying it, it should also have retired that - // VMM. - Err(Error::ObjectNotFound { .. }) => { - error!(log, "instance's active vmm unexpectedly not found"; - "instance_id" => %instance_id, - "propolis_id" => %new_propolis_id); - - return Ok(()); - } - - Err(e) => return Err(e), - }; - - create_instance_v2p_mappings( - datastore, - log, - opctx, - opctx_alloc, - instance_id, - new_sled_id, - ) - .await?; - - let (.., sled) = - LookupPath::new(opctx, datastore).sled_id(new_sled_id).fetch().await?; - - instance_ensure_dpd_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - instance_id, - &sled.address(), - None, - ) - .await?; - - Ok(()) -} - -/// Ensures that the Dendrite configuration for the supplied instance is -/// up-to-date. -/// -/// Returns a list of live NAT RPW table entries from this call. Generally -/// these should only be needed for specific unwind operations, like in -/// the IP attach saga. 
-/// -/// # Parameters -/// -/// - `datastore`: The datastore to use for lookups and updates. -/// - `opctx`: An operation context that grants read and list-children -/// permissions on the identified instance. -/// - `instance_id`: The ID of the instance to act on. -/// - `sled_ip_address`: The internal IP address assigned to the sled's -/// sled agent. -/// - `ip_filter`: An optional filter on the index into the instance's -/// external IP array. -/// - If this is `Some(id)`, this routine configures DPD state for only the -/// external IP with `id` in the collection returned from CRDB. This will -/// proceed even when the target IP is 'attaching'. -/// - If this is `None`, this routine configures DPD for all external -/// IPs and *will back out* if any IPs are not yet fully attached to -/// the instance. -pub(crate) async fn instance_ensure_dpd_config( - datastore: &DataStore, - log: &slog::Logger, - resolver: &internal_dns::resolver::Resolver, - opctx: &OpContext, - opctx_alloc: &OpContext, - instance_id: Uuid, - sled_ip_address: &std::net::SocketAddrV6, - ip_filter: Option, -) -> Result, Error> { - info!(log, "looking up instance's primary network interface"; - "instance_id" => %instance_id); - - let (.., authz_instance) = LookupPath::new(opctx, datastore) - .instance_id(instance_id) - .lookup_for(authz::Action::ListChildren) - .await?; - - // XXX: Need to abstract over v6 and v4 entries here. - let mut nat_entries = vec![]; - - // All external IPs map to the primary network interface, so find that - // interface. If there is no such interface, there's no way to route - // traffic destined to those IPs, so there's nothing to configure and - // it's safe to return early. - let network_interface = match datastore - .derive_guest_network_interface_info(&opctx, &authz_instance) - .await? - .into_iter() - .find(|interface| interface.primary) - { - Some(interface) => interface, - None => { - info!(log, "Instance has no primary network interface"; - "instance_id" => %instance_id); - return Ok(nat_entries); - } - }; - - let mac_address = - macaddr::MacAddr6::from_str(&network_interface.mac.to_string()) - .map_err(|e| { - Error::internal_error(&format!( - "failed to convert mac address: {e}" - )) - })?; - - info!(log, "looking up instance's external IPs"; - "instance_id" => %instance_id); - - let ips = - datastore.instance_lookup_external_ips(&opctx, instance_id).await?; - - let (ips_of_interest, must_all_be_attached) = if let Some(wanted_id) = - ip_filter - { - if let Some(ip) = ips.iter().find(|v| v.id == wanted_id) { - (std::slice::from_ref(ip), false) - } else { - return Err(Error::internal_error(&format!( - "failed to find external ip address with id: {wanted_id}, saw {ips:?}", - ))); - } - } else { - (&ips[..], true) - }; - - // This is performed so that an IP attach/detach will block the - // instance_start saga. Return service unavailable to indicate - // the request is retryable. - if must_all_be_attached - && ips_of_interest.iter().any(|ip| ip.state != IpAttachState::Attached) - { - return Err(Error::unavail( - "cannot push all DPD state: IP attach/detach in progress", - )); - } - - let sled_address = - Ipv6Net(Ipv6Network::new(*sled_ip_address.ip(), 128).unwrap()); - - // If all of our IPs are attached or are guaranteed to be owned - // by the saga calling this fn, then we need to disregard and - // remove conflicting rows. 
No other instance/service should be - // using these as its own, and we are dealing with detritus, e.g., - // the case where we have a concurrent stop -> detach followed - // by an attach to another instance, or other ongoing attach saga - // cleanup. - let mut err_and_limit = None; - for (i, external_ip) in ips_of_interest.iter().enumerate() { - // For each external ip, add a nat entry to the database - if let Ok(id) = ensure_nat_entry( - datastore, - external_ip, - sled_address, - &network_interface, - mac_address, - opctx, - ) - .await - { - nat_entries.push(id); - continue; - } - - // We seem to be blocked by a bad row -- take it out and retry. - // This will return Ok() for a non-existent row. - if let Err(e) = external_ip_delete_dpd_config_inner( - datastore, - log, - opctx, - external_ip, - ) - .await - { - err_and_limit = Some((e, i)); - break; - }; - - match ensure_nat_entry( - datastore, - external_ip, - sled_address, - &network_interface, - mac_address, - opctx, - ) - .await - { - Ok(id) => nat_entries.push(id), - Err(e) => { - err_and_limit = Some((e, i)); - break; - } - } - } - - // In the event of an unresolvable failure, we need to remove - // the entries we just added because the undo won't call into - // `instance_delete_dpd_config`. These entries won't stop a - // future caller, but it's better not to pollute switch state. - if let Some((e, max)) = err_and_limit { - for external_ip in &ips_of_interest[..max] { - let _ = external_ip_delete_dpd_config_inner( - datastore, - log, - opctx, - external_ip, - ) - .await; - } - return Err(e); - } - - notify_dendrite_nat_state( - datastore, - log, - resolver, - opctx_alloc, - Some(instance_id), - true, - ) - .await?; - - Ok(nat_entries) -} - -/// Deletes an instance's OPTE V2P mappings and the boundary switch NAT -/// entries for its external IPs. -/// -/// This routine returns immediately upon encountering any errors (and will -/// not try to destroy any more objects after the point of failure). -async fn clear_instance_networking_state( - datastore: &DataStore, - log: &slog::Logger, - - resolver: &internal_dns::resolver::Resolver, - opctx: &OpContext, - opctx_alloc: &OpContext, - authz_instance: &authz::Instance, -) -> Result<(), Error> { - delete_instance_v2p_mappings( - datastore, - log, - opctx, - opctx_alloc, - authz_instance.id(), - ) - .await?; - - instance_delete_dpd_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - authz_instance, - ) - .await?; - - notify_dendrite_nat_state( - datastore, - log, - resolver, - opctx_alloc, - Some(authz_instance.id()), - true, - ) - .await -} - -/// Ensures that V2P mappings exist that indicate that the instance with ID -/// `instance_id` is resident on the sled with ID `sled_id`. -pub(crate) async fn create_instance_v2p_mappings( - datastore: &DataStore, - log: &slog::Logger, - opctx: &OpContext, - opctx_alloc: &OpContext, - instance_id: Uuid, - sled_id: Uuid, -) -> Result<(), Error> { - info!(log, "creating V2P mappings for instance"; - "instance_id" => %instance_id, - "sled_id" => %sled_id); - - // For every sled that isn't the sled this instance was allocated to, create - // a virtual to physical mapping for each of this instance's NICs. - // - // For the mappings to be correct, a few invariants must hold: - // - // - mappings must be set whenever an instance's sled changes (eg. 
- // during instance creation, migration, stop + start) - // - // - an instances' sled must not change while its corresponding mappings - // are being created - // - // - the same mapping creation must be broadcast to all sleds - // - // A more targeted approach would be to see what other instances share - // the VPC this instance is in (or more generally, what instances should - // have connectivity to this one), see what sleds those are allocated - // to, and only create V2P mappings for those sleds. - // - // There's additional work with this approach: - // - // - it means that delete calls are required as well as set calls, - // meaning that now the ordering of those matters (this may also - // necessitate a generation number for V2P mappings) - // - // - V2P mappings have to be bidirectional in order for both instances's - // packets to make a round trip. This isn't a problem with the - // broadcast approach because one of the sides will exist already, but - // it is something to orchestrate with a more targeted approach. - // - // TODO-correctness Default firewall rules currently will block - // instances in different VPCs from connecting to each other. If it ever - // stops doing this, the broadcast approach will create V2P mappings - // that shouldn't exist. - let (.., authz_instance) = LookupPath::new(&opctx, &datastore) - .instance_id(instance_id) - .lookup_for(authz::Action::Read) - .await?; - - let instance_nics = datastore - .derive_guest_network_interface_info(&opctx, &authz_instance) - .await?; - - // Look up the supplied sled's physical host IP. - let physical_host_ip = - nexus_networking::sled_lookup(&datastore, &opctx_alloc, sled_id)? - .fetch() - .await? - .1 - .ip - .into(); - - let mut last_sled_id: Option = None; - loop { - let pagparams = DataPageParams { - marker: last_sled_id.as_ref(), - direction: dropshot::PaginationOrder::Ascending, - limit: std::num::NonZeroU32::new(10).unwrap(), - }; - - let sleds_page = datastore.sled_list(&opctx_alloc, &pagparams).await?; - let mut join_handles = - Vec::with_capacity(sleds_page.len() * instance_nics.len()); - - for sled in &sleds_page { - // set_v2p not required for sled instance was allocated to, OPTE - // currently does that automatically - // - // TODO(#3107): Remove this when XDE stops creating mappings - // implicitly. - if sled.id() == sled_id { - continue; - } - - for nic in &instance_nics { - let client = nexus_networking::sled_client( - datastore, - opctx_alloc, - sled.id(), - log, - ) - .await?; - let nic_id = nic.id; - let mapping = SetVirtualNetworkInterfaceHost { - virtual_ip: nic.ip, - virtual_mac: nic.mac, - physical_host_ip, - vni: nic.vni, - }; - - let log = log.clone(); - - // This function is idempotent: calling the set_v2p ioctl with - // the same information is a no-op. - join_handles.push(tokio::spawn(futures::future::lazy( - move |_ctx| async move { - retry_until_known_result(&log, || async { - client.set_v2p(&nic_id, &mapping).await - }) - .await - }, - ))); - } - } - - // Concurrently run each future to completion, but return the last - // error seen. - let mut error = None; - for join_handle in join_handles { - let result = join_handle - .await - .map_err(|e| Error::internal_error(&e.to_string()))? 
- .await; - - if result.is_err() { - error!(log, "{:?}", result); - error = Some(result); - } - } - if let Some(e) = error { - return e.map(|_| ()).map_err(|e| e.into()); - } - - if sleds_page.len() < 10 { - break; - } - - if let Some(last) = sleds_page.last() { - last_sled_id = Some(last.id()); - } - } - - Ok(()) -} - -/// Ensure that the necessary v2p mappings for an instance are deleted -pub(crate) async fn delete_instance_v2p_mappings( - datastore: &DataStore, - log: &slog::Logger, - opctx: &OpContext, - opctx_alloc: &OpContext, - instance_id: Uuid, -) -> Result<(), Error> { - // For every sled that isn't the sled this instance was allocated to, delete - // the virtual to physical mapping for each of this instance's NICs. If - // there isn't a V2P mapping, del_v2p should be a no-op. - let (.., authz_instance) = LookupPath::new(&opctx, datastore) - .instance_id(instance_id) - .lookup_for(authz::Action::Read) - .await?; - - let instance_nics = datastore - .derive_guest_network_interface_info(&opctx, &authz_instance) - .await?; - - let mut last_sled_id: Option = None; - - loop { - let pagparams = DataPageParams { - marker: last_sled_id.as_ref(), - direction: dropshot::PaginationOrder::Ascending, - limit: std::num::NonZeroU32::new(10).unwrap(), - }; - - let sleds_page = datastore.sled_list(&opctx_alloc, &pagparams).await?; - let mut join_handles = - Vec::with_capacity(sleds_page.len() * instance_nics.len()); - - for sled in &sleds_page { - for nic in &instance_nics { - let client = nexus_networking::sled_client( - &datastore, - &opctx_alloc, - sled.id(), - &log, - ) - .await?; - let nic_id = nic.id; - let mapping = DeleteVirtualNetworkInterfaceHost { - virtual_ip: nic.ip, - vni: nic.vni, - }; - - let log = log.clone(); - - // This function is idempotent: calling the set_v2p ioctl with - // the same information is a no-op. - join_handles.push(tokio::spawn(futures::future::lazy( - move |_ctx| async move { - retry_until_known_result(&log, || async { - client.del_v2p(&nic_id, &mapping).await - }) - .await - }, - ))); - } - } - - // Concurrently run each future to completion, but return the last - // error seen. - let mut error = None; - for join_handle in join_handles { - let result = join_handle - .await - .map_err(|e| Error::internal_error(&e.to_string()))? - .await; - - if result.is_err() { - error!(log, "{:?}", result); - error = Some(result); - } - } - if let Some(e) = error { - return e.map(|_| ()).map_err(|e| e.into()); - } - - if sleds_page.len() < 10 { - break; - } - - if let Some(last) = sleds_page.last() { - last_sled_id = Some(last.id()); - } - } - - Ok(()) -} - -/// Attempts to delete all of the Dendrite NAT configuration for the -/// instance identified by `authz_instance`. -/// -/// Unlike `instance_ensure_dpd_config`, this function will disregard the -/// attachment states of any external IPs because likely callers (instance -/// delete) cannot be piecewise undone. -/// -/// # Return value -/// -/// - `Ok(())` if all NAT entries were successfully deleted. -/// - If an operation fails before this routine begins to walk and delete -/// individual NAT entries, this routine returns `Err` and reports that -/// error. -/// - If an operation fails while this routine is walking NAT entries, it -/// will continue trying to delete subsequent entries but will return the -/// first error it encountered. -/// - `ip_filter`: An optional filter on the index into the instance's -/// external IP array. 
-/// - If this is `Some(id)`, this routine configures DPD state for only the -/// external IP with `id` in the collection returned from CRDB. -/// - If this is `None`, this routine configures DPD for all external -/// IPs. -pub(crate) async fn instance_delete_dpd_config( - datastore: &DataStore, - log: &slog::Logger, - resolver: &internal_dns::resolver::Resolver, - opctx: &OpContext, - opctx_alloc: &OpContext, - authz_instance: &authz::Instance, -) -> Result<(), Error> { - let instance_id = authz_instance.id(); - - info!(log, "deleting instance dpd configuration"; - "instance_id" => %instance_id); - - let external_ips = - datastore.instance_lookup_external_ips(opctx, instance_id).await?; - - for entry in external_ips { - external_ip_delete_dpd_config_inner(&datastore, &log, opctx, &entry) - .await?; - } - - notify_dendrite_nat_state( - datastore, - log, - resolver, - opctx_alloc, - Some(instance_id), - false, - ) - .await -} - -/// Soft-delete an individual external IP from the NAT RPW, without -/// triggering a Dendrite notification. -async fn external_ip_delete_dpd_config_inner( - datastore: &DataStore, - log: &slog::Logger, - opctx: &OpContext, - external_ip: &ExternalIp, -) -> Result<(), Error> { - // Soft delete the NAT entry - match datastore.ipv4_nat_delete_by_external_ip(&opctx, external_ip).await { - Ok(_) => Ok(()), - Err(err) => match err { - Error::ObjectNotFound { .. } => { - warn!(log, "no matching nat entries to soft delete"); - Ok(()) - } - _ => { - let message = - format!("failed to delete nat entry due to error: {err:?}"); - error!(log, "{}", message); - Err(Error::internal_error(&message)) - } - }, - } -} - -/// Informs all available boundary switches that the set of NAT entries -/// has changed. -/// -/// When `fail_fast` is set, this function will return on any error when -/// acquiring a handle to a DPD client. Otherwise, it will attempt to notify -/// all clients and then finally return the first error. -async fn notify_dendrite_nat_state( - datastore: &DataStore, - log: &slog::Logger, - resolver: &internal_dns::resolver::Resolver, - opctx_alloc: &OpContext, - instance_id: Option, - fail_fast: bool, -) -> Result<(), Error> { - // Querying boundary switches also requires fleet access and the use of the - // instance allocator context. - let boundary_switches = boundary_switches(datastore, opctx_alloc).await?; - - let mut errors = vec![]; - for switch in &boundary_switches { - debug!(log, "notifying dendrite of updates"; - "instance_id" => ?instance_id, - "switch" => switch.to_string()); - - let clients = super::dpd_clients(resolver, log).await.map_err(|e| { - Error::internal_error(&format!("failed to get dpd clients: {e}")) - })?; - let client_result = clients.get(switch).ok_or_else(|| { - Error::internal_error(&format!( - "unable to find dendrite client for {switch}" - )) - }); - - let dpd_client = match client_result { - Ok(client) => client, - Err(new_error) => { - errors.push(new_error); - if fail_fast { - break; - } else { - continue; - } - } - }; - - // Notify dendrite that there are changes for it to reconcile. - // In the event of a failure to notify dendrite, we'll log an error - // and rely on dendrite's RPW timer to catch it up. 
- if let Err(e) = dpd_client.ipv4_nat_trigger_update().await { - error!(log, "failed to notify dendrite of nat updates"; "error" => ?e); - }; - } - - if let Some(e) = errors.into_iter().next() { - return Err(e); - } - - Ok(()) -} - async fn ensure_nat_entry( datastore: &DataStore, target_ip: &nexus_db_model::ExternalIp, From aa354e74f4ad43e8704986631f3cfd5cf0aa571a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 25 Apr 2024 13:11:23 -0700 Subject: [PATCH 17/69] actually use sleds table correctly --- .../src/db/datastore/sled_instance.rs | 5 +---- nexus/src/app/background/instance_watcher.rs | 17 ++++++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/sled_instance.rs b/nexus/db-queries/src/db/datastore/sled_instance.rs index dd3aac9f8e..a070a30756 100644 --- a/nexus/db-queries/src/db/datastore/sled_instance.rs +++ b/nexus/db-queries/src/db/datastore/sled_instance.rs @@ -10,13 +10,10 @@ use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use nexus_db_model::ApplySledFilterExt; -use nexus_db_model::DbTypedUuid; use nexus_db_model::SledInstance; use nexus_types::deployment::SledFilter; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::ListResultVec; -use omicron_uuid_kinds::CollectionUuid; -use omicron_uuid_kinds::SledKind; use uuid::Uuid; impl DataStore { @@ -41,7 +38,7 @@ impl DataStore { pub async fn sled_instance_list_by_sled_agent( &self, opctx: &OpContext, - pagparams: &DataPageParams<'_, DbTypedUuid>, + pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec<(Sled, SledInstance)> { // TODO(eliza): should probably paginate this? use crate::db::schema::{sled::dsl as sled_dsl, sled_instance::dsl}; diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 742e92c0d7..cf0ca4605a 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -6,7 +6,7 @@ use super::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; -use nexus_db_model::SledInstance; +use nexus_db_model::{Sled, SledInstance}; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; @@ -166,9 +166,8 @@ impl BackgroundTask for InstanceWatcher { paginator = p.found_batch(&batch, &|(sled, _)| sled.id()); let mut batch = batch.into_iter(); - if let Some((mut curr_sled_agent, sled_instance)) = batch.next() - { - let mk_client = |sled| { + if let Some((mut curr_sled, sled_instance)) = batch.next() { + let mk_client = |sled: &Sled| { nexus_networking::sled_client_from_address( sled.id(), sled.address(), @@ -176,18 +175,18 @@ impl BackgroundTask for InstanceWatcher { ) }; - let mut client = mk_client(&curr_sled_agent); + let mut client = mk_client(&curr_sled); tasks.spawn(self.check_instance( opctx, &client, sled_instance, )); - for (sled_agent, sled_instance) in batch { + for (sled, sled_instance) in batch { // We're now talking to a new sled agent; update the client. 
- if sled_agent.sled_id != curr_sled_agent.sled_id { - client = mk_client(&sled_agent); - curr_sled_agent = sled_agent; + if sled.id() != curr_sled.id() { + client = mk_client(&sled); + curr_sled = sled; } tasks.spawn(self.check_instance( opctx, From eacb233e1fb1a4178215a0b81b660e7531b90dcf Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 25 Apr 2024 14:36:32 -0700 Subject: [PATCH 18/69] wip --- nexus/src/app/background/instance_watcher.rs | 99 +++++++------------- sled-agent/src/sled_agent.rs | 41 ++++---- 2 files changed, 55 insertions(+), 85 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index cf0ca4605a..2711fffcdf 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -5,14 +5,13 @@ //! Background task for pulling instance state from sled-agents. use super::common::BackgroundTask; +use crate::Error; use futures::{future::BoxFuture, FutureExt}; use nexus_db_model::{Sled, SledInstance}; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; -use omicron_common::backoff::{self, BackoffError}; -use serde_json::json; use sled_agent_client::Client as SledAgentClient; use std::future::Future; use std::num::NonZeroU32; @@ -23,7 +22,6 @@ use std::sync::Arc; pub(crate) struct InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, - max_retries: NonZeroU32, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -37,7 +35,7 @@ impl InstanceWatcher { resolver: internal_dns::resolver::Resolver, max_retries: NonZeroU32, ) -> Self { - Self { datastore, resolver, max_retries } + Self { datastore, resolver } } fn check_instance( @@ -45,7 +43,7 @@ impl InstanceWatcher { opctx: &OpContext, client: &SledAgentClient, instance: SledInstance, - ) -> impl Future + Send + 'static { + ) -> impl Future> + Send + 'static { let instance_id = instance.instance_id(); let watcher = self.clone(); let opctx = opctx.child( @@ -58,63 +56,30 @@ impl InstanceWatcher { let client = client.clone(); async move { - let InstanceWatcher { datastore, resolver, max_retries } = watcher; + let InstanceWatcher { datastore, resolver } = watcher; slog::trace!(opctx.log, "checking on instance..."); - let backoff = backoff::retry_policy_internal_service(); - let mut retries = 0; - let rsp = backoff::retry_notify( - backoff, - || async { - let rsp = client - .instance_get_state(&instance.instance_id()) - .await; - match rsp { - Ok(rsp) => Ok(rsp.into_inner()), - Err(e) if retries == max_retries.get() => { - Err(BackoffError::Permanent(e)) - } - Err( - e @ ClientError::InvalidRequest(_) - | e @ ClientError::InvalidUpgrade(_) - | e @ ClientError::UnexpectedResponse(_) - | e @ ClientError::PreHookError(_), - ) => Err(BackoffError::Permanent(e)), - Err(e) => Err(BackoffError::transient(e)), - } - }, - |err, duration| { - slog::info!( - opctx.log, - "instance check failed; retrying: {err}"; - "duration" => ?duration, - "retries_remaining" => max_retries.get() - retries, - ); - }, - ) - .await; + let rsp = client.instance_get_state(&instance_id).await; let state = match rsp { - Ok(state) => state, - Err(error) => { - // TODO(eliza): here is where it gets interesting --- if the - // sled-agent is in a bad state, we need to: - // 1. figure out whether the instance's VMM is reachable directly - // 2. figure out whether we can recover the sled agent? - // 3. 
if the instances' VMMs are also gone, mark them as - // "failed" - // 4. this might mean that the whole sled is super gone, - // figure that out too. - // - // for now though, we'll just log a really big error. - slog::error!( + Ok(rsp) => rsp.into_inner(), + Err(ClientError::ErrorResponse(rsp)) + if rsp.status() == http::StatusCode::NOT_FOUND + && rsp.as_ref().error_code.as_deref() + == Some("NO_SUCH_INSTANCE") => + { + slog::info!(opctx.log, "instance is wayyyyy gone"); + todo!(); + } + Err(e) => { + slog::warn!( opctx.log, - "instance seems to be in a bad state: {error}" + "error checking up on instance: {e}" ); - return; + return Err(e.into()); } }; slog::debug!(opctx.log, "updating instance state: {state:?}"); - let result = crate::app::instance::notify_instance_updated( + crate::app::instance::notify_instance_updated( &datastore, &resolver, &opctx, @@ -123,18 +88,13 @@ impl InstanceWatcher { &instance_id, &state.into(), ) - .await; - match result { - Ok(_) => slog::debug!(opctx.log, "instance state updated"), - Err(e) => slog::error!( - opctx.log, - "failed to update instance state: {e}" - ), - } + .await } } } +struct CheckResult {} + type ClientError = sled_agent_client::Error; impl BackgroundTask for InstanceWatcher { @@ -198,20 +158,27 @@ impl BackgroundTask for InstanceWatcher { } // All requests fired off, let's wait for them to come back. + let mut ok = 0; while let Some(result) = tasks.join_next().await { - if let Err(e) = result { - unreachable!( + match result { + Ok(Ok(())) => { + ok += 1; + } + Err(e) => unreachable!( "a `JoinError` is returned if a spawned task \ panics, or if the task is aborted. we never abort \ tasks on this `JoinSet`, and nexus is compiled with \ `panic=\"abort\"`, so neither of these cases should \ ever occur: {e}", - ); + ), + Ok(Err(e)) => {} } } slog::trace!(opctx.log, "all instance checks complete"); - serde_json::json!({}) + serde_json::json!({ + "num_ok": ok, + }) } .boxed() } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index d987c6fa1b..39a5647420 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -171,16 +171,15 @@ impl From for omicron_common::api::external::Error { impl From for dropshot::HttpError { fn from(err: Error) -> Self { match err { - Error::Instance(instance_manager_error) => { - match instance_manager_error { - crate::instance_manager::Error::Instance( - instance_error, - ) => match instance_error { - crate::instance::Error::Propolis(propolis_error) => { - // Work around dropshot#693: HttpError::for_status - // only accepts client errors and asserts on server - // errors, so convert server errors by hand. - match propolis_error.status() { + Error::Instance(crate::instance_manager::Error::Instance( + instance_error, + )) => { + match instance_error { + crate::instance::Error::Propolis(propolis_error) => { + // Work around dropshot#693: HttpError::for_status + // only accepts client errors and asserts on server + // errors, so convert server errors by hand. + match propolis_error.status() { None => HttpError::for_internal_error( propolis_error.to_string(), ), @@ -196,18 +195,22 @@ impl From for dropshot::HttpError { HttpError::for_internal_error(propolis_error.to_string()), } } - } - crate::instance::Error::Transition(omicron_error) => { - // Preserve the status associated with the wrapped - // Omicron error so that Nexus will see it in the - // Progenitor client error it gets back. 
- HttpError::from(omicron_error) - } - e => HttpError::for_internal_error(e.to_string()), - }, + } + crate::instance::Error::Transition(omicron_error) => { + // Preserve the status associated with the wrapped + // Omicron error so that Nexus will see it in the + // Progenitor client error it gets back. + HttpError::from(omicron_error) + } e => HttpError::for_internal_error(e.to_string()), } } + Error::Instance( + e @ crate::instance_manager::Error::NoSuchInstance(_), + ) => HttpError::for_not_found( + Some("NO_SUCH_INSTANCE".to_string()), + e.to_string(), + ), Error::ZoneBundle(ref inner) => match inner { BundleError::NoStorage | BundleError::Unavailable { .. } => { HttpError::for_unavail(None, inner.to_string()) From 0df165d330bc3eacb4ff7a6c43ecb5310d8aade7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 25 Apr 2024 15:08:34 -0700 Subject: [PATCH 19/69] new thing --- nexus-config/src/nexus_config.rs | 4 -- nexus/src/app/background/instance_watcher.rs | 67 ++++++++++++++++---- nexus/src/app/instance.rs | 19 ++++-- smf/nexus/single-sled/config-partial.toml | 1 - 4 files changed, 71 insertions(+), 20 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index abd633fec2..5d449d4826 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -528,10 +528,6 @@ pub struct InstanceWatcherConfig { /// period (in seconds) for periodic activations of this background task #[serde_as(as = "DurationSeconds")] pub period_secs: Duration, - - /// maximum number of retries to attempt before considering a sled-agent - /// dead. - pub max_retries: NonZeroU32, } /// Configuration for a nexus server diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 2711fffcdf..ff0a8a1d99 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -5,6 +5,7 @@ //! Background task for pulling instance state from sled-agents. use super::common::BackgroundTask; +use crate::app::instance::InstanceUpdated; use crate::Error; use futures::{future::BoxFuture, FutureExt}; use nexus_db_model::{Sled, SledInstance}; @@ -33,7 +34,6 @@ impl InstanceWatcher { pub(crate) fn new( datastore: Arc, resolver: internal_dns::resolver::Resolver, - max_retries: NonZeroU32, ) -> Self { Self { datastore, resolver } } @@ -43,7 +43,10 @@ impl InstanceWatcher { opctx: &OpContext, client: &SledAgentClient, instance: SledInstance, - ) -> impl Future> + Send + 'static { + ) -> impl Future< + Output = Result, + > + Send + + 'static { let instance_id = instance.instance_id(); let watcher = self.clone(); let opctx = opctx.child( @@ -66,7 +69,7 @@ impl InstanceWatcher { && rsp.as_ref().error_code.as_deref() == Some("NO_SUCH_INSTANCE") => { - slog::info!(opctx.log, "instance is wayyyyy gone"); + slog::debug!(opctx.log, "instance is wayyyyy gone"); todo!(); } Err(e) => { @@ -74,7 +77,7 @@ impl InstanceWatcher { opctx.log, "error checking up on instance: {e}" ); - return Err(e.into()); + return Err(CheckError::SledAgent); } }; @@ -89,11 +92,17 @@ impl InstanceWatcher { &state.into(), ) .await + .map_err(|_| CheckError::Update)? + .ok_or(CheckError::NotFound) } } } -struct CheckResult {} +enum CheckError { + SledAgent, + Update, + NotFound, +} type ClientError = sled_agent_client::Error; @@ -158,12 +167,35 @@ impl BackgroundTask for InstanceWatcher { } // All requests fired off, let's wait for them to come back. 
- let mut ok = 0; + let mut total = 0; + let mut instances_updated = 0; + let mut vmms_updated = 0; + let mut no_change = 0; + let mut not_found = 0; + let mut sled_agent_errors = 0; + let mut update_errors = 0; while let Some(result) = tasks.join_next().await { + total += 1; match result { - Ok(Ok(())) => { - ok += 1; + Ok(Ok(InstanceUpdated { + vmm_updated, + instance_updated, + })) => { + if instance_updated { + instances_updated += 1; + } + + if vmm_updated { + vmms_updated += 1; + } + + if !(vmm_updated || instance_updated) { + no_change += 1; + } } + Ok(Err(CheckError::NotFound)) => not_found += 1, + Ok(Err(CheckError::SledAgent)) => sled_agent_errors += 1, + Ok(Err(CheckError::Update)) => update_errors += 1, Err(e) => unreachable!( "a `JoinError` is returned if a spawned task \ panics, or if the task is aborted. we never abort \ @@ -171,13 +203,26 @@ impl BackgroundTask for InstanceWatcher { `panic=\"abort\"`, so neither of these cases should \ ever occur: {e}", ), - Ok(Err(e)) => {} } } - slog::trace!(opctx.log, "all instance checks complete"); + slog::info!(opctx.log, "all instance checks complete"; + "total_instances" => ?total, + "instances_updated" => ?instances_updated, + "vmms_updated" => ?vmms_updated, + "no_change" => ?no_change, + "not_found" => ?not_found, + "sled_agent_errors" => ?sled_agent_errors, + "update_errors" => ?update_errors, + ); serde_json::json!({ - "num_ok": ok, + "total_instances": total, + "instances_updated": instances_updated, + "vmms_updated": vmms_updated, + "no_change": no_change, + "not_found": not_found, + "sled_agent_errors": sled_agent_errors, + "update_errors": update_errors, }) } .boxed() diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index b64757b690..50b46c8e8d 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1523,7 +1523,8 @@ impl super::Nexus { instance_id, new_runtime_state, ) - .await + .await?; + Ok(()) } /// Returns the requested range of serial console output bytes, @@ -1952,6 +1953,16 @@ impl super::Nexus { } } +/// Records what aspects of an instance's state were actually changed in a +/// [`notify_instance_updated`] call. +/// +/// This is (presently) used for debugging purposes only. +#[derive(Copy, Clone)] +pub(crate) struct InstanceUpdated { + pub instance_updated: bool, + pub vmm_updated: bool, +} + /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. pub(crate) async fn notify_instance_updated( @@ -1962,7 +1973,7 @@ pub(crate) async fn notify_instance_updated( log: &slog::Logger, instance_id: &Uuid, new_runtime_state: &nexus::SledInstanceState, -) -> Result<(), Error> { +) -> Result, Error> { let propolis_id = new_runtime_state.propolis_id; info!(log, "received new runtime state from sled agent"; @@ -2103,7 +2114,7 @@ pub(crate) async fn notify_instance_updated( "propolis_id" => %propolis_id, "instance_updated" => instance_updated, "vmm_updated" => vmm_updated); - Ok(()) + Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) } // The update command should swallow object-not-found errors and @@ -2114,7 +2125,7 @@ pub(crate) async fn notify_instance_updated( an object not found error"; "instance_id" => %instance_id, "propolis_id" => %propolis_id); - Ok(()) + Ok(None) } // If the datastore is unavailable, propagate that to the caller. 
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 607af64fce..4f61609b4e 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -56,7 +56,6 @@ sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 instance_watcher.period_secs = 30 -instance_watcher.max_retries = 5 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From 06950fe19a16b524588f8b94a7093325dd8d4a5e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 25 Apr 2024 15:23:47 -0700 Subject: [PATCH 20/69] remove unused import --- nexus-config/src/nexus_config.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 5d449d4826..86429a3957 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -27,7 +27,6 @@ use std::collections::HashMap; use std::fmt; use std::net::IpAddr; use std::net::SocketAddr; -use std::num::NonZeroU32; use std::time::Duration; use uuid::Uuid; From c49780d0a96dd5ff5091b3601c7bc35b31d11dcb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 29 Apr 2024 10:17:51 -0700 Subject: [PATCH 21/69] rm max retries --- nexus-config/src/nexus_config.rs | 4 +--- nexus/src/app/background/init.rs | 1 - nexus/src/app/background/instance_watcher.rs | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 86429a3957..7f49c2792c 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -895,8 +895,7 @@ mod test { }, instance_watcher: InstanceWatcherConfig { period_secs: Duration::from_secs(30), - max_retries: NonZeroU32::new(5).unwrap(), - } + }, }, default_region_allocation_strategy: crate::nexus_config::RegionAllocationStrategy::Random { @@ -964,7 +963,6 @@ mod test { switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 instance_watcher.period_secs = 30 - instance_watcher.max_retries = 10 [default_region_allocation_strategy] type = "random" "##, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 384b55c2d9..be770b9287 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -349,7 +349,6 @@ impl BackgroundTasks { let watcher = instance_watcher::InstanceWatcher::new( datastore, resolver.clone(), - config.instance_watcher.max_retries, ); driver.register( "instance_watcher".to_string(), diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index ff0a8a1d99..02d937fe07 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -6,7 +6,6 @@ use super::common::BackgroundTask; use crate::app::instance::InstanceUpdated; -use crate::Error; use futures::{future::BoxFuture, FutureExt}; use nexus_db_model::{Sled, SledInstance}; use nexus_db_queries::context::OpContext; From 5ed818feee37daf845972b09405fffc3bfc82a98 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 29 Apr 2024 10:27:51 -0700 Subject: [PATCH 22/69] fix config tests --- nexus-config/src/nexus_config.rs | 1 + nexus/examples/config.toml | 2 ++ nexus/tests/config.test.toml | 1 + 3 files changed, 4 insertions(+) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 7f49c2792c..1bcc885471 100644 --- 
a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -765,6 +765,7 @@ mod test { sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 + instance_watcher.period_secs = 30 [default_region_allocation_strategy] type = "random" seed = 0 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index f7c5e44cf0..8a46f649a5 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -113,6 +113,8 @@ blueprints.period_secs_execute = 60 sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 +# How frequently to query the status of active instances. +instance_watcher.period_secs = 30 [default_region_allocation_strategy] # allocate region on 3 random distinct zpools, on 3 random distinct sleds. diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 94cf34ee41..dad5797b9f 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -109,6 +109,7 @@ blueprints.period_secs_execute = 600 sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 +instance_watcher.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the From fa5fd029f567e186b28349de3b463e89f1944847 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Apr 2024 09:28:15 -0700 Subject: [PATCH 23/69] wip metrics --- nexus/src/app/background/init.rs | 3 + nexus/src/app/background/instance_watcher.rs | 246 ++++++++++++++++++- nexus/src/app/mod.rs | 1 + 3 files changed, 239 insertions(+), 11 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index be770b9287..2df4fa94cc 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -28,6 +28,7 @@ use nexus_config::DnsTasksConfig; use nexus_db_model::DnsGroup; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use oximeter::types::ProducerRegistry; use std::collections::BTreeMap; use std::sync::Arc; use tokio::sync::mpsc::Sender; @@ -107,6 +108,7 @@ impl BackgroundTasks { nexus_id: Uuid, resolver: internal_dns::resolver::Resolver, saga_request: Sender, + producer_registry: &ProducerRegistry, ) -> BackgroundTasks { let mut driver = common::Driver::new(); @@ -349,6 +351,7 @@ impl BackgroundTasks { let watcher = instance_watcher::InstanceWatcher::new( datastore, resolver.clone(), + producer_registry, ); driver.register( "instance_watcher".to_string(), diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 02d937fe07..224df432da 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -7,11 +7,13 @@ use super::common::BackgroundTask; use crate::app::instance::InstanceUpdated; use futures::{future::BoxFuture, FutureExt}; +use http::StatusCode; use nexus_db_model::{Sled, SledInstance}; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; +use oximeter::types::ProducerRegistry; use sled_agent_client::Client as SledAgentClient; use std::future::Future; use std::num::NonZeroU32; @@ -33,6 +35,7 @@ impl InstanceWatcher { pub(crate) fn new( datastore: Arc, resolver: internal_dns::resolver::Resolver, + producer_registry: &ProducerRegistry, ) 
-> Self { Self { datastore, resolver } } @@ -43,7 +46,7 @@ impl InstanceWatcher { client: &SledAgentClient, instance: SledInstance, ) -> impl Future< - Output = Result, + Output = CheckResult, > + Send + 'static { let instance_id = instance.instance_id(); @@ -58,7 +61,12 @@ impl InstanceWatcher { let client = client.clone(); async move { - let InstanceWatcher { datastore, resolver } = watcher; + let InstanceWatcher { datastore, resolver } = watcher; + let target = CheckTarget { + sled_agent_ip: client.address().ip(), + sled_agent_port: client.address().port(), + instance, + }; slog::trace!(opctx.log, "checking on instance..."); let rsp = client.instance_get_state(&instance_id).await; let state = match rsp { @@ -69,14 +77,22 @@ impl InstanceWatcher { == Some("NO_SUCH_INSTANCE") => { slog::debug!(opctx.log, "instance is wayyyyy gone"); - todo!(); + return CheckResult { target, check_failure: Some(CheckFailure::NoSuchInstance), update_failure: None }; } Err(e) => { + let status = e.status(); slog::warn!( opctx.log, - "error checking up on instance: {e}" + "error checking up on instance"; + "error" => ?e, + "status" => ?status, ); - return Err(CheckError::SledAgent); + if let Some(status) = status { + let check_failure = Some(CheckFailure::SledAgentResponse(status)); + return CheckResult { check_failure, update_failure: None }; + } else { + match e. + } } }; @@ -91,18 +107,43 @@ impl InstanceWatcher { &state.into(), ) .await - .map_err(|_| CheckError::Update)? - .ok_or(CheckError::NotFound) + .map_err(|_| CheckError::UpdateFailed)? + .ok_or(CheckError::UnknownInstance) } } } -enum CheckError { - SledAgent, - Update, - NotFound, +struct CheckTarget { + sled_agent_ip: IpAddr, + sled_agent_port: u16, + instance: SledInstance, } +struct CheckResult { + target: CheckTarget, + instance_updated: Option, + check_failure: Option, + update_failure: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum CheckFailure { + SledAgentUnreachable, + SledAgentResponse(StatusCode), + NoSuchInstance, + Other, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum UpdateFailure { + ClientError, + InstanceNotFound, + UpdateFailed, + Other, +} + +// impl + type ClientError = sled_agent_client::Error; impl BackgroundTask for InstanceWatcher { @@ -227,3 +268,186 @@ impl BackgroundTask for InstanceWatcher { .boxed() } } + +mod metrics { + use super::{CheckFailure, InstanceUpdated, UpdateFailure}; + use oximeter::types::Cumulative; + use oximeter::Metric; + use std::collections::BTreeMap; + use std::net::IpAddr; + use uuid::Uuid; + + pub(super) struct Metrics { + sled_agents: BTreeMap, + } + + type SledAgent = BTreeMap; + + pub(super) struct Instance { + no_update: InstanceChecks, + instance_updated: InstanceChecks, + vmm_updated: InstanceChecks, + both_updated: InstanceChecks, + check_failures: BTreeMap, + update_failures: BTreeMap, + touched: bool, + } + + impl Metrics { + pub fn instance( + &mut self, + sled_id: Uuid, + sled_ip: IpAddr, + sled_port: u16, + instance_id: Uuid, + ) -> &mut Instance { + self.sled_agents + .entry(sled_id) + .or_default() + .entry(instance_id) + .or_insert_with(|| Instance { + no_update: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: false, + vmm_updated: false, + datum: Cumulative::default(), + }, + instance_updated: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: true, + vmm_updated: false, + 
datum: Cumulative::default(), + }, + vmm_updated: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: false, + vmm_updated: true, + datum: Cumulative::default(), + }, + both_updated: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: true, + vmm_updated: true, + datum: Cumulative::default(), + }, + check_failures: BTreeMap::new(), + update_failures: BTreeMap::new(), + touched: false, + }) + } + } + + impl Instance { + pub fn success(&mut self, updated: InstanceUpdated) { + match updated { + InstanceUpdated { instance_updated: true, vmm_updated: true } => self.both_updated.datum += 1, + InstanceUpdated { instance_updated: true, vmm_updated: false } => self.instance_updated.datum += 1, + InstanceUpdated { instance_updated: false, vmm_updated: true } => self.vmm_updated.datum += 1, + InstanceUpdated { instance_updated: false, vmm_updated: false } => self.no_update.datum += 1, + } + self.touched = true; + } + + pub fn check_failure(&mut self, reason: CheckFailure) { + self.check_failures + .entry(reason) + .or_insert_with(|| InstanceCheckFailures { + instance_id: self.no_update.instance_id, + sled_agent_id: self.no_update.sled_agent_id, + sled_agent_ip: self.no_update.sled_agent_ip, + sled_agent_port: self.no_update.sled_agent_port, + reason: reason.to_string(), + datum: Cumulative::default(), + }) + .datum += 1; + self.touched = true; + } + + pub fn update_failure(&mut self, reason: UpdateFailure) { + self.update_failures + .entry(reason) + .or_insert_with(|| InstanceUpdateFailures { + instance_id: self.no_update.instance_id, + sled_agent_id: self.no_update.sled_agent_id, + sled_agent_ip: self.no_update.sled_agent_ip, + sled_agent_port: self.no_update.sled_agent_port, + reason: reason.as_str(), + datum: Cumulative::default(), + }) + .datum += 1; + self.touched = true; + } + } + + /// The number of successful checks for a single instance and sled agent + /// pair. + #[derive(Clone, Debug, Metric)] + struct InstanceChecks { + /// The instance's ID. + instance_id: Uuid, + /// The sled-agent's ID. + sled_agent_id: Uuid, + /// The sled agent's IP address. + sled_agent_ip: IpAddr, + /// The sled agent's port. + sled_agent_port: u16, + instance_updated: bool, + vmm_updated: bool, + /// The number of successful checks for this instance and sled agent. + datum: Cumulative, + } + + /// The number of failed checks for an instance and sled agent pair. + #[derive(Clone, Debug, Metric)] + struct InstanceCheckFailures { + /// The instance's ID. + instance_id: Uuid, + /// The sled-agent's ID. + sled_agent_id: Uuid, + /// The sled agent's IP address. + sled_agent_ip: IpAddr, + /// The sled agent's port. + sled_agent_port: u16, + /// The reason why the check failed. + /// + /// # Note + /// This must always be generated from a `CheckFailure` enum. + reason: String, + /// The number of failed checks for this instance and sled agent. + datum: Cumulative, + } + + /// The number of failed instance updates for an instance and sled agent pair. + #[derive(Clone, Debug, Metric)] + struct InstanceUpdateFailures { + /// The instance's ID. + instance_id: Uuid, + /// The sled-agent's ID. + sled_agent_id: Uuid, + /// The sled agent's IP address. + sled_agent_ip: IpAddr, + /// The sled agent's port. + sled_agent_port: u16, + /// The reason why the check failed. + /// + /// # Note + /// This must always be generated from a `CheckFailure` enum. 
+ // TODO(eliza): it would be nice if this was a `oximeter::FieldType`: + // From<&str>` impl, so that this could be a `&'static str`. + reason: String, + /// The number of failed checks for this instance and sled agent. + datum: Cumulative, + } +} diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index a2cbe2f7ae..833950aaec 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -386,6 +386,7 @@ impl Nexus { config.deployment.id, resolver.clone(), saga_request, + producer_registry, ); let external_resolver = { From 9263bcb27608b2478fb5bdb72e22279c1312b938 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Apr 2024 11:19:28 -0700 Subject: [PATCH 24/69] more metric plumbing --- nexus/src/app/background/instance_watcher.rs | 460 ++++++++++++++----- 1 file changed, 338 insertions(+), 122 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 224df432da..bb5ff8b540 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -15,15 +15,19 @@ use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use oximeter::types::ProducerRegistry; use sled_agent_client::Client as SledAgentClient; +use std::fmt; use std::future::Future; +use std::net::IpAddr; use std::num::NonZeroU32; use std::sync::Arc; +use std::sync::Mutex; /// Background task that periodically checks instance states. #[derive(Clone)] pub(crate) struct InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, + metrics: Arc>, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -37,20 +41,25 @@ impl InstanceWatcher { resolver: internal_dns::resolver::Resolver, producer_registry: &ProducerRegistry, ) -> Self { - Self { datastore, resolver } + let (metrics, producer) = metrics::Metrics::new(); + producer_registry.register_producer(producer); + Self { datastore, resolver, metrics } } fn check_instance( &self, opctx: &OpContext, + sled: &Sled, client: &SledAgentClient, instance: SledInstance, - ) -> impl Future< - Output = CheckResult, - > + Send - + 'static { + ) -> impl Future + Send + 'static { let instance_id = instance.instance_id(); let watcher = self.clone(); + let target = CheckTarget { + sled_agent_ip: std::net::Ipv6Addr::from(sled.ip).into(), + sled_agent_port: sled.port.into(), + instance, + }; let opctx = opctx.child( std::iter::once(( "instance_id".to_string(), @@ -61,43 +70,77 @@ impl InstanceWatcher { let client = client.clone(); async move { - let InstanceWatcher { datastore, resolver } = watcher; - let target = CheckTarget { - sled_agent_ip: client.address().ip(), - sled_agent_port: client.address().port(), - instance, - }; + let InstanceWatcher { datastore, resolver, .. 
} = watcher; slog::trace!(opctx.log, "checking on instance..."); let rsp = client.instance_get_state(&instance_id).await; let state = match rsp { Ok(rsp) => rsp.into_inner(), - Err(ClientError::ErrorResponse(rsp)) - if rsp.status() == http::StatusCode::NOT_FOUND + Err(ClientError::ErrorResponse(rsp)) => { + let status = rsp.status(); + if status == StatusCode::NOT_FOUND && rsp.as_ref().error_code.as_deref() - == Some("NO_SUCH_INSTANCE") => - { - slog::debug!(opctx.log, "instance is wayyyyy gone"); - return CheckResult { target, check_failure: Some(CheckFailure::NoSuchInstance), update_failure: None }; + == Some("NO_SUCH_INSTANCE") + { + slog::info!(opctx.log, "instance is wayyyyy gone"); + return CheckResult { + target, + check_failure: Some(CheckFailure::NoSuchInstance), + update_failure: None, + instance_updated: None, + }; + } + if status.is_client_error() { + slog::warn!(opctx.log, "check failed due to client error"; + "status" => ?status, "error" => ?rsp.into_inner()); + return CheckResult { + target, + check_failure: None, + update_failure: Some( + UpdateFailure::ClientHttpError(status), + ), + instance_updated: None, + }; + } + + slog::info!(opctx.log, "check failed due to server error"; + "status" => ?status, "error" => ?rsp.into_inner()); + + return CheckResult { + target, + check_failure: Some(CheckFailure::SledAgentResponse( + status, + )), + update_failure: None, + instance_updated: None, + }; + } + Err(ClientError::CommunicationError(e)) => { + slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e); + return CheckResult { + target, + check_failure: Some(CheckFailure::SledAgentUnreachable), + update_failure: None, + instance_updated: None, + }; } Err(e) => { - let status = e.status(); slog::warn!( opctx.log, "error checking up on instance"; "error" => ?e, - "status" => ?status, + "status" => ?e.status(), ); - if let Some(status) = status { - let check_failure = Some(CheckFailure::SledAgentResponse(status)); - return CheckResult { check_failure, update_failure: None }; - } else { - match e. - } + return CheckResult { + target, + check_failure: None, + update_failure: Some(UpdateFailure::ClientError), + instance_updated: None, + }; } }; slog::debug!(opctx.log, "updating instance state: {state:?}"); - crate::app::instance::notify_instance_updated( + let update_result = crate::app::instance::notify_instance_updated( &datastore, &resolver, &opctx, @@ -107,8 +150,28 @@ impl InstanceWatcher { &state.into(), ) .await - .map_err(|_| CheckError::UpdateFailed)? 
- .ok_or(CheckError::UnknownInstance) + .map_err(|_| UpdateFailure::UpdateFailed) + .and_then(|updated| updated.ok_or(UpdateFailure::InstanceNotFound)); + match update_result { + Ok(updated) => { + slog::debug!(opctx.log, "update successful"; "instance_updated" => updated.instance_updated, "vmm_updated" => updated.vmm_updated); + CheckResult { + target, + instance_updated: Some(updated), + check_failure: None, + update_failure: None, + } + } + Err(e) => { + slog::warn!(opctx.log, "error updating instance"; "error" => ?e); + CheckResult { + target, + instance_updated: None, + check_failure: None, + update_failure: Some(e), + } + } + } } } } @@ -121,28 +184,48 @@ struct CheckTarget { struct CheckResult { target: CheckTarget, - instance_updated: Option, + instance_updated: Option, check_failure: Option, update_failure: Option, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] enum CheckFailure { SledAgentUnreachable, SledAgentResponse(StatusCode), NoSuchInstance, - Other, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] enum UpdateFailure { + ClientHttpError(StatusCode), ClientError, InstanceNotFound, UpdateFailed, - Other, } -// impl +impl fmt::Display for CheckFailure { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::SledAgentUnreachable => f.write_str("unreachable"), + Self::SledAgentResponse(status) => { + write!(f, "{status}") + } + Self::NoSuchInstance => f.write_str("no_such_instance"), + } + } +} + +impl fmt::Display for UpdateFailure { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::ClientHttpError(status) => write!(f, "{status}"), + Self::ClientError => f.write_str("client_error"), + Self::InstanceNotFound => f.write_str("instance_not_found"), + Self::UpdateFailed => f.write_str("update_failed"), + } + } +} type ClientError = sled_agent_client::Error; @@ -187,6 +270,7 @@ impl BackgroundTask for InstanceWatcher { let mut client = mk_client(&curr_sled); tasks.spawn(self.check_instance( opctx, + &curr_sled, &client, sled_instance, )); @@ -199,6 +283,7 @@ impl BackgroundTask for InstanceWatcher { } tasks.spawn(self.check_instance( opctx, + &curr_sled, &client, sled_instance, )); @@ -206,54 +291,67 @@ impl BackgroundTask for InstanceWatcher { } } - // All requests fired off, let's wait for them to come back. - let mut total = 0; - let mut instances_updated = 0; - let mut vmms_updated = 0; - let mut no_change = 0; - let mut not_found = 0; - let mut sled_agent_errors = 0; - let mut update_errors = 0; + // All requests fired off! While we wait for them to come back, + // let's prune old instances. + let pruned = self.metrics.lock().unwrap().prune(); + + // Now, wait for the check results to come back. + let mut total: usize = 0; + let mut instances_updated: usize = 0; + let mut vmms_updated: usize = 0; + let mut no_change: usize = 0; + let mut not_found: usize = 0; + let mut sled_agent_errors: usize = 0; + let mut update_errors: usize = 0; while let Some(result) = tasks.join_next().await { total += 1; - match result { - Ok(Ok(InstanceUpdated { - vmm_updated, - instance_updated, - })) => { - if instance_updated { - instances_updated += 1; - } + let CheckResult { + instance_updated, + check_failure, + update_failure, + .. + } = result.expect( + "a `JoinError` is returned if a spawned task \ + panics, or if the task is aborted. 
we never abort \ + tasks on this `JoinSet`, and nexus is compiled with \ + `panic=\"abort\"`, so neither of these cases should \ + ever occur", + ); + if let Some(InstanceUpdated { vmm_updated, instance_updated }) = + instance_updated + { + if instance_updated { + instances_updated += 1; + } - if vmm_updated { - vmms_updated += 1; - } + if vmm_updated { + vmms_updated += 1; + } - if !(vmm_updated || instance_updated) { - no_change += 1; - } + if !(vmm_updated || instance_updated) { + no_change += 1; } - Ok(Err(CheckError::NotFound)) => not_found += 1, - Ok(Err(CheckError::SledAgent)) => sled_agent_errors += 1, - Ok(Err(CheckError::Update)) => update_errors += 1, - Err(e) => unreachable!( - "a `JoinError` is returned if a spawned task \ - panics, or if the task is aborted. we never abort \ - tasks on this `JoinSet`, and nexus is compiled with \ - `panic=\"abort\"`, so neither of these cases should \ - ever occur: {e}", - ), + } + if let Some(failure) = check_failure { + match failure { + CheckFailure::NoSuchInstance => not_found += 1, + _ => sled_agent_errors += 1, + } + } + if update_failure.is_some() { + update_errors += 1; } } slog::info!(opctx.log, "all instance checks complete"; - "total_instances" => ?total, - "instances_updated" => ?instances_updated, - "vmms_updated" => ?vmms_updated, - "no_change" => ?no_change, - "not_found" => ?not_found, - "sled_agent_errors" => ?sled_agent_errors, - "update_errors" => ?update_errors, + "total_instances" => total, + "instances_updated" => instances_updated, + "vmms_updated" => vmms_updated, + "no_change" => no_change, + "not_found" => not_found, + "sled_agent_errors" => sled_agent_errors, + "update_errors" => update_errors, + "pruned_instances" => pruned, ); serde_json::json!({ "total_instances": total, @@ -263,6 +361,7 @@ impl BackgroundTask for InstanceWatcher { "not_found": not_found, "sled_agent_errors": sled_agent_errors, "update_errors": update_errors, + "pruned_instances": pruned, }) } .boxed() @@ -272,17 +371,32 @@ impl BackgroundTask for InstanceWatcher { mod metrics { use super::{CheckFailure, InstanceUpdated, UpdateFailure}; use oximeter::types::Cumulative; + use oximeter::types::ProducerResultsItem; use oximeter::Metric; + use oximeter::MetricsError; + use oximeter::Sample; + use oximeter::Target; use std::collections::BTreeMap; use std::net::IpAddr; + use std::sync::Arc; + use std::sync::Mutex; use uuid::Uuid; + #[derive(Debug)] pub(super) struct Metrics { sled_agents: BTreeMap, + instance_count: usize, + } + + #[derive(Debug)] + pub(super) struct Producer { + metrics: Arc>, + target: InstanceWatcherTarget, } type SledAgent = BTreeMap; + #[derive(Debug)] pub(super) struct Instance { no_update: InstanceChecks, instance_updated: InstanceChecks, @@ -294,73 +408,148 @@ mod metrics { } impl Metrics { - pub fn instance( + pub fn new() -> (Arc>, Producer) { + let metrics = Arc::new(Mutex::new(Self { + sled_agents: BTreeMap::new(), + instance_count: 0, + })); + let producer = Producer { + metrics: metrics.clone(), + target: InstanceWatcherTarget { + name: "instance-watcher".to_string(), + }, + }; + (metrics, producer) + } + + pub(crate) fn instance( &mut self, sled_id: Uuid, sled_ip: IpAddr, sled_port: u16, instance_id: Uuid, ) -> &mut Instance { + let count = &mut self.instance_count; self.sled_agents .entry(sled_id) .or_default() .entry(instance_id) - .or_insert_with(|| Instance { - no_update: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: sled_ip, - sled_agent_port: sled_port, - instance_updated: false, - 
vmm_updated: false, - datum: Cumulative::default(), - }, - instance_updated: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: sled_ip, - sled_agent_port: sled_port, - instance_updated: true, - vmm_updated: false, - datum: Cumulative::default(), - }, - vmm_updated: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: sled_ip, - sled_agent_port: sled_port, - instance_updated: false, - vmm_updated: true, - datum: Cumulative::default(), - }, - both_updated: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: sled_ip, - sled_agent_port: sled_port, - instance_updated: true, - vmm_updated: true, - datum: Cumulative::default(), - }, - check_failures: BTreeMap::new(), - update_failures: BTreeMap::new(), - touched: false, + .or_insert_with(|| { + *count += 1; + Instance { + no_update: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: false, + vmm_updated: false, + datum: Cumulative::default(), + }, + instance_updated: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: true, + vmm_updated: false, + datum: Cumulative::default(), + }, + vmm_updated: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: false, + vmm_updated: true, + datum: Cumulative::default(), + }, + both_updated: InstanceChecks { + instance_id, + sled_agent_id: sled_id, + sled_agent_ip: sled_ip, + sled_agent_port: sled_port, + instance_updated: true, + vmm_updated: true, + datum: Cumulative::default(), + }, + check_failures: BTreeMap::new(), + update_failures: BTreeMap::new(), + touched: false, + } }) } + + pub(super) fn sample( + &self, + target: &impl Target, + ) -> impl IntoIterator { + let mut v = Vec::with_capacity(self.instance_count); + for sled in self.sled_agents.values() { + for instance in sled.values() { + if instance.touched { + v.push(match instance.sample(target) { + Ok(samples) => ProducerResultsItem::Ok(samples), + Err(e) => ProducerResultsItem::Err(e), + }); + } + } + } + v + } + + pub(super) fn prune(&mut self) -> usize { + let mut pruned = 0; + self.sled_agents.retain(|_, sled| { + sled.retain(|_, instance| { + let touched = + std::mem::replace(&mut instance.touched, false); + if !touched { + pruned += 1; + } + touched + }); + !sled.is_empty() + }); + self.instance_count -= pruned; + pruned + } + } + + impl oximeter::Producer for Producer { + fn produce( + &mut self, + ) -> Result>, MetricsError> { + Box::new( + self.metrics.lock().unwrap().sample(&self.target).into_iter(), + ) + } } impl Instance { - pub fn success(&mut self, updated: InstanceUpdated) { + pub(super) fn success(&mut self, updated: InstanceUpdated) { match updated { - InstanceUpdated { instance_updated: true, vmm_updated: true } => self.both_updated.datum += 1, - InstanceUpdated { instance_updated: true, vmm_updated: false } => self.instance_updated.datum += 1, - InstanceUpdated { instance_updated: false, vmm_updated: true } => self.vmm_updated.datum += 1, - InstanceUpdated { instance_updated: false, vmm_updated: false } => self.no_update.datum += 1, + InstanceUpdated { + instance_updated: true, + vmm_updated: true, + } => self.both_updated.datum += 1, + InstanceUpdated { + instance_updated: true, + vmm_updated: false, + } => self.instance_updated.datum += 1, + InstanceUpdated { + instance_updated: false, + vmm_updated: true, + } => 
self.vmm_updated.datum += 1, + InstanceUpdated { + instance_updated: false, + vmm_updated: false, + } => self.no_update.datum += 1, } self.touched = true; } - pub fn check_failure(&mut self, reason: CheckFailure) { + pub(super) fn check_failure(&mut self, reason: CheckFailure) { self.check_failures .entry(reason) .or_insert_with(|| InstanceCheckFailures { @@ -375,7 +564,7 @@ mod metrics { self.touched = true; } - pub fn update_failure(&mut self, reason: UpdateFailure) { + pub(super) fn update_failure(&mut self, reason: UpdateFailure) { self.update_failures .entry(reason) .or_insert_with(|| InstanceUpdateFailures { @@ -383,12 +572,39 @@ mod metrics { sled_agent_id: self.no_update.sled_agent_id, sled_agent_ip: self.no_update.sled_agent_ip, sled_agent_port: self.no_update.sled_agent_port, - reason: reason.as_str(), + reason: reason.to_string(), datum: Cumulative::default(), }) .datum += 1; self.touched = true; } + + fn len(&self) -> usize { + 4 + self.check_failures.len() + self.update_failures.len() + } + + fn sample( + &self, + target: &impl Target, + ) -> Result, MetricsError> { + let mut v = Vec::with_capacity(self.len()); + v.push(Sample::new(target, &self.no_update)?); + v.push(Sample::new(target, &self.instance_updated)?); + v.push(Sample::new(target, &self.vmm_updated)?); + v.push(Sample::new(target, &self.both_updated)?); + for metric in self.check_failures.values() { + v.push(Sample::new(target, metric)?); + } + for metric in self.update_failures.values() { + v.push(Sample::new(target, metric)?) + } + Ok(v) + } + } + + #[derive(Clone, Debug, Target)] + struct InstanceWatcherTarget { + name: String, } /// The number of successful checks for a single instance and sled agent From ddb5792ed79902fb05e5ef09b25f89c13ee0d189 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Apr 2024 11:52:56 -0700 Subject: [PATCH 25/69] redo metrics --- nexus/src/app/background/instance_watcher.rs | 266 +++++++------------ 1 file changed, 89 insertions(+), 177 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index bb5ff8b540..1c7d07978b 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -21,6 +21,7 @@ use std::net::IpAddr; use std::num::NonZeroU32; use std::sync::Arc; use std::sync::Mutex; +use uuid::Uuid; /// Background task that periodically checks instance states. #[derive(Clone)] @@ -41,8 +42,8 @@ impl InstanceWatcher { resolver: internal_dns::resolver::Resolver, producer_registry: &ProducerRegistry, ) -> Self { - let (metrics, producer) = metrics::Metrics::new(); - producer_registry.register_producer(producer); + let metrics = Arc::new(Mutex::new(metrics::Metrics::default())); + producer_registry.register_producer(metrics::Producer(metrics.clone())); Self { datastore, resolver, metrics } } @@ -55,10 +56,11 @@ impl InstanceWatcher { ) -> impl Future + Send + 'static { let instance_id = instance.instance_id(); let watcher = self.clone(); - let target = CheckTarget { + let target = InstanceTarget { + instance_id, + sled_agent_id: sled.id(), sled_agent_ip: std::net::Ipv6Addr::from(sled.ip).into(), sled_agent_port: sled.port.into(), - instance, }; let opctx = opctx.child( std::iter::once(( @@ -176,14 +178,22 @@ impl InstanceWatcher { } } -struct CheckTarget { +#[derive( + Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, oximeter::Target, +)] +struct InstanceTarget { + /// The instance's ID. + instance_id: Uuid, + /// The sled-agent's ID. 
+ sled_agent_id: Uuid, + /// The sled agent's IP address. sled_agent_ip: IpAddr, + /// The sled agent's port. sled_agent_port: u16, - instance: SledInstance, } struct CheckResult { - target: CheckTarget, + target: InstanceTarget, instance_updated: Option, check_failure: Option, update_failure: Option, @@ -306,6 +316,7 @@ impl BackgroundTask for InstanceWatcher { while let Some(result) = tasks.join_next().await { total += 1; let CheckResult { + target, instance_updated, check_failure, update_failure, @@ -317,28 +328,31 @@ impl BackgroundTask for InstanceWatcher { `panic=\"abort\"`, so neither of these cases should \ ever occur", ); - if let Some(InstanceUpdated { vmm_updated, instance_updated }) = - instance_updated - { - if instance_updated { + let metric = self.metrics.lock().unwrap().instance(target); + if let Some(up) = instance_updated { + if up.instance_updated { instances_updated += 1; } - if vmm_updated { + if up.vmm_updated { vmms_updated += 1; } - if !(vmm_updated || instance_updated) { + if !(up.vmm_updated || up.instance_updated) { no_change += 1; } + metric.success(up); } - if let Some(failure) = check_failure { - match failure { + if let Some(reason) = check_failure { + match reason { CheckFailure::NoSuchInstance => not_found += 1, _ => sled_agent_errors += 1, } + + metric.check_failure(reason); } - if update_failure.is_some() { + if let Some(reason) = update_failure { + metric.update_failure(reason); update_errors += 1; } } @@ -369,32 +383,22 @@ impl BackgroundTask for InstanceWatcher { } mod metrics { - use super::{CheckFailure, InstanceUpdated, UpdateFailure}; + use super::{CheckFailure, InstanceTarget, InstanceUpdated, UpdateFailure}; use oximeter::types::Cumulative; - use oximeter::types::ProducerResultsItem; use oximeter::Metric; use oximeter::MetricsError; use oximeter::Sample; - use oximeter::Target; use std::collections::BTreeMap; - use std::net::IpAddr; use std::sync::Arc; use std::sync::Mutex; - use uuid::Uuid; - #[derive(Debug)] + #[derive(Debug, Default)] pub(super) struct Metrics { - sled_agents: BTreeMap, - instance_count: usize, + instances: BTreeMap, } #[derive(Debug)] - pub(super) struct Producer { - metrics: Arc>, - target: InstanceWatcherTarget, - } - - type SledAgent = BTreeMap; + pub(super) struct Producer(pub(super) Arc>); #[derive(Debug)] pub(super) struct Instance { @@ -408,121 +412,64 @@ mod metrics { } impl Metrics { - pub fn new() -> (Arc>, Producer) { - let metrics = Arc::new(Mutex::new(Self { - sled_agents: BTreeMap::new(), - instance_count: 0, - })); - let producer = Producer { - metrics: metrics.clone(), - target: InstanceWatcherTarget { - name: "instance-watcher".to_string(), - }, - }; - (metrics, producer) - } - pub(crate) fn instance( &mut self, - sled_id: Uuid, - sled_ip: IpAddr, - sled_port: u16, - instance_id: Uuid, + instance: InstanceTarget, ) -> &mut Instance { - let count = &mut self.instance_count; - self.sled_agents - .entry(sled_id) - .or_default() - .entry(instance_id) - .or_insert_with(|| { - *count += 1; - Instance { - no_update: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: sled_ip, - sled_agent_port: sled_port, - instance_updated: false, - vmm_updated: false, - datum: Cumulative::default(), - }, - instance_updated: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: sled_ip, - sled_agent_port: sled_port, - instance_updated: true, - vmm_updated: false, - datum: Cumulative::default(), - }, - vmm_updated: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: 
sled_ip, - sled_agent_port: sled_port, - instance_updated: false, - vmm_updated: true, - datum: Cumulative::default(), - }, - both_updated: InstanceChecks { - instance_id, - sled_agent_id: sled_id, - sled_agent_ip: sled_ip, - sled_agent_port: sled_port, - instance_updated: true, - vmm_updated: true, - datum: Cumulative::default(), - }, - check_failures: BTreeMap::new(), - update_failures: BTreeMap::new(), - touched: false, - } - }) - } - - pub(super) fn sample( - &self, - target: &impl Target, - ) -> impl IntoIterator { - let mut v = Vec::with_capacity(self.instance_count); - for sled in self.sled_agents.values() { - for instance in sled.values() { - if instance.touched { - v.push(match instance.sample(target) { - Ok(samples) => ProducerResultsItem::Ok(samples), - Err(e) => ProducerResultsItem::Err(e), - }); - } - } - } - v + self.instances.entry(instance).or_insert_with(|| Instance { + no_update: InstanceChecks { + instance_updated: false, + vmm_updated: false, + datum: Cumulative::default(), + }, + instance_updated: InstanceChecks { + instance_updated: true, + vmm_updated: false, + datum: Cumulative::default(), + }, + vmm_updated: InstanceChecks { + instance_updated: false, + vmm_updated: true, + datum: Cumulative::default(), + }, + both_updated: InstanceChecks { + instance_updated: true, + vmm_updated: true, + datum: Cumulative::default(), + }, + check_failures: BTreeMap::new(), + update_failures: BTreeMap::new(), + touched: false, + }) } pub(super) fn prune(&mut self) -> usize { let mut pruned = 0; - self.sled_agents.retain(|_, sled| { - sled.retain(|_, instance| { - let touched = - std::mem::replace(&mut instance.touched, false); - if !touched { - pruned += 1; - } - touched - }); - !sled.is_empty() + self.instances.retain(|_, instance| { + let touched = std::mem::replace(&mut instance.touched, false); + if !touched { + pruned += 1; + } + touched }); - self.instance_count -= pruned; pruned } + + fn len(&self) -> usize { + self.instances.values().map(Instance::len).sum() + } } impl oximeter::Producer for Producer { fn produce( &mut self, ) -> Result>, MetricsError> { - Box::new( - self.metrics.lock().unwrap().sample(&self.target).into_iter(), - ) + let metrics = self.0.lock().unwrap(); + let mut v = Vec::with_capacity(metrics.len()); + for (target, instance) in &metrics.instances { + instance.sample_into(target, &mut v)?; + } + Ok(Box::new(v.into_iter())) } } @@ -553,10 +500,6 @@ mod metrics { self.check_failures .entry(reason) .or_insert_with(|| InstanceCheckFailures { - instance_id: self.no_update.instance_id, - sled_agent_id: self.no_update.sled_agent_id, - sled_agent_ip: self.no_update.sled_agent_ip, - sled_agent_port: self.no_update.sled_agent_port, reason: reason.to_string(), datum: Cumulative::default(), }) @@ -568,10 +511,6 @@ mod metrics { self.update_failures .entry(reason) .or_insert_with(|| InstanceUpdateFailures { - instance_id: self.no_update.instance_id, - sled_agent_id: self.no_update.sled_agent_id, - sled_agent_ip: self.no_update.sled_agent_ip, - sled_agent_port: self.no_update.sled_agent_port, reason: reason.to_string(), datum: Cumulative::default(), }) @@ -583,43 +522,32 @@ mod metrics { 4 + self.check_failures.len() + self.update_failures.len() } - fn sample( + fn sample_into( &self, - target: &impl Target, - ) -> Result, MetricsError> { - let mut v = Vec::with_capacity(self.len()); - v.push(Sample::new(target, &self.no_update)?); - v.push(Sample::new(target, &self.instance_updated)?); - v.push(Sample::new(target, &self.vmm_updated)?); - v.push(Sample::new(target, 
&self.both_updated)?); + target: &InstanceTarget, + dest: &mut Vec, + ) -> Result<(), MetricsError> { + dest.push(Sample::new(target, &self.no_update)?); + dest.push(Sample::new(target, &self.instance_updated)?); + dest.push(Sample::new(target, &self.vmm_updated)?); + dest.push(Sample::new(target, &self.both_updated)?); for metric in self.check_failures.values() { - v.push(Sample::new(target, metric)?); + dest.push(Sample::new(target, metric)?); } for metric in self.update_failures.values() { - v.push(Sample::new(target, metric)?) + dest.push(Sample::new(target, metric)?); } - Ok(v) + Ok(()) } } - #[derive(Clone, Debug, Target)] - struct InstanceWatcherTarget { - name: String, - } - /// The number of successful checks for a single instance and sled agent /// pair. #[derive(Clone, Debug, Metric)] struct InstanceChecks { - /// The instance's ID. - instance_id: Uuid, - /// The sled-agent's ID. - sled_agent_id: Uuid, - /// The sled agent's IP address. - sled_agent_ip: IpAddr, - /// The sled agent's port. - sled_agent_port: u16, + /// `true` if the instance state changed as a result of this check. instance_updated: bool, + /// `true` if the VMM state changed as a result of this check. vmm_updated: bool, /// The number of successful checks for this instance and sled agent. datum: Cumulative, @@ -628,14 +556,6 @@ mod metrics { /// The number of failed checks for an instance and sled agent pair. #[derive(Clone, Debug, Metric)] struct InstanceCheckFailures { - /// The instance's ID. - instance_id: Uuid, - /// The sled-agent's ID. - sled_agent_id: Uuid, - /// The sled agent's IP address. - sled_agent_ip: IpAddr, - /// The sled agent's port. - sled_agent_port: u16, /// The reason why the check failed. /// /// # Note @@ -648,14 +568,6 @@ mod metrics { /// The number of failed instance updates for an instance and sled agent pair. #[derive(Clone, Debug, Metric)] struct InstanceUpdateFailures { - /// The instance's ID. - instance_id: Uuid, - /// The sled-agent's ID. - sled_agent_id: Uuid, - /// The sled agent's IP address. - sled_agent_ip: IpAddr, - /// The sled agent's port. - sled_agent_port: u16, /// The reason why the check failed. 
/// /// # Note From 5d18a6948d7060e2a392cb21bd3eca435dd583ad Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Apr 2024 12:07:18 -0700 Subject: [PATCH 26/69] keep lock alive --- nexus/src/app/background/instance_watcher.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 1c7d07978b..e89c052886 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -328,7 +328,8 @@ impl BackgroundTask for InstanceWatcher { `panic=\"abort\"`, so neither of these cases should \ ever occur", ); - let metric = self.metrics.lock().unwrap().instance(target); + let mut metrics = self.metrics.lock().unwrap(); + let metric = metrics.instance(target); if let Some(up) = instance_updated { if up.instance_updated { instances_updated += 1; From bbc4fd052747e75afe3283938e74c504b702c074 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Apr 2024 12:47:33 -0700 Subject: [PATCH 27/69] whoops --- nexus/src/app/background/instance_watcher.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index e89c052886..42bfa88f60 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -43,7 +43,9 @@ impl InstanceWatcher { producer_registry: &ProducerRegistry, ) -> Self { let metrics = Arc::new(Mutex::new(metrics::Metrics::default())); - producer_registry.register_producer(metrics::Producer(metrics.clone())); + producer_registry + .register_producer(metrics::Producer(metrics.clone())) + .unwrap(); Self { datastore, resolver, metrics } } From dcc9fe967a3a64f1164e6bd651d49f04e0bdf303 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Apr 2024 14:43:53 -0700 Subject: [PATCH 28/69] docs etc --- nexus/src/app/background/instance_watcher.rs | 84 +++++++++++++++----- 1 file changed, 64 insertions(+), 20 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 42bfa88f60..fe3b20bd8d 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -86,10 +86,12 @@ impl InstanceWatcher { == Some("NO_SUCH_INSTANCE") { slog::info!(opctx.log, "instance is wayyyyy gone"); + // TODO(eliza): eventually, we should attempt to put the + // instance in the `Failed` state here. return CheckResult { target, check_failure: Some(CheckFailure::NoSuchInstance), - update_failure: None, + error: None, instance_updated: None, }; } @@ -99,9 +101,7 @@ impl InstanceWatcher { return CheckResult { target, check_failure: None, - update_failure: Some( - UpdateFailure::ClientHttpError(status), - ), + error: Some(CheckError::ClientHttpError(status)), instance_updated: None, }; } @@ -114,16 +114,20 @@ impl InstanceWatcher { check_failure: Some(CheckFailure::SledAgentResponse( status, )), - update_failure: None, + error: None, instance_updated: None, }; } Err(ClientError::CommunicationError(e)) => { + // TODO(eliza): eventually, we may want to transition the + // instance to the `Failed` state if the sled-agent has been + // unreachable for a while. We may also want to take other + // corrective actions or alert an operator in this case. 
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e); return CheckResult { target, check_failure: Some(CheckFailure::SledAgentUnreachable), - update_failure: None, + error: None, instance_updated: None, }; } @@ -137,7 +141,7 @@ impl InstanceWatcher { return CheckResult { target, check_failure: None, - update_failure: Some(UpdateFailure::ClientError), + error: Some(CheckError::ClientError), instance_updated: None, }; } @@ -154,8 +158,8 @@ impl InstanceWatcher { &state.into(), ) .await - .map_err(|_| UpdateFailure::UpdateFailed) - .and_then(|updated| updated.ok_or(UpdateFailure::InstanceNotFound)); + .map_err(|_| CheckError::UpdateFailed) + .and_then(|updated| updated.ok_or(CheckError::InstanceNotFound)); match update_result { Ok(updated) => { slog::debug!(opctx.log, "update successful"; "instance_updated" => updated.instance_updated, "vmm_updated" => updated.vmm_updated); @@ -163,7 +167,7 @@ impl InstanceWatcher { target, instance_updated: Some(updated), check_failure: None, - update_failure: None, + error: None, } } Err(e) => { @@ -172,7 +176,7 @@ impl InstanceWatcher { target, instance_updated: None, check_failure: None, - update_failure: Some(e), + error: Some(e), } } } @@ -196,23 +200,63 @@ struct InstanceTarget { struct CheckResult { target: InstanceTarget, + /// `Some` if the instance's state was up instance_updated: Option, + + /// `Some` if the instance check indicated that the instance is in a bad state. + /// + /// This is a result that indicates that something is *wrong* with either the + /// sled on which the instance is running, the sled-agent on that sled, or the + /// instance itself. This is distinct from a [`CheckError`], which indicates + /// that we were *unable* to check on the instance or update its state. check_failure: Option, - update_failure: Option, + + /// `Some` if the instance check was unsuccessful. + /// + /// This indicates that something went wrong *while performing the check* that + /// does not necessarily indicate that the instance itself is in a bad + /// state. For example, the sled-agent client may have constructed an + /// invalid request, or an error may have occurred while updating the + /// instance in the database. + /// + /// Depending on when the error occurred, the `CheckFailure` field may also + /// be populated. + error: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] enum CheckFailure { + /// The sled-agent for the sled on which the instance is running was + /// unreachable. + /// + /// This may indicate a network partition between us and that sled, that + /// the sled-agent process has crashed, or that the sled is down. SledAgentUnreachable, + /// The sled-agent responded with an unexpected HTTP error. SledAgentResponse(StatusCode), + /// The sled-agent indicated that it doesn't know about an instance ID that + /// we believe it *should* know about. This probably means the sled-agent, + /// and potentially the whole sled, has been restarted. NoSuchInstance, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -enum UpdateFailure { +enum CheckError { + /// The sled-agent responded with an HTTP client error, indicating that our + /// request as somehow malformed. ClientHttpError(StatusCode), + /// Something else went wrong while making an HTTP request. ClientError, + /// We attempted to update the instance state in the database, but no + /// instance with that UUID existed. 
+ /// + /// Because the instance UUIDs that we perform checks on come from querying + /// the instances table, this would probably indicate that the instance was + /// removed from the database between when we listed instances and when the + /// check completed. InstanceNotFound, + /// Something went wrong while updating the state of the instance in the + /// database. UpdateFailed, } @@ -228,7 +272,7 @@ impl fmt::Display for CheckFailure { } } -impl fmt::Display for UpdateFailure { +impl fmt::Display for CheckError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::ClientHttpError(status) => write!(f, "{status}"), @@ -321,7 +365,7 @@ impl BackgroundTask for InstanceWatcher { target, instance_updated, check_failure, - update_failure, + error: update_failure, .. } = result.expect( "a `JoinError` is returned if a spawned task \ @@ -386,7 +430,7 @@ impl BackgroundTask for InstanceWatcher { } mod metrics { - use super::{CheckFailure, InstanceTarget, InstanceUpdated, UpdateFailure}; + use super::{CheckError, CheckFailure, InstanceTarget, InstanceUpdated}; use oximeter::types::Cumulative; use oximeter::Metric; use oximeter::MetricsError; @@ -410,7 +454,7 @@ mod metrics { vmm_updated: InstanceChecks, both_updated: InstanceChecks, check_failures: BTreeMap, - update_failures: BTreeMap, + update_failures: BTreeMap, touched: bool, } @@ -510,10 +554,10 @@ mod metrics { self.touched = true; } - pub(super) fn update_failure(&mut self, reason: UpdateFailure) { + pub(super) fn update_failure(&mut self, reason: CheckError) { self.update_failures .entry(reason) - .or_insert_with(|| InstanceUpdateFailures { + .or_insert_with(|| InstanceCheckErrors { reason: reason.to_string(), datum: Cumulative::default(), }) @@ -570,7 +614,7 @@ mod metrics { /// The number of failed instance updates for an instance and sled agent pair. #[derive(Clone, Debug, Metric)] - struct InstanceUpdateFailures { + struct InstanceCheckErrors { /// The reason why the check failed. 
/// /// # Note From 30fd5524090db3e36574af704ecc54354fad6604 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Apr 2024 15:30:45 -0700 Subject: [PATCH 29/69] omdb test update --- dev-tools/omdb/tests/env.out | 12 ++++++++++++ dev-tools/omdb/tests/successes.out | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index c8605d38b2..469a99af7b 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -72,6 +72,10 @@ task: "external_endpoints" on each one +task: "instance_watcher" + periodically checks instance states + + task: "inventory_collection" collects hardware and software inventory data from the whole system @@ -174,6 +178,10 @@ task: "external_endpoints" on each one +task: "instance_watcher" + periodically checks instance states + + task: "inventory_collection" collects hardware and software inventory data from the whole system @@ -263,6 +271,10 @@ task: "external_endpoints" on each one +task: "instance_watcher" + periodically checks instance states + + task: "inventory_collection" collects hardware and software inventory data from the whole system diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 9dcf9ec61a..a2f12396ec 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -249,6 +249,10 @@ task: "external_endpoints" on each one +task: "instance_watcher" + periodically checks instance states + + task: "inventory_collection" collects hardware and software inventory data from the whole system @@ -391,6 +395,13 @@ task: "external_endpoints" TLS certificates: 0 +task: "instance_watcher" + configured period: every 30s + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms +warning: unknown background task: "instance_watcher" (don't know how to interpret details: Object {"instances_updated": Number(0), "no_change": Number(0), "not_found": Number(0), "pruned_instances": Number(0), "sled_agent_errors": Number(0), "total_instances": Number(0), "update_errors": Number(0), "vmms_updated": Number(0)}) + task: "inventory_collection" configured period: every 10m currently executing: no From 4407df9e0667c4dc213f6c53df16967f597c2560 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 1 May 2024 13:04:01 -0700 Subject: [PATCH 30/69] additional renamening --- nexus/src/app/background/instance_watcher.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index fe3b20bd8d..329813144f 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -358,7 +358,7 @@ impl BackgroundTask for InstanceWatcher { let mut no_change: usize = 0; let mut not_found: usize = 0; let mut sled_agent_errors: usize = 0; - let mut update_errors: usize = 0; + let mut check_errors: usize = 0; while let Some(result) = tasks.join_next().await { total += 1; let CheckResult { @@ -400,7 +400,7 @@ impl BackgroundTask for InstanceWatcher { } if let Some(reason) = update_failure { metric.update_failure(reason); - update_errors += 1; + check_errors += 1; } } @@ -411,7 +411,7 @@ impl BackgroundTask for InstanceWatcher { "no_change" => no_change, "not_found" => not_found, "sled_agent_errors" => sled_agent_errors, - "update_errors" => update_errors, + "check_errors" => check_errors, "pruned_instances" => pruned, ); 
serde_json::json!({ @@ -421,7 +421,7 @@ impl BackgroundTask for InstanceWatcher { "no_change": no_change, "not_found": not_found, "sled_agent_errors": sled_agent_errors, - "update_errors": update_errors, + "check_errors": check_errors, "pruned_instances": pruned, }) } From 3efa0ad6e5acaaa35ef4362401f7d5c67f9eb9db Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 1 May 2024 14:22:50 -0700 Subject: [PATCH 31/69] add nicer omdb support --- dev-tools/omdb/src/bin/omdb/nexus.rs | 77 ++++++++++++++++++++ dev-tools/omdb/tests/successes.out | 11 ++- nexus/src/app/background/instance_watcher.rs | 17 +++-- 3 files changed, 99 insertions(+), 6 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 11ee64a225..266b8417d3 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -889,6 +889,83 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { ); } }; + } else if name == "instance_watcher" { + #[derive(Deserialize)] + struct TaskSuccess { + /// total number of instances checked + total_instances: usize, + + /// number of instances whose check succeeded without a state + /// change + no_change: usize, + + /// number of instances whose state has changed + instances_updated: usize, + + /// number of instances whose VMM state has changed + vmms_updated: usize, + + /// number of instances which the sled-agent indicated no longer exists + not_found: usize, + + /// number of unexpected errors returned by sled-agent + sled_agent_errors: usize, + + /// number of instances for which the sled agent was unreachable + unreachable_instances: usize, + + /// number of checks that could not be completed successfully + check_errors: usize, + + /// number of stale instance metrics that were deleted. 
+ pruned_instances: usize, + } + + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(success) => { + println!( + " total instances checked: {}", + success.total_instances + ); + println!( + " checks completed successfully: {}", + success.total_instances - success.check_errors + ); + println!(" -> {} instances unchanged", success.no_change); + println!( + " -> {} instance states updated", + success.instances_updated + ); + println!( + " -> {} VMM states updated", + success.vmms_updated + ); + println!( + " -> {} instances no longer exist", + success.not_found + ); + println!( + " -> {} sled-agent errors", + success.sled_agent_errors + ); + println!( + " -> {} instances with unreachable sled-agents", + success.unreachable_instances + ); + println!( + " checks that could not be completed successfully: {}", + success.check_errors + ); + println!( + " stale instance metrics removed: {}", + success.pruned_instances + ) + } + }; } else { println!( "warning: unknown background task: {:?} \ diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index a2f12396ec..19c4227dfe 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -400,7 +400,16 @@ task: "instance_watcher" currently executing: no last completed activation: , triggered by an explicit signal started at (s ago) and ran for ms -warning: unknown background task: "instance_watcher" (don't know how to interpret details: Object {"instances_updated": Number(0), "no_change": Number(0), "not_found": Number(0), "pruned_instances": Number(0), "sled_agent_errors": Number(0), "total_instances": Number(0), "update_errors": Number(0), "vmms_updated": Number(0)}) + total instances checked: 0 + checks completed successfully: 0 + -> 0 instances unchanged + -> 0 instance states updated + -> 0 VMM states updated + -> 0 instances no longer exist + -> 0 sled-agent errors + -> 0 instances with unreachable sled-agents + checks that could not be completed successfully: 0 + stale instance metrics removed: 0 task: "inventory_collection" configured period: every 10m diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 329813144f..d5649e37af 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -357,6 +357,7 @@ impl BackgroundTask for InstanceWatcher { let mut vmms_updated: usize = 0; let mut no_change: usize = 0; let mut not_found: usize = 0; + let mut unreachable_instances: usize = 0; let mut sled_agent_errors: usize = 0; let mut check_errors: usize = 0; while let Some(result) = tasks.join_next().await { @@ -365,7 +366,7 @@ impl BackgroundTask for InstanceWatcher { target, instance_updated, check_failure, - error: update_failure, + error, .. 
} = result.expect( "a `JoinError` is returned if a spawned task \ @@ -393,13 +394,16 @@ impl BackgroundTask for InstanceWatcher { if let Some(reason) = check_failure { match reason { CheckFailure::NoSuchInstance => not_found += 1, + CheckFailure::SledAgentUnreachable => { + unreachable_instances += 1 + } _ => sled_agent_errors += 1, } metric.check_failure(reason); } - if let Some(reason) = update_failure { - metric.update_failure(reason); + if let Some(reason) = error { + metric.check_error(reason); check_errors += 1; } } @@ -407,6 +411,7 @@ impl BackgroundTask for InstanceWatcher { slog::info!(opctx.log, "all instance checks complete"; "total_instances" => total, "instances_updated" => instances_updated, + "unreachable_instances" => unreachable_instances, "vmms_updated" => vmms_updated, "no_change" => no_change, "not_found" => not_found, @@ -420,6 +425,7 @@ impl BackgroundTask for InstanceWatcher { "vmms_updated": vmms_updated, "no_change": no_change, "not_found": not_found, + "unreachable_instances": unreachable_instances, "sled_agent_errors": sled_agent_errors, "check_errors": check_errors, "pruned_instances": pruned, @@ -554,7 +560,7 @@ mod metrics { self.touched = true; } - pub(super) fn update_failure(&mut self, reason: CheckError) { + pub(super) fn check_error(&mut self, reason: CheckError) { self.update_failures .entry(reason) .or_insert_with(|| InstanceCheckErrors { @@ -612,7 +618,8 @@ mod metrics { datum: Cumulative, } - /// The number of failed instance updates for an instance and sled agent pair. + /// The number of instance checks that were unsuccessful for an instance and + /// sled agent. #[derive(Clone, Debug, Metric)] struct InstanceCheckErrors { /// The reason why the check failed. From 224c66aa14fc13495ac32f815f421959994bd7e5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 2 May 2024 11:49:40 -0700 Subject: [PATCH 32/69] redo metric structure per @bnaecker's feedback --- nexus/src/app/background/instance_watcher.rs | 385 ++++++++----------- 1 file changed, 154 insertions(+), 231 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index d5649e37af..da73d83b6b 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -5,7 +5,6 @@ //! Background task for pulling instance state from sled-agents. use super::common::BackgroundTask; -use crate::app::instance::InstanceUpdated; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; use nexus_db_model::{Sled, SledInstance}; @@ -13,8 +12,11 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; +use omicron_common::api::external::InstanceState; +use omicron_common::api::internal::nexus::SledInstanceState; use oximeter::types::ProducerRegistry; use sled_agent_client::Client as SledAgentClient; +use std::collections::BTreeMap; use std::fmt; use std::future::Future; use std::net::IpAddr; @@ -55,10 +57,10 @@ impl InstanceWatcher { sled: &Sled, client: &SledAgentClient, instance: SledInstance, - ) -> impl Future + Send + 'static { + ) -> impl Future + Send + 'static { let instance_id = instance.instance_id(); let watcher = self.clone(); - let target = InstanceTarget { + let target = VirtualMachine { instance_id, sled_agent_id: sled.id(), sled_agent_ip: std::net::Ipv6Addr::from(sled.ip).into(), @@ -77,6 +79,7 @@ impl InstanceWatcher { let InstanceWatcher { datastore, resolver, .. 
} = watcher; slog::trace!(opctx.log, "checking on instance..."); let rsp = client.instance_get_state(&instance_id).await; + let mut check = Check { target, outcome: None, result: Ok(()) }; let state = match rsp { Ok(rsp) => rsp.into_inner(), Err(ClientError::ErrorResponse(rsp)) => { @@ -88,35 +91,24 @@ impl InstanceWatcher { slog::info!(opctx.log, "instance is wayyyyy gone"); // TODO(eliza): eventually, we should attempt to put the // instance in the `Failed` state here. - return CheckResult { - target, - check_failure: Some(CheckFailure::NoSuchInstance), - error: None, - instance_updated: None, - }; + check.outcome = + Some(CheckOutcome::Failed(Failure::NoSuchInstance)); + return check; } if status.is_client_error() { slog::warn!(opctx.log, "check failed due to client error"; "status" => ?status, "error" => ?rsp.into_inner()); - return CheckResult { - target, - check_failure: None, - error: Some(CheckError::ClientHttpError(status)), - instance_updated: None, - }; - } - - slog::info!(opctx.log, "check failed due to server error"; + check.result = + Err(Incomplete::ClientHttpError(status.as_u16())); + } else { + slog::info!(opctx.log, "check failed due to server error"; "status" => ?status, "error" => ?rsp.into_inner()); + } - return CheckResult { - target, - check_failure: Some(CheckFailure::SledAgentResponse( - status, - )), - error: None, - instance_updated: None, - }; + check.outcome = Some(CheckOutcome::Failed( + Failure::SledAgentResponse(status.as_u16()), + )); + return check; } Err(ClientError::CommunicationError(e)) => { // TODO(eliza): eventually, we may want to transition the @@ -124,12 +116,10 @@ impl InstanceWatcher { // unreachable for a while. We may also want to take other // corrective actions or alert an operator in this case. slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e); - return CheckResult { - target, - check_failure: Some(CheckFailure::SledAgentUnreachable), - error: None, - instance_updated: None, - }; + check.outcome = Some(CheckOutcome::Failed( + Failure::SledAgentUnreachable, + )); + return check; } Err(e) => { slog::warn!( @@ -138,48 +128,42 @@ impl InstanceWatcher { "error" => ?e, "status" => ?e.status(), ); - return CheckResult { - target, - check_failure: None, - error: Some(CheckError::ClientError), - instance_updated: None, - }; + check.result = Err(Incomplete::ClientError); + return check; } }; - slog::debug!(opctx.log, "updating instance state: {state:?}"); - let update_result = crate::app::instance::notify_instance_updated( + let new_runtime_state: SledInstanceState = state.into(); + check.outcome = Some(CheckOutcome::Completed { + instance_state: new_runtime_state.vmm_state.state, + vmm_id: new_runtime_state.propolis_id, + }); + slog::debug!( + opctx.log, + "updating instance state: {new_runtime_state:?}" + ); + check.result = crate::app::instance::notify_instance_updated( &datastore, &resolver, &opctx, &opctx, &opctx.log, &instance_id, - &state.into(), + &new_runtime_state, ) .await - .map_err(|_| CheckError::UpdateFailed) - .and_then(|updated| updated.ok_or(CheckError::InstanceNotFound)); - match update_result { - Ok(updated) => { - slog::debug!(opctx.log, "update successful"; "instance_updated" => updated.instance_updated, "vmm_updated" => updated.vmm_updated); - CheckResult { - target, - instance_updated: Some(updated), - check_failure: None, - error: None, - } - } - Err(e) => { - slog::warn!(opctx.log, "error updating instance"; "error" => ?e); - CheckResult { - target, - instance_updated: None, - check_failure: None, - error: 
Some(e), - } - } - } + .map_err(|e| { + slog::warn!(opctx.log, "error updating instance"; "error" => ?e); + Incomplete::UpdateFailed + }) + .and_then(|updated| updated.ok_or_else(|| { + slog::warn!(opctx.log, "error updating instance: not found in database"); + Incomplete::InstanceNotFound + })).map(|updated| { + slog::debug!(opctx.log, "update successful"; "instance_updated" => updated.instance_updated, "vmm_updated" => updated.vmm_updated); + }); + + check } } } @@ -187,7 +171,7 @@ impl InstanceWatcher { #[derive( Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, oximeter::Target, )] -struct InstanceTarget { +struct VirtualMachine { /// The instance's ID. instance_id: Uuid, /// The sled-agent's ID. @@ -198,18 +182,11 @@ struct InstanceTarget { sled_agent_port: u16, } -struct CheckResult { - target: InstanceTarget, - /// `Some` if the instance's state was up - instance_updated: Option, +struct Check { + target: VirtualMachine, - /// `Some` if the instance check indicated that the instance is in a bad state. - /// - /// This is a result that indicates that something is *wrong* with either the - /// sled on which the instance is running, the sled-agent on that sled, or the - /// instance itself. This is distinct from a [`CheckError`], which indicates - /// that we were *unable* to check on the instance or update its state. - check_failure: Option, + /// The outcome of performing this check. + outcome: Option, /// `Some` if the instance check was unsuccessful. /// @@ -219,13 +196,21 @@ struct CheckResult { /// invalid request, or an error may have occurred while updating the /// instance in the database. /// - /// Depending on when the error occurred, the `CheckFailure` field may also + /// Depending on when the error occurred, the `outcome` field may also /// be populated. - error: Option, + result: Result<(), Incomplete>, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -enum CheckFailure { +enum CheckOutcome { + Completed { instance_state: InstanceState, vmm_id: Uuid }, + Failed(Failure), +} + +#[derive( + Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, +)] +enum Failure { /// The sled-agent for the sled on which the instance is running was /// unreachable. /// @@ -233,18 +218,20 @@ enum CheckFailure { /// the sled-agent process has crashed, or that the sled is down. SledAgentUnreachable, /// The sled-agent responded with an unexpected HTTP error. - SledAgentResponse(StatusCode), + SledAgentResponse(u16), /// The sled-agent indicated that it doesn't know about an instance ID that /// we believe it *should* know about. This probably means the sled-agent, /// and potentially the whole sled, has been restarted. NoSuchInstance, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -enum CheckError { +#[derive( + Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, +)] +enum Incomplete { /// The sled-agent responded with an HTTP client error, indicating that our /// request as somehow malformed. - ClientHttpError(StatusCode), + ClientHttpError(u16), /// Something else went wrong while making an HTTP request. 
ClientError, /// We attempted to update the instance state in the database, but no @@ -260,7 +247,7 @@ enum CheckError { UpdateFailed, } -impl fmt::Display for CheckFailure { +impl fmt::Display for Failure { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::SledAgentUnreachable => f.write_str("unreachable"), @@ -272,7 +259,7 @@ impl fmt::Display for CheckFailure { } } -impl fmt::Display for CheckError { +impl fmt::Display for Incomplete { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::ClientHttpError(status) => write!(f, "{status}"), @@ -353,22 +340,14 @@ impl BackgroundTask for InstanceWatcher { // Now, wait for the check results to come back. let mut total: usize = 0; - let mut instances_updated: usize = 0; - let mut vmms_updated: usize = 0; - let mut no_change: usize = 0; - let mut not_found: usize = 0; - let mut unreachable_instances: usize = 0; - let mut sled_agent_errors: usize = 0; - let mut check_errors: usize = 0; + let mut instance_states: BTreeMap = + BTreeMap::new(); + let mut check_failures: BTreeMap = + BTreeMap::new(); + let mut check_errors: BTreeMap = BTreeMap::new(); while let Some(result) = tasks.join_next().await { total += 1; - let CheckResult { - target, - instance_updated, - check_failure, - error, - .. - } = result.expect( + let Check { target, outcome, result } = result.expect( "a `JoinError` is returned if a spawned task \ panics, or if the task is aborted. we never abort \ tasks on this `JoinSet`, and nexus is compiled with \ @@ -377,57 +356,37 @@ impl BackgroundTask for InstanceWatcher { ); let mut metrics = self.metrics.lock().unwrap(); let metric = metrics.instance(target); - if let Some(up) = instance_updated { - if up.instance_updated { - instances_updated += 1; - } - - if up.vmm_updated { - vmms_updated += 1; - } - - if !(up.vmm_updated || up.instance_updated) { - no_change += 1; - } - metric.success(up); - } - if let Some(reason) = check_failure { - match reason { - CheckFailure::NoSuchInstance => not_found += 1, - CheckFailure::SledAgentUnreachable => { - unreachable_instances += 1 + if let Some(outcome) = outcome { + metric.completed(outcome); + match outcome { + CheckOutcome::Completed { instance_state, .. 
} => { + *instance_states + .entry(instance_state) + .or_default() += 1; + } + CheckOutcome::Failed(reason) => { + *check_failures.entry(reason).or_default() += 1; } - _ => sled_agent_errors += 1, } - - metric.check_failure(reason); } - if let Some(reason) = error { + if let Err(reason) = result { metric.check_error(reason); - check_errors += 1; + *check_errors.entry(reason).or_default() += 1; } } slog::info!(opctx.log, "all instance checks complete"; "total_instances" => total, - "instances_updated" => instances_updated, - "unreachable_instances" => unreachable_instances, - "vmms_updated" => vmms_updated, - "no_change" => no_change, - "not_found" => not_found, - "sled_agent_errors" => sled_agent_errors, - "check_errors" => check_errors, + "total_completed" => instance_states.len() + check_failures.len(), + "total_failed" => check_failures.len(), + "total_incomplete" => check_errors.len(), "pruned_instances" => pruned, ); serde_json::json!({ "total_instances": total, - "instances_updated": instances_updated, - "vmms_updated": vmms_updated, - "no_change": no_change, - "not_found": not_found, - "unreachable_instances": unreachable_instances, - "sled_agent_errors": sled_agent_errors, - "check_errors": check_errors, + "instance_states": instance_states, + "failed_checks": check_failures, + "incomplete_checks": check_errors, "pruned_instances": pruned, }) } @@ -436,7 +395,9 @@ impl BackgroundTask for InstanceWatcher { } mod metrics { - use super::{CheckError, CheckFailure, InstanceTarget, InstanceUpdated}; + use super::{ + CheckOutcome, Failure, Incomplete, InstanceState, Uuid, VirtualMachine, + }; use oximeter::types::Cumulative; use oximeter::Metric; use oximeter::MetricsError; @@ -447,53 +408,31 @@ mod metrics { #[derive(Debug, Default)] pub(super) struct Metrics { - instances: BTreeMap, + instances: BTreeMap, } #[derive(Debug)] pub(super) struct Producer(pub(super) Arc>); - #[derive(Debug)] + #[derive(Debug, Default)] pub(super) struct Instance { - no_update: InstanceChecks, - instance_updated: InstanceChecks, - vmm_updated: InstanceChecks, - both_updated: InstanceChecks, - check_failures: BTreeMap, - update_failures: BTreeMap, + instance_states: BTreeMap, + // N.B. that these names are a bit unfortunate; since the name of the + // metrics is generated from the name of the metric struct, we can't + // name the struct anything else. 
+ check_failures: BTreeMap, + check_errors: BTreeMap, touched: bool, } + type StateKey = (Uuid, InstanceState); + impl Metrics { pub(crate) fn instance( &mut self, - instance: InstanceTarget, + instance: VirtualMachine, ) -> &mut Instance { - self.instances.entry(instance).or_insert_with(|| Instance { - no_update: InstanceChecks { - instance_updated: false, - vmm_updated: false, - datum: Cumulative::default(), - }, - instance_updated: InstanceChecks { - instance_updated: true, - vmm_updated: false, - datum: Cumulative::default(), - }, - vmm_updated: InstanceChecks { - instance_updated: false, - vmm_updated: true, - datum: Cumulative::default(), - }, - both_updated: InstanceChecks { - instance_updated: true, - vmm_updated: true, - datum: Cumulative::default(), - }, - check_failures: BTreeMap::new(), - update_failures: BTreeMap::new(), - touched: false, - }) + self.instances.entry(instance).or_default() } pub(super) fn prune(&mut self) -> usize { @@ -527,43 +466,35 @@ mod metrics { } impl Instance { - pub(super) fn success(&mut self, updated: InstanceUpdated) { - match updated { - InstanceUpdated { - instance_updated: true, - vmm_updated: true, - } => self.both_updated.datum += 1, - InstanceUpdated { - instance_updated: true, - vmm_updated: false, - } => self.instance_updated.datum += 1, - InstanceUpdated { - instance_updated: false, - vmm_updated: true, - } => self.vmm_updated.datum += 1, - InstanceUpdated { - instance_updated: false, - vmm_updated: false, - } => self.no_update.datum += 1, - } - self.touched = true; - } - - pub(super) fn check_failure(&mut self, reason: CheckFailure) { - self.check_failures - .entry(reason) - .or_insert_with(|| InstanceCheckFailures { - reason: reason.to_string(), - datum: Cumulative::default(), - }) - .datum += 1; + pub(super) fn completed(&mut self, outcome: CheckOutcome) { + match outcome { + CheckOutcome::Completed { instance_state, vmm_id } => { + self.instance_states + .entry((vmm_id, instance_state)) + .or_insert_with(|| State { + state: instance_state.to_string(), + vmm_id, + datum: Cumulative::default(), + }) + .datum += 1; + } + CheckOutcome::Failed(reason) => { + self.check_failures + .entry(reason) + .or_insert_with(|| FailedCheck { + reason: reason.to_string(), + datum: Cumulative::default(), + }) + .datum += 1; + } + }; self.touched = true; } - pub(super) fn check_error(&mut self, reason: CheckError) { - self.update_failures + pub(super) fn check_error(&mut self, reason: Incomplete) { + self.check_errors .entry(reason) - .or_insert_with(|| InstanceCheckErrors { + .or_insert_with(|| IncompleteCheck { reason: reason.to_string(), datum: Cumulative::default(), }) @@ -572,62 +503,54 @@ mod metrics { } fn len(&self) -> usize { - 4 + self.check_failures.len() + self.update_failures.len() + self.instance_states.len() + + self.check_failures.len() + + self.check_errors.len() } fn sample_into( &self, - target: &InstanceTarget, + target: &VirtualMachine, dest: &mut Vec, ) -> Result<(), MetricsError> { - dest.push(Sample::new(target, &self.no_update)?); - dest.push(Sample::new(target, &self.instance_updated)?); - dest.push(Sample::new(target, &self.vmm_updated)?); - dest.push(Sample::new(target, &self.both_updated)?); + for metric in self.instance_states.values() { + dest.push(Sample::new(target, metric)?); + } for metric in self.check_failures.values() { dest.push(Sample::new(target, metric)?); } - for metric in self.update_failures.values() { + for metric in self.check_errors.values() { dest.push(Sample::new(target, metric)?); } Ok(()) } } - /// The 
number of successful checks for a single instance and sled agent - /// pair. + /// The number of successful checks for a single instance, VMM, and sled agent. #[derive(Clone, Debug, Metric)] - struct InstanceChecks { - /// `true` if the instance state changed as a result of this check. - instance_updated: bool, - /// `true` if the VMM state changed as a result of this check. - vmm_updated: bool, + struct State { + /// The UUID of the VMM process for which this state was reported. + vmm_id: Uuid, + /// The string representation of the instance's state as understood by + /// the VMM. + state: String, /// The number of successful checks for this instance and sled agent. datum: Cumulative, } /// The number of failed checks for an instance and sled agent pair. #[derive(Clone, Debug, Metric)] - struct InstanceCheckFailures { + struct FailedCheck { /// The reason why the check failed. - /// - /// # Note - /// This must always be generated from a `CheckFailure` enum. reason: String, /// The number of failed checks for this instance and sled agent. datum: Cumulative, } - /// The number of instance checks that were unsuccessful for an instance and - /// sled agent. + /// The number of unsuccessful checks for an instance and sled agent pair. #[derive(Clone, Debug, Metric)] - struct InstanceCheckErrors { - /// The reason why the check failed. - /// - /// # Note - /// This must always be generated from a `CheckFailure` enum. - // TODO(eliza): it would be nice if this was a `oximeter::FieldType`: - // From<&str>` impl, so that this could be a `&'static str`. + struct IncompleteCheck { + /// The reason why the check was unsuccessful. reason: String, /// The number of failed checks for this instance and sled agent. datum: Cumulative, From 03de70038cd63631444717b8374d9074cf722ecb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 2 May 2024 12:01:24 -0700 Subject: [PATCH 33/69] update omdb --- dev-tools/omdb/src/bin/omdb/nexus.rs | 85 +++++++++++----------------- dev-tools/omdb/tests/successes.out | 14 ++--- 2 files changed, 39 insertions(+), 60 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 48abc4e4f3..3686031127 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -895,30 +895,17 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { /// total number of instances checked total_instances: usize, - /// number of instances whose check succeeded without a state - /// change - no_change: usize, - - /// number of instances whose state has changed - instances_updated: usize, - - /// number of instances whose VMM state has changed - vmms_updated: usize, - - /// number of instances which the sled-agent indicated no longer exists - not_found: usize, - - /// number of unexpected errors returned by sled-agent - sled_agent_errors: usize, + /// number of stale instance metrics that were deleted + pruned_instances: usize, - /// number of instances for which the sled agent was unreachable - unreachable_instances: usize, + /// instance states from completed checks + instance_states: BTreeMap, - /// number of checks that could not be completed successfully - check_errors: usize, + /// instance check failures + failed_checks: BTreeMap, - /// number of stale instance metrics that were deleted. 
- pruned_instances: usize, + /// checks that could not be completed + incomplete_checks: BTreeMap, } match serde_json::from_value::(details.clone()) { @@ -926,44 +913,40 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "warning: failed to interpret task details: {:?}: {:?}", error, details ), - Ok(success) => { - println!( - " total instances checked: {}", - success.total_instances - ); - println!( - " checks completed successfully: {}", - success.total_instances - success.check_errors - ); - println!(" -> {} instances unchanged", success.no_change); - println!( - " -> {} instance states updated", - success.instances_updated - ); - println!( - " -> {} VMM states updated", - success.vmms_updated - ); + Ok(TaskSuccess { + total_instances, + pruned_instances, + instance_states, + failed_checks, + incomplete_checks, + }) => { + println!(" total instances checked: {total_instances}",); println!( - " -> {} instances no longer exist", - success.not_found + " checks completed: {}", + instance_states.len() + failed_checks.len() ); println!( - " -> {} sled-agent errors", - success.sled_agent_errors + " -> successful checks: {}", + instance_states.len() ); + for (state, count) in &instance_states { + println!(" {state} instances: {count}") + } + + println!(" -> failed checks: {}", failed_checks.len()); + for (failure, count) in &failed_checks { + println!(" {failure}: {count}") + } println!( - " -> {} instances with unreachable sled-agents", - success.unreachable_instances + " checks that could not be completed: {}", + incomplete_checks.len() ); + for (error, count) in &incomplete_checks { + println!(" -> {error}: {count}") + } println!( - " checks that could not be completed successfully: {}", - success.check_errors + " stale instance metrics pruned: {pruned_instances}" ); - println!( - " stale instance metrics removed: {}", - success.pruned_instances - ) } }; } else { diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 19c4227dfe..ff4dd38ddc 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -401,15 +401,11 @@ task: "instance_watcher" last completed activation: , triggered by an explicit signal started at (s ago) and ran for ms total instances checked: 0 - checks completed successfully: 0 - -> 0 instances unchanged - -> 0 instance states updated - -> 0 VMM states updated - -> 0 instances no longer exist - -> 0 sled-agent errors - -> 0 instances with unreachable sled-agents - checks that could not be completed successfully: 0 - stale instance metrics removed: 0 + checks completed: 0 + -> successful checks: 0 + -> failed checks: 0 + checks that could not be completed: 0 + stale instance metrics pruned: 0 task: "inventory_collection" configured period: every 10m From 3f85ebe013569b1a6aac7922034ffe6acf2ffc2b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 2 May 2024 12:01:49 -0700 Subject: [PATCH 34/69] most of @bnaecker and @smklein's style suggestions --- .../src/db/datastore/sled_instance.rs | 1 - nexus/src/app/background/instance_watcher.rs | 48 ++++++++++++------- smf/nexus/multi-sled/config-partial.toml | 1 - 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/sled_instance.rs b/nexus/db-queries/src/db/datastore/sled_instance.rs index a070a30756..17ef98a43f 100644 --- a/nexus/db-queries/src/db/datastore/sled_instance.rs +++ b/nexus/db-queries/src/db/datastore/sled_instance.rs @@ -40,7 +40,6 @@ impl DataStore { opctx: &OpContext, 
pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec<(Sled, SledInstance)> { - // TODO(eliza): should probably paginate this? use crate::db::schema::{sled::dsl as sled_dsl, sled_instance::dsl}; opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; let conn = self.pool_connection_authorized(opctx).await?; diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index da73d83b6b..da69093071 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -185,7 +185,13 @@ struct VirtualMachine { struct Check { target: VirtualMachine, - /// The outcome of performing this check. + /// The outcome of performing this check. Either we were able to reach the + /// sled-agent that owns this instance and it told us the instance's state + /// and VMM, or we the health check failed in a way that suggests a + /// potential issue with the sled-agent or instance. + /// + /// If we were not able to perform the request at all due to an error on + /// *our* end, this will be `None`. outcome: Option, /// `Some` if the instance check was unsuccessful. @@ -280,6 +286,14 @@ impl BackgroundTask for InstanceWatcher { async { let mut tasks = tokio::task::JoinSet::new(); let mut paginator = Paginator::new(MAX_SLED_AGENTS); + let mk_client = |sled: &Sled| { + nexus_networking::sled_client_from_address( + sled.id(), + sled.address(), + &opctx.log, + ) + }; + while let Some(p) = paginator.next() { let maybe_batch = self .datastore @@ -299,17 +313,17 @@ impl BackgroundTask for InstanceWatcher { } }; paginator = p.found_batch(&batch, &|(sled, _)| sled.id()); - let mut batch = batch.into_iter(); + // When we iterate over the batch of sled instances, we pop the + // first sled from the batch before looping over the rest, to + // insure that the initial sled-agent client is created first, + // as we need the address of the first sled to construct it. + // We could, alternatively, make the sled-agent client an + // `Option`, but then every subsequent iteration would have to + // handle the case where it's `None`, and I thought this was a + // bit neater... + let mut batch = batch.into_iter(); if let Some((mut curr_sled, sled_instance)) = batch.next() { - let mk_client = |sled: &Sled| { - nexus_networking::sled_client_from_address( - sled.id(), - sled.address(), - &opctx.log, - ) - }; - let mut client = mk_client(&curr_sled); tasks.spawn(self.check_instance( opctx, @@ -436,15 +450,11 @@ mod metrics { } pub(super) fn prune(&mut self) -> usize { - let mut pruned = 0; + let len = self.instances.len(); self.instances.retain(|_, instance| { - let touched = std::mem::replace(&mut instance.touched, false); - if !touched { - pruned += 1; - } - touched + std::mem::replace(&mut instance.touched, false) }); - pruned + len - self.instances.len() } fn len(&self) -> usize { @@ -542,6 +552,8 @@ mod metrics { #[derive(Clone, Debug, Metric)] struct FailedCheck { /// The reason why the check failed. + /// + /// This is generated from the [`Failure`] enum's `Display` implementation. reason: String, /// The number of failed checks for this instance and sled agent. datum: Cumulative, @@ -551,6 +563,8 @@ mod metrics { #[derive(Clone, Debug, Metric)] struct IncompleteCheck { /// The reason why the check was unsuccessful. + /// + /// This is generated from the [`Incomplete`] enum's `Display` implementation. reason: String, /// The number of failed checks for this instance and sled agent. 
datum: Cumulative, diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 6d69f4c8fa..d998a3c396 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -56,7 +56,6 @@ sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 instance_watcher.period_secs = 30 -instance_watcher.max_retries = 5 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds From 610aceac191f3879587198b6ca4987da1df8b244 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 2 May 2024 13:32:10 -0700 Subject: [PATCH 35/69] back out unneeded changes --- nexus/db-model/src/lib.rs | 2 +- nexus/db-model/src/typed_uuid.rs | 7 +++- .../db-queries/src/db/datastore/inventory.rs | 35 ++++++++----------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index cda22f42f8..6495a0c960 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -182,7 +182,7 @@ pub use switch::*; pub use switch_interface::*; pub use switch_port::*; pub use tuf_repo::*; -pub use typed_uuid::*; +pub use typed_uuid::to_db_typed_uuid; pub use upstairs_repair::*; pub use user_builtin::*; pub use utilization::*; diff --git a/nexus/db-model/src/typed_uuid.rs b/nexus/db-model/src/typed_uuid.rs index 7a172703c7..1e54e242f3 100644 --- a/nexus/db-model/src/typed_uuid.rs +++ b/nexus/db-model/src/typed_uuid.rs @@ -19,13 +19,18 @@ use uuid::Uuid; /// Returns the corresponding `DbTypedUuid` for this `TypedUuid`. /// /// Code external to the `db-model` crate sometimes needs a way to convert a -/// `TypedUuid` to a `DbTypedUuid`. +/// `TypedUuid` to a `DbTypedUuid`. We don't want `DbTypedUuid` to be used +/// anywhere, so we don't make it public. Instead, we expose this function. #[inline] pub fn to_db_typed_uuid(id: TypedUuid) -> DbTypedUuid { DbTypedUuid(id) } /// A UUID with information about the kind of type it is. +/// +/// Despite the fact that this is marked `pub`, this is *private* to the +/// `db-model` crate (this type is not exported at the top level). External +/// users must use omicron-common's `TypedUuid`. #[derive_where(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] #[derive(AsExpression, FromSqlRow, Serialize, Deserialize, JsonSchema)] #[diesel(sql_type = sql_types::Uuid)] diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index bba96c76f9..6faa8ea251 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -1312,33 +1312,28 @@ impl DataStore { &self, opctx: &OpContext, ) -> Result, Error> { - let Some(id) = self.inventory_get_latest_collection_id(opctx).await? - else { - return Ok(None); - }; - Ok(Some(self.inventory_collection_read(opctx, id).await?)) - } - - /// Returns the ID of the latest collection, if any collections exist. - /// - /// If there aren't any collections, returns `Ok(None)`. 
- pub async fn inventory_get_latest_collection_id( - &self, - opctx: &OpContext, - ) -> Result, Error> { - use db::schema::inv_collection::dsl; - opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; let conn = self.pool_connection_authorized(opctx).await?; - let id = dsl::inv_collection + use db::schema::inv_collection::dsl; + let collection_id = dsl::inv_collection .select(dsl::id) .order_by(dsl::time_started.desc()) .first_async::(&*conn) .await .optional() - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? - .map(CollectionUuid::from_untyped_uuid); - Ok(id) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + let Some(collection_id) = collection_id else { + return Ok(None); + }; + + Ok(Some( + self.inventory_collection_read( + opctx, + CollectionUuid::from_untyped_uuid(collection_id), + ) + .await?, + )) } /// Attempt to read the current collection From 0e6ed21c767e9736875b6f67c839faedbac79178 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 3 May 2024 11:46:57 -0700 Subject: [PATCH 36/69] include project and silo IDs; metrics tweaks --- nexus/db-queries/src/db/datastore/instance.rs | 42 ++++ .../src/db/datastore/sled_instance.rs | 26 --- nexus/src/app/background/instance_watcher.rs | 189 +++++++++--------- 3 files changed, 135 insertions(+), 122 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 731f7b4c06..2ea6c74f9c 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -22,6 +22,7 @@ use crate::db::model::Instance; use crate::db::model::InstanceRuntimeState; use crate::db::model::Name; use crate::db::model::Project; +use crate::db::model::Sled; use crate::db::model::Vmm; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; @@ -29,11 +30,14 @@ use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use nexus_db_model::ApplySledFilterExt; use nexus_db_model::Disk; use nexus_db_model::VmmRuntimeState; +use nexus_types::deployment::SledFilter; use omicron_common::api; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; +use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; @@ -385,6 +389,44 @@ impl DataStore { Ok((instance_updated, vmm_updated)) } + pub async fn instance_and_vmm_list_by_sled_agent( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec<(Sled, Instance, Vmm, Project)> { + use crate::db::schema::{ + instance::dsl as instance_dsl, project::dsl as project_dsl, + sled::dsl as sled_dsl, vmm::dsl as vmm_dsl, + }; + opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; + let conn = self.pool_connection_authorized(opctx).await?; + + let result = paginated(sled_dsl::sled, sled_dsl::id, pagparams) + .filter(sled_dsl::time_deleted.is_null()) + .sled_filter(SledFilter::InService) + .inner_join(vmm_dsl::vmm.on(vmm_dsl::sled_id.eq(sled_dsl::id))) + .inner_join( + instance_dsl::instance.on(instance_dsl::active_propolis_id + .eq(vmm_dsl::id.nullable()) + .and(vmm_dsl::time_deleted.is_null())), + ) + .inner_join( + project_dsl::project + .on(instance_dsl::project_id.eq(project_dsl::id)), + ) + .select(( + Sled::as_select(), + Instance::as_select(), + 
Vmm::as_select(), + Project::as_select(), + )) + .load_async::<(Sled, Instance, Vmm, Project)>(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(result) + } + pub async fn project_delete_instance( &self, opctx: &OpContext, diff --git a/nexus/db-queries/src/db/datastore/sled_instance.rs b/nexus/db-queries/src/db/datastore/sled_instance.rs index 17ef98a43f..dbdd696d70 100644 --- a/nexus/db-queries/src/db/datastore/sled_instance.rs +++ b/nexus/db-queries/src/db/datastore/sled_instance.rs @@ -5,13 +5,10 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::model::Sled; use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; -use nexus_db_model::ApplySledFilterExt; use nexus_db_model::SledInstance; -use nexus_types::deployment::SledFilter; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::ListResultVec; use uuid::Uuid; @@ -34,27 +31,4 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - - pub async fn sled_instance_list_by_sled_agent( - &self, - opctx: &OpContext, - pagparams: &DataPageParams<'_, Uuid>, - ) -> ListResultVec<(Sled, SledInstance)> { - use crate::db::schema::{sled::dsl as sled_dsl, sled_instance::dsl}; - opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; - let conn = self.pool_connection_authorized(opctx).await?; - - let result = paginated(sled_dsl::sled, sled_dsl::id, pagparams) - .filter(sled_dsl::time_deleted.is_null()) - .sled_filter(SledFilter::InService) - .inner_join( - dsl::sled_instance.on(dsl::active_sled_id.eq(sled_dsl::id)), - ) - .select((Sled::as_select(), SledInstance::as_select())) - .load_async::<(Sled, SledInstance)>(&*conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - Ok(result) - } } diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index da69093071..8b73fe4420 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -7,11 +7,12 @@ use super::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; -use nexus_db_model::{Sled, SledInstance}; +use nexus_db_model::Sled; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; +use nexus_types::identity::Resource; use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::SledInstanceState; use oximeter::types::ProducerRegistry; @@ -54,22 +55,15 @@ impl InstanceWatcher { fn check_instance( &self, opctx: &OpContext, - sled: &Sled, client: &SledAgentClient, - instance: SledInstance, + target: VirtualMachine, ) -> impl Future + Send + 'static { - let instance_id = instance.instance_id(); let watcher = self.clone(); - let target = VirtualMachine { - instance_id, - sled_agent_id: sled.id(), - sled_agent_ip: std::net::Ipv6Addr::from(sled.ip).into(), - sled_agent_port: sled.port.into(), - }; + let opctx = opctx.child( std::iter::once(( "instance_id".to_string(), - instance_id.to_string(), + target.instance_id.to_string(), )) .collect(), ); @@ -78,7 +72,7 @@ impl InstanceWatcher { async move { let InstanceWatcher { datastore, resolver, .. 
} = watcher; slog::trace!(opctx.log, "checking on instance..."); - let rsp = client.instance_get_state(&instance_id).await; + let rsp = client.instance_get_state(&target.instance_id).await; let mut check = Check { target, outcome: None, result: Ok(()) }; let state = match rsp { Ok(rsp) => rsp.into_inner(), @@ -91,8 +85,9 @@ impl InstanceWatcher { slog::info!(opctx.log, "instance is wayyyyy gone"); // TODO(eliza): eventually, we should attempt to put the // instance in the `Failed` state here. - check.outcome = - Some(CheckOutcome::Failed(Failure::NoSuchInstance)); + check.outcome = Some(CheckOutcome::Failure( + Failure::NoSuchInstance, + )); return check; } if status.is_client_error() { @@ -105,7 +100,7 @@ impl InstanceWatcher { "status" => ?status, "error" => ?rsp.into_inner()); } - check.outcome = Some(CheckOutcome::Failed( + check.outcome = Some(CheckOutcome::Failure( Failure::SledAgentResponse(status.as_u16()), )); return check; @@ -115,8 +110,14 @@ impl InstanceWatcher { // instance to the `Failed` state if the sled-agent has been // unreachable for a while. We may also want to take other // corrective actions or alert an operator in this case. + // + // TODO(eliza): because we have the preported IP address + // of the instance's VMM from our databse query, we could + // also ask the VMM directly when the sled-agent is + // unreachable. We should start doing that here at some + // point. slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e); - check.outcome = Some(CheckOutcome::Failed( + check.outcome = Some(CheckOutcome::Failure( Failure::SledAgentUnreachable, )); return check; @@ -134,10 +135,8 @@ impl InstanceWatcher { }; let new_runtime_state: SledInstanceState = state.into(); - check.outcome = Some(CheckOutcome::Completed { - instance_state: new_runtime_state.vmm_state.state, - vmm_id: new_runtime_state.propolis_id, - }); + check.outcome = + Some(CheckOutcome::Success(new_runtime_state.vmm_state.state)); slog::debug!( opctx.log, "updating instance state: {new_runtime_state:?}" @@ -148,7 +147,7 @@ impl InstanceWatcher { &opctx, &opctx, &opctx.log, - &instance_id, + &target.instance_id, &new_runtime_state, ) .await @@ -174,6 +173,12 @@ impl InstanceWatcher { struct VirtualMachine { /// The instance's ID. instance_id: Uuid, + /// The silo ID of the instance's silo. + silo_id: Uuid, + /// The project ID of the instance. + project_id: Uuid, + /// The VMM ID of the instance's virtual machine manager. + vmm_id: Uuid, /// The sled-agent's ID. sled_agent_id: Uuid, /// The sled agent's IP address. 
@@ -209,8 +214,8 @@ struct Check { #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] enum CheckOutcome { - Completed { instance_state: InstanceState, vmm_id: Uuid }, - Failed(Failure), + Success(InstanceState), + Failure(Failure), } #[derive( @@ -253,6 +258,25 @@ enum Incomplete { UpdateFailed, } +impl CheckOutcome { + fn is_healthy(&self) -> bool { + match self { + Self::Success(InstanceState::Failed) => false, + Self::Failure(_) => false, + _ => true, + } + } +} + +impl fmt::Display for CheckOutcome { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Success(state) => write!(f, "{state}"), + Self::Failure(reason) => write!(f, "{reason}"), + } + } +} + impl fmt::Display for Failure { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -297,7 +321,7 @@ impl BackgroundTask for InstanceWatcher { while let Some(p) = paginator.next() { let maybe_batch = self .datastore - .sled_instance_list_by_sled_agent( + .instance_and_vmm_list_by_sled_agent( opctx, &p.current_pagparams(), ) @@ -312,7 +336,7 @@ impl BackgroundTask for InstanceWatcher { break; } }; - paginator = p.found_batch(&batch, &|(sled, _)| sled.id()); + paginator = p.found_batch(&batch, &|(sled, _, _, _)| sled.id()); // When we iterate over the batch of sled instances, we pop the // first sled from the batch before looping over the rest, to @@ -323,27 +347,35 @@ impl BackgroundTask for InstanceWatcher { // handle the case where it's `None`, and I thought this was a // bit neater... let mut batch = batch.into_iter(); - if let Some((mut curr_sled, sled_instance)) = batch.next() { + if let Some((mut curr_sled, instance, vmm, project)) = batch.next() { let mut client = mk_client(&curr_sled); - tasks.spawn(self.check_instance( - opctx, - &curr_sled, - &client, - sled_instance, - )); - - for (sled, sled_instance) in batch { + let target = VirtualMachine { + instance_id: instance.id(), + silo_id: project.silo_id, + project_id: project.id(), + vmm_id: vmm.id, + sled_agent_id: curr_sled.id(), + sled_agent_ip: (*curr_sled.address().ip()).into(), + sled_agent_port: curr_sled.address().port(), + }; + tasks.spawn(self.check_instance(opctx, &client, target)); + + for (sled, instance, vmm, project) in batch { // We're now talking to a new sled agent; update the client. if sled.id() != curr_sled.id() { client = mk_client(&sled); curr_sled = sled; } - tasks.spawn(self.check_instance( - opctx, - &curr_sled, - &client, - sled_instance, - )); + let target = VirtualMachine { + instance_id: instance.id(), + silo_id: project.silo_id, + project_id: project.id(), + vmm_id: vmm.id, + sled_agent_id: curr_sled.id(), + sled_agent_ip: (*curr_sled.address().ip()).into(), + sled_agent_port: curr_sled.address().port(), + }; + tasks.spawn(self.check_instance(opctx, &client, target)); } } } @@ -373,12 +405,12 @@ impl BackgroundTask for InstanceWatcher { if let Some(outcome) = outcome { metric.completed(outcome); match outcome { - CheckOutcome::Completed { instance_state, .. 
} => { + CheckOutcome::Success(state) => { *instance_states - .entry(instance_state) + .entry(state) .or_default() += 1; } - CheckOutcome::Failed(reason) => { + CheckOutcome::Failure(reason) => { *check_failures.entry(reason).or_default() += 1; } } @@ -409,9 +441,7 @@ impl BackgroundTask for InstanceWatcher { } mod metrics { - use super::{ - CheckOutcome, Failure, Incomplete, InstanceState, Uuid, VirtualMachine, - }; + use super::{CheckOutcome, Incomplete, VirtualMachine}; use oximeter::types::Cumulative; use oximeter::Metric; use oximeter::MetricsError; @@ -430,17 +460,11 @@ mod metrics { #[derive(Debug, Default)] pub(super) struct Instance { - instance_states: BTreeMap, - // N.B. that these names are a bit unfortunate; since the name of the - // metrics is generated from the name of the metric struct, we can't - // name the struct anything else. - check_failures: BTreeMap, + checks: BTreeMap, check_errors: BTreeMap, touched: bool, } - type StateKey = (Uuid, InstanceState); - impl Metrics { pub(crate) fn instance( &mut self, @@ -477,27 +501,15 @@ mod metrics { impl Instance { pub(super) fn completed(&mut self, outcome: CheckOutcome) { - match outcome { - CheckOutcome::Completed { instance_state, vmm_id } => { - self.instance_states - .entry((vmm_id, instance_state)) - .or_insert_with(|| State { - state: instance_state.to_string(), - vmm_id, - datum: Cumulative::default(), - }) - .datum += 1; - } - CheckOutcome::Failed(reason) => { - self.check_failures - .entry(reason) - .or_insert_with(|| FailedCheck { - reason: reason.to_string(), - datum: Cumulative::default(), - }) - .datum += 1; - } - }; + self.checks + .entry(outcome) + .or_insert_with(|| Check { + state: outcome.to_string(), + healthy: outcome.is_healthy(), + datum: Cumulative::default(), + }) + .datum += 1; + self.touched = true; } @@ -513,9 +525,7 @@ mod metrics { } fn len(&self) -> usize { - self.instance_states.len() - + self.check_failures.len() - + self.check_errors.len() + self.checks.len() + self.check_errors.len() } fn sample_into( @@ -523,10 +533,7 @@ mod metrics { target: &VirtualMachine, dest: &mut Vec, ) -> Result<(), MetricsError> { - for metric in self.instance_states.values() { - dest.push(Sample::new(target, metric)?); - } - for metric in self.check_failures.values() { + for metric in self.checks.values() { dest.push(Sample::new(target, metric)?); } for metric in self.check_errors.values() { @@ -538,27 +545,17 @@ mod metrics { /// The number of successful checks for a single instance, VMM, and sled agent. #[derive(Clone, Debug, Metric)] - struct State { - /// The UUID of the VMM process for which this state was reported. - vmm_id: Uuid, + struct Check { /// The string representation of the instance's state as understood by - /// the VMM. + /// the VMM, or the cause of the check failure, if the check failed. state: String, + /// `true` if the instance is considered healthy, false if the instance + /// is not considered healthy. + healthy: bool, /// The number of successful checks for this instance and sled agent. datum: Cumulative, } - /// The number of failed checks for an instance and sled agent pair. - #[derive(Clone, Debug, Metric)] - struct FailedCheck { - /// The reason why the check failed. - /// - /// This is generated from the [`Failure`] enum's `Display` implementation. - reason: String, - /// The number of failed checks for this instance and sled agent. - datum: Cumulative, - } - /// The number of unsuccessful checks for an instance and sled agent pair. 
#[derive(Clone, Debug, Metric)] struct IncompleteCheck { From fc7899da85dcb941bba4c608e695aefdd69345eb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 3 May 2024 11:50:23 -0700 Subject: [PATCH 37/69] minor style embetterments --- nexus/src/app/background/instance_watcher.rs | 33 +++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 8b73fe4420..29e7045add 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -139,7 +139,8 @@ impl InstanceWatcher { Some(CheckOutcome::Success(new_runtime_state.vmm_state.state)); slog::debug!( opctx.log, - "updating instance state: {new_runtime_state:?}" + "updating instance state"; + "state" => ?new_runtime_state.vmm_state.state, ); check.result = crate::app::instance::notify_instance_updated( &datastore, @@ -152,14 +153,32 @@ impl InstanceWatcher { ) .await .map_err(|e| { - slog::warn!(opctx.log, "error updating instance"; "error" => ?e); + slog::warn!( + opctx.log, + "error updating instance"; + "error" => ?e, + "state" => ?new_runtime_state.vmm_state.state, + ); Incomplete::UpdateFailed }) - .and_then(|updated| updated.ok_or_else(|| { - slog::warn!(opctx.log, "error updating instance: not found in database"); - Incomplete::InstanceNotFound - })).map(|updated| { - slog::debug!(opctx.log, "update successful"; "instance_updated" => updated.instance_updated, "vmm_updated" => updated.vmm_updated); + .and_then(|updated| { + updated.ok_or_else(|| { + slog::warn!( + opctx.log, + "error updating instance: not found in database"; + "state" => ?new_runtime_state.vmm_state.state, + ); + Incomplete::InstanceNotFound + }) + }) + .map(|updated| { + slog::debug!( + opctx.log, + "update successful"; + "instance_updated" => updated.instance_updated, + "vmm_updated" => updated.vmm_updated, + "state" => ?new_runtime_state.vmm_state.state, + ); }); check From 49976d24b49767f692d1988834ba445d3328c9a1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 08:58:06 -0700 Subject: [PATCH 38/69] use fleet authz in instance/vmm query --- nexus/db-queries/src/db/datastore/instance.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 2ea6c74f9c..9bb8071572 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -398,7 +398,7 @@ impl DataStore { instance::dsl as instance_dsl, project::dsl as project_dsl, sled::dsl as sled_dsl, vmm::dsl as vmm_dsl, }; - opctx.authorize(authz::Action::Read, &authz::INVENTORY).await?; + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; let result = paginated(sled_dsl::sled, sled_dsl::id, pagparams) From 70b8485b28a8b7047cdde8576bedf2afb8ea722b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 09:29:41 -0700 Subject: [PATCH 39/69] docs --- dev-tools/omdb/src/bin/omdb/nexus.rs | 30 +++++++++++++++++-- nexus/db-queries/src/db/datastore/instance.rs | 6 ++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 3686031127..a2427b02a8 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -898,13 +898,37 @@ fn print_task_details(bgtask: &BackgroundTask, details: 
&serde_json::Value) { /// number of stale instance metrics that were deleted pruned_instances: usize, - /// instance states from completed checks + /// instance states from completed checks. + /// + /// this is a mapping of stringified instance states to the number + /// of instances in that state. these stringified states correspond + /// to the `state` field recorded by the instance watcher's + /// `virtual_machine:check` timeseries with the `healthy` field set + /// to `true`. any changes to the instance state type which cause it + /// to print differently will be counted as a distinct state. instance_states: BTreeMap, - /// instance check failures + /// instance check failures. + /// + /// this is a mapping of stringified instance check failure reasons + /// to the number of instances with checks that failed for that + /// reason. these stringified failure reasons correspond to the + /// `state` field recorded by the instance watcher's + /// `virtual_machine:check` timeseries with the `healthy` field set + /// to `false`. any changes to the instance state type which cause + /// it to print differently will be counted as a distinct failure + /// reason. failed_checks: BTreeMap, - /// checks that could not be completed + /// instance checks that could not be completed successfully. + /// + /// this is a mapping of stringified instance check errors + /// to the number of instance checks that were not completed due to + /// that error. these stringified errors correspond to the `reason ` + /// field recorded by the instance watcher's + /// `virtual_machine:incomplete_check` timeseries. any changes to + /// the check error type which cause it to print + /// differently will be counted as a distinct check error. incomplete_checks: BTreeMap, } diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 9bb8071572..832e560c36 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -389,6 +389,12 @@ impl DataStore { Ok((instance_updated, vmm_updated)) } + /// Lists all instances on in-service sleds with active Propolis VMM + /// processes, returning the instance along with the VMM on which it's + /// running, the sled on which the VMM is running, and the project that owns + /// the instance. + /// + /// The query performed by this function is paginated by the sled's UUID. pub async fn instance_and_vmm_list_by_sled_agent( &self, opctx: &OpContext, From 6f0216d59a088d6a699d1bcab08ed293f2f9925f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 09:39:20 -0700 Subject: [PATCH 40/69] separate instance state and reason in metrics This way, we can emit metrics that consider instances with failed checks to be in the "failed" state alongside instances marked as failed because the sled-agent said so, but distinguish between the two. Currently, we don't actually mark instances as "failed" due to failed checks, but I wanted the metrics schema to support the ability to do that. 
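
As a rough sketch of the distinction (stand-in function and types, not the
actual Nexus metric structs; the real strings come from the enums' `Display`
impls and land in the `state` and `reason` fields of the `Check` metric
below):

    // Maps a check outcome onto the two metric fields.
    fn state_and_reason(
        outcome: Result<&'static str, &'static str>,
    ) -> (String, String) {
        match outcome {
            // e.g. Ok("failed"): the sled-agent itself reported the instance
            // as failed, but the check succeeded.
            Ok(state) => (state.to_string(), "success".to_string()),
            // e.g. Err("unreachable"): the check failed, so the instance is
            // counted as "failed" with the failure as the reason.
            Err(failure) => ("failed".to_string(), failure.to_string()),
        }
    }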
--- nexus/src/app/background/instance_watcher.rs | 35 ++++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 29e7045add..57c78e9eb6 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -460,7 +460,7 @@ impl BackgroundTask for InstanceWatcher { } mod metrics { - use super::{CheckOutcome, Incomplete, VirtualMachine}; + use super::{CheckOutcome, Incomplete, InstanceState, VirtualMachine}; use oximeter::types::Cumulative; use oximeter::Metric; use oximeter::MetricsError; @@ -522,10 +522,17 @@ mod metrics { pub(super) fn completed(&mut self, outcome: CheckOutcome) { self.checks .entry(outcome) - .or_insert_with(|| Check { - state: outcome.to_string(), - healthy: outcome.is_healthy(), - datum: Cumulative::default(), + .or_insert_with(|| match outcome { + CheckOutcome::Failure(failure) => Check { + state: InstanceState::Failed.to_string(), + reason: failure.to_string(), + datum: Cumulative::default(), + }, + CheckOutcome::Success(state) => Check { + state: state.to_string(), + reason: "success".to_string(), + datum: Cumulative::default(), + }, }) .datum += 1; @@ -566,12 +573,20 @@ mod metrics { #[derive(Clone, Debug, Metric)] struct Check { /// The string representation of the instance's state as understood by - /// the VMM, or the cause of the check failure, if the check failed. + /// the VMM. If the check failed, this will generally be "failed". state: String, - /// `true` if the instance is considered healthy, false if the instance - /// is not considered healthy. - healthy: bool, - /// The number of successful checks for this instance and sled agent. + /// `Why the instance was marked as being in this state. + /// + /// If an instance was marked as "failed" due to a check failure, this + /// will be a string representation of the failure reason. Otherwise, if + /// the check was successful, this will be "success". Note that this may + /// be "success" even if the instance's state is "failed", which + /// indicates that we successfully queried the instance's state from the + /// sled-agent, and the *sled-agent* reported that the instance has + /// failed --- which is distinct from the instance watcher marking an + /// instance as failed due to a failed check. + reason: String, + /// this will be a string representation of the failure reason. 
datum: Cumulative, } From c1661e9c2d49f5905a3c47e13f16e1fac74e40c6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 09:45:14 -0700 Subject: [PATCH 41/69] rm unused code --- nexus/src/app/background/instance_watcher.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 57c78e9eb6..310d15efe1 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -277,16 +277,6 @@ enum Incomplete { UpdateFailed, } -impl CheckOutcome { - fn is_healthy(&self) -> bool { - match self { - Self::Success(InstanceState::Failed) => false, - Self::Failure(_) => false, - _ => true, - } - } -} - impl fmt::Display for CheckOutcome { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { From 70f2b3ffc6168673642261dfc921966a2d7457c8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 12:32:07 -0700 Subject: [PATCH 42/69] make instance-watcher query errors more obvious --- nexus/src/app/background/instance_watcher.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 310d15efe1..6113aa3e49 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -338,11 +338,11 @@ impl BackgroundTask for InstanceWatcher { let batch = match maybe_batch { Ok(batch) => batch, Err(e) => { - slog::warn!( + slog::error!( opctx.log, "sled instances by sled agent query failed: {e}" ); - break; + return serde_json::json!({ "error": e.to_string() }); } }; paginator = p.found_batch(&batch, &|(sled, _, _, _)| sled.id()); From 59e1bc317f4018c47b280c5b1180963c90ba8690 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 15:53:14 -0700 Subject: [PATCH 43/69] add indices so that the vmms-by-sled query works --- nexus/db-queries/src/db/datastore/instance.rs | 24 ++++++++++++------- .../add-lookup-vmm-by-sled-id-index/up.sql | 3 +++ schema/crdb/dbinit.sql | 4 ++++ 3 files changed, 23 insertions(+), 8 deletions(-) create mode 100644 schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 832e560c36..ce40e20501 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -410,16 +410,24 @@ impl DataStore { let result = paginated(sled_dsl::sled, sled_dsl::id, pagparams) .filter(sled_dsl::time_deleted.is_null()) .sled_filter(SledFilter::InService) - .inner_join(vmm_dsl::vmm.on(vmm_dsl::sled_id.eq(sled_dsl::id))) .inner_join( - instance_dsl::instance.on(instance_dsl::active_propolis_id - .eq(vmm_dsl::id.nullable()) - .and(vmm_dsl::time_deleted.is_null())), - ) - .inner_join( - project_dsl::project - .on(instance_dsl::project_id.eq(project_dsl::id)), + vmm_dsl::vmm + .on(vmm_dsl::sled_id + .eq(sled_dsl::id) + .and(vmm_dsl::time_deleted.is_null())) + .inner_join( + instance_dsl::instance + .on(instance_dsl::id + .eq(vmm_dsl::instance_id) + .and(instance_dsl::time_deleted.is_null())) + .inner_join( + project_dsl::project.on(project_dsl::id + .eq(instance_dsl::project_id) + .and(project_dsl::time_deleted.is_null())), + ), + ), ) + .sled_filter(SledFilter::InService) .select(( Sled::as_select(), Instance::as_select(), diff --git a/schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql 
b/schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql new file mode 100644 index 0000000000..d280ff1e05 --- /dev/null +++ b/schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql @@ -0,0 +1,3 @@ +CREATE UNIQUE INDEX IF NOT EXISTS lookup_vmms_by_sled_id ON omicron.public.vmm ( + sled_id +) WHERE time_deleted IS NULL; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index e77b7b81ef..18b4742caf 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3426,6 +3426,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.vmm ( propolis_port INT4 NOT NULL CHECK (propolis_port BETWEEN 0 AND 65535) DEFAULT 12400 ); +CREATE INDEX IF NOT EXISTS lookup_vmms_by_sled_id ON omicron.public.vmm ( + sled_id +) WHERE time_deleted IS NULL; + /* * A special view of an instance provided to operators for insights into what's * running on a sled. From 44ef38652dba246d98afc0c5d968e78ed43cf95c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 15:59:16 -0700 Subject: [PATCH 44/69] serde keys must be strings :/ --- nexus/src/app/background/instance_watcher.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 6113aa3e49..02efbb09ea 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -395,11 +395,11 @@ impl BackgroundTask for InstanceWatcher { // Now, wait for the check results to come back. let mut total: usize = 0; - let mut instance_states: BTreeMap = + let mut instance_states: BTreeMap = BTreeMap::new(); - let mut check_failures: BTreeMap = + let mut check_failures: BTreeMap = BTreeMap::new(); - let mut check_errors: BTreeMap = BTreeMap::new(); + let mut check_errors: BTreeMap = BTreeMap::new(); while let Some(result) = tasks.join_next().await { total += 1; let Check { target, outcome, result } = result.expect( @@ -416,17 +416,17 @@ impl BackgroundTask for InstanceWatcher { match outcome { CheckOutcome::Success(state) => { *instance_states - .entry(state) + .entry(state.to_string()) .or_default() += 1; } CheckOutcome::Failure(reason) => { - *check_failures.entry(reason).or_default() += 1; + *check_failures.entry(reason.to_string()).or_default() += 1; } } } if let Err(reason) = result { metric.check_error(reason); - *check_errors.entry(reason).or_default() += 1; + *check_errors.entry(reason.to_string()).or_default() += 1; } } From 23eb079aa807e774818de999167afc6ffcaa4398 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 16:21:18 -0700 Subject: [PATCH 45/69] actually populate sled-agent-sim methods --- sled-agent/src/sim/http_entrypoints.rs | 13 +++++++++++++ sled-agent/src/sim/sled_agent.rs | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 73e94e949b..e160ff615e 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -134,6 +134,19 @@ async fn instance_put_state( )) } +#[endpoint { + method = GET, + path = "/instances/{instance_id}/state", +}] +async fn instance_get_state( + rqctx: RequestContext, + path_params: Path, +) -> Result, HttpError> { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) +} + #[endpoint { method = PUT, path = "/instances/{instance_id}/migration-ids", diff --git a/sled-agent/src/sim/sled_agent.rs 
b/sled-agent/src/sim/sled_agent.rs index 37086a8343..298a8adc34 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -477,6 +477,22 @@ impl SledAgent { Ok(InstancePutStateResponse { updated_runtime: Some(new_state) }) } + pub async fn instance_get_state( + &self, + instance_id: Uuid, + ) -> Result { + let instance = self + .instances + .sim_get_cloned_object(&instance_id) + .await + .map_err(|_| { + crate::sled_agent::Error::Instance( + crate::instance_manager::Error::NoSuchInstance(instance_id), + ) + })?; + Ok(instance.current()) + } + pub async fn set_instance_ensure_state_error(&self, error: Option) { *self.instance_ensure_state_error.lock().await = error; } From 6b7832a8bfd6c52661c128ec30db08088fbb6437 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 16:21:18 -0700 Subject: [PATCH 46/69] actually populate sled-agent-sim methods --- sled-agent/src/sim/http_entrypoints.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index e160ff615e..6cddac6fb8 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -41,6 +41,7 @@ pub fn api() -> SledApiDescription { fn register_endpoints(api: &mut SledApiDescription) -> Result<(), String> { api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; + api.register(instance_get_state)?; api.register(instance_register)?; api.register(instance_unregister)?; api.register(instance_put_external_ip)?; @@ -139,7 +140,7 @@ async fn instance_put_state( path = "/instances/{instance_id}/state", }] async fn instance_get_state( - rqctx: RequestContext, + rqctx: RequestContext>, path_params: Path, ) -> Result, HttpError> { let sa = rqctx.context(); From a207c62b0901c691850a56a0a9fb31a7ebb38c53 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 2024 16:43:03 -0700 Subject: [PATCH 47/69] fix wrong counts in omdb output --- dev-tools/omdb/src/bin/omdb/nexus.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 1795f773cc..d39a147584 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -944,26 +944,25 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { failed_checks, incomplete_checks, }) => { + let total_successes: usize = instance_states.values().sum(); + let total_failures: usize = failed_checks.values().sum(); + let total_incomplete: usize = incomplete_checks.values().sum(); println!(" total instances checked: {total_instances}",); println!( " checks completed: {}", - instance_states.len() + failed_checks.len() - ); - println!( - " -> successful checks: {}", - instance_states.len() + total_successes + total_failures ); + println!(" -> successful checks: {total_successes}",); for (state, count) in &instance_states { println!(" {state} instances: {count}") } - println!(" -> failed checks: {}", failed_checks.len()); + println!(" -> failed checks: {total_failures}"); for (failure, count) in &failed_checks { println!(" {failure}: {count}") } println!( - " checks that could not be completed: {}", - incomplete_checks.len() + " checks that could not be completed: {total_incomplete}", ); for (error, count) in &incomplete_checks { println!(" -> {error}: {count}") From 1aa3cc1efe7ec60fe9f95209dfcacf985d30baa9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 6 May 
2024 16:45:12 -0700 Subject: [PATCH 48/69] prettify OMDB output --- dev-tools/omdb/src/bin/omdb/nexus.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index d39a147584..60925d0cb4 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -952,20 +952,20 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { " checks completed: {}", total_successes + total_failures ); - println!(" -> successful checks: {total_successes}",); + println!(" successful checks: {total_successes}",); for (state, count) in &instance_states { - println!(" {state} instances: {count}") + println!(" -> {count} instances {state}") } - println!(" -> failed checks: {total_failures}"); + println!(" failed checks: {total_failures}"); for (failure, count) in &failed_checks { - println!(" {failure}: {count}") + println!(" -> {count} {failure}") } println!( " checks that could not be completed: {total_incomplete}", ); for (error, count) in &incomplete_checks { - println!(" -> {error}: {count}") + println!(" -> {count} {error} errors") } println!( " stale instance metrics pruned: {pruned_instances}" From e831a05fcfaffa7516527fe90b98916e139d3e82 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 09:22:39 -0700 Subject: [PATCH 49/69] put back eaten instance_watcher period configs i don't know what happened to these... --- smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 62e8b51b07..696411966b 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 service_firewall_propagation.period_secs = 300 +instance_watcher.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index d5a4b4eb77..206f716fa7 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -56,6 +56,7 @@ sync_service_zone_nat.period_secs = 30 switch_port_settings_manager.period_secs = 30 region_replacement.period_secs = 30 service_firewall_propagation.period_secs = 300 +instance_watcher.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From 10bc9374eddafe710ebf28a4e3378f4772724d7e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 09:45:53 -0700 Subject: [PATCH 50/69] oh, so *that's* how you add to the schema --- nexus/db-model/src/schema_versions.rs | 3 ++- schema/crdb/dbinit.sql | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index c4510c02be..a86d030e48 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. 
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(59, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(60, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(60, "add-lookup-vmm-by-sled-id-index"), KnownVersion::new(59, "enforce-first-as-default"), KnownVersion::new(58, "insert-default-allowlist"), KnownVersion::new(57, "add-allowed-source-ips"), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 18b4742caf..e7025f2499 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3846,7 +3846,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '59.0.0', NULL) + (TRUE, NOW(), NOW(), '60.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From 065b1491e40fa3d24a2a4e8251f0b24e582cffa5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 12:26:46 -0700 Subject: [PATCH 51/69] add a test for instance watcher timeseries --- nexus/tests/integration_tests/metrics.rs | 128 +++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 62c24a73e3..f74569b7c4 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -277,6 +277,134 @@ async fn test_timeseries_schema_list( .expect("Failed to find HTTP request latency histogram schema"); } +pub async fn timeseries_query( + cptestctx: &ControlPlaneTestContext, + query: impl ToString, +) -> Vec { + // first, make sure the latest timeseries have been collected. + cptestctx.oximeter.force_collect().await; + + // okay, do the query + let body = nexus_types::external_api::params::TimeseriesQuery { + query: query.to_string(), + }; + let query = dbg!(&body.query); + let rsp = NexusRequest::new( + nexus_test_utils::http_testing::RequestBuilder::new( + &cptestctx.external_client, + http::Method::POST, + "/v1/timeseries/query", + ) + .body(Some(&body)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap_or_else(|e| { + panic!("timeseries query failed: {e:?}\nquery: {query}") + }); + dbg!(rsp).parsed_body().unwrap_or_else(|e| { + panic!( + "could not parse timeseries query response: {e:?}\nquery: {query}" + ); + }) +} + +#[nexus_test] +async fn test_instance_watcher_metrics( + cptestctx: &ControlPlaneTestContext, +) { + use oximeter::types::FieldValue; + + let client = &cptestctx.external_client; + let internal_client = &cptestctx.internal_client; + + let kick_instance_watcher = || async { + internal_client + .make_request( + http::Method::POST, + "/bgtasks/activate", + Some(serde_json::json!({ + "bgtask_names": vec![String::from("instance_watcher")] + })), + http::StatusCode::NO_CONTENT, + ) + .await + .unwrap(); + // bleh... + tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + }; + + create_default_ip_pool(&client).await; // needed for instance create to work + // Wait until Nexus registers as a producer with Oximeter. + wait_for_producer( + &cptestctx.oximeter, + cptestctx.server.apictx().nexus.id(), + ) + .await; + + create_project(&client, "p-1").await; + let instance1 = create_instance(&client, "p-1", "i-1").await; + let instance1_uuid = instance1.identity.id; + + // activate the instance watcher background task. 
+ kick_instance_watcher().await; + + let metrics = + dbg!(timeseries_query(&cptestctx, "{ get virtual_machine:check, get virtual_machine:incomplete_check }").await); + let checks = metrics + .iter() + .find(|t| t.name() == "virtual_machine:check") + .expect("missing virtual_machine:check"); + let ts = checks + .timeseries() + .find(|ts| { + ts.fields.get("instance_id").unwrap() + == &FieldValue::Uuid(instance1_uuid) + }) + .expect("missing timeseries for instance1 checks"); + assert_eq!( + ts.fields.get("status").unwrap(), + &FieldValue::String("starting".to_string()) + ); + + // okay, make another instance + let instance2 = create_instance(&client, "p-1", "i-2").await; + let instance2_uuid = instance2.identity.id; + + // activate the instance watcher background task. + kick_instance_watcher().await; + + let metrics = + dbg!(timeseries_query(&cptestctx, "get virtual_machine:check").await); + let checks = metrics + .iter() + .find(|t| t.name() == "virtual_machine:check") + .expect("missing virtual_machine:check"); + let ts1 = checks + .timeseries() + .find(|ts| { + ts.fields.get("instance_id").unwrap() + == &FieldValue::Uuid(instance1_uuid) + }) + .expect("missing timeseries for instance1 checks"); + let ts2 = checks + .timeseries() + .find(|ts| { + ts.fields.get("instance_id").unwrap() + == &FieldValue::Uuid(instance2_uuid) + }) + .expect("missing timeseries for instance2 checks"); + assert_eq!( + ts1.fields.get("status").unwrap(), + &FieldValue::String("starting".to_string()) + ); + assert_eq!( + ts2.fields.get("status").unwrap(), + &FieldValue::String("starting".to_string()) + ); +} + /// Wait until a producer is registered with Oximeter. /// /// This blocks until the producer is registered, for up to 60s. It panics if From 75dac3985c4e056e6574c013b7fd5214a044402f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 14:00:37 -0700 Subject: [PATCH 52/69] whoops i meant to fix that one too --- nexus/tests/integration_tests/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index f74569b7c4..2379ed3bc0 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -351,7 +351,7 @@ async fn test_instance_watcher_metrics( kick_instance_watcher().await; let metrics = - dbg!(timeseries_query(&cptestctx, "{ get virtual_machine:check, get virtual_machine:incomplete_check }").await); + dbg!(timeseries_query(&cptestctx, "get virtual_machine:check").await); let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") From 8392b121fb293801185dcae1281ce28a882a36fc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 15:22:44 -0700 Subject: [PATCH 53/69] unbreak expected metric fields --- nexus/tests/integration_tests/metrics.rs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 2379ed3bc0..2bf260bc25 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -315,6 +315,9 @@ async fn test_instance_watcher_metrics( cptestctx: &ControlPlaneTestContext, ) { use oximeter::types::FieldValue; + const INSTANCE_ID_FIELD: &str = "instance_id"; + const STATE_FIELD: &str = "state"; + const STATE_STARTING: &str = "starting"; let client = &cptestctx.external_client; let internal_client = &cptestctx.internal_client; @@ -359,13 +362,13 @@ async fn 
test_instance_watcher_metrics( let ts = checks .timeseries() .find(|ts| { - ts.fields.get("instance_id").unwrap() + ts.fields.get(INSTANCE_ID_FIELD).unwrap() == &FieldValue::Uuid(instance1_uuid) }) .expect("missing timeseries for instance1 checks"); assert_eq!( - ts.fields.get("status").unwrap(), - &FieldValue::String("starting".to_string()) + ts.fields.get(STATE_FIELD).unwrap(), + &FieldValue::String(STATE_STARTING.to_string()) ); // okay, make another instance @@ -384,24 +387,24 @@ async fn test_instance_watcher_metrics( let ts1 = checks .timeseries() .find(|ts| { - ts.fields.get("instance_id").unwrap() + ts.fields.get(INSTANCE_ID_FIELD).unwrap() == &FieldValue::Uuid(instance1_uuid) }) .expect("missing timeseries for instance1 checks"); let ts2 = checks .timeseries() .find(|ts| { - ts.fields.get("instance_id").unwrap() + ts.fields.get(INSTANCE_ID_FIELD).unwrap() == &FieldValue::Uuid(instance2_uuid) }) .expect("missing timeseries for instance2 checks"); assert_eq!( - ts1.fields.get("status").unwrap(), - &FieldValue::String("starting".to_string()) + ts1.fields.get(STATE_FIELD).unwrap(), + &FieldValue::String(STATE_STARTING.to_string()) ); assert_eq!( - ts2.fields.get("status").unwrap(), - &FieldValue::String("starting".to_string()) + ts2.fields.get(STATE_FIELD).unwrap(), + &FieldValue::String(STATE_STARTING.to_string()) ); } From c3694fc539c1a8e0e81799814d17093da46c6612 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 15:23:05 -0700 Subject: [PATCH 54/69] nicer code for finding instance timeserieses --- nexus/tests/integration_tests/metrics.rs | 45 +++++++++++++----------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 2bf260bc25..b383ce355a 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -338,6 +338,27 @@ async fn test_instance_watcher_metrics( tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; }; + #[track_caller] + fn timeseries_for_instance( + table: &oximeter_db::oxql::Table, + instance_id: Uuid, + ) -> &oximeter_db::oxql::Timeseries { + let uuid = FieldValue::Uuid(instance_id); + let mut timeserieses = table + .timeseries() + .filter(move |ts| ts.fields.get(INSTANCE_ID_FIELD) == Some(&uuid)); + let Some(timeseries) = timeserieses.next() else { + panic!("missing timeseries for instance {instance_id}") + }; + if let Some(timeseries) = timeserieses.next() { + panic!( + "multiple timeseries for instance {instance_id}: \ + {timeseries:?}, {timeseries:?}, ..." + ) + } + timeseries + } + create_default_ip_pool(&client).await; // needed for instance create to work // Wait until Nexus registers as a producer with Oximeter. 
wait_for_producer( @@ -359,13 +380,7 @@ async fn test_instance_watcher_metrics( .iter() .find(|t| t.name() == "virtual_machine:check") .expect("missing virtual_machine:check"); - let ts = checks - .timeseries() - .find(|ts| { - ts.fields.get(INSTANCE_ID_FIELD).unwrap() - == &FieldValue::Uuid(instance1_uuid) - }) - .expect("missing timeseries for instance1 checks"); + let ts = timeseries_for_instance(&checks, instance1_uuid); assert_eq!( ts.fields.get(STATE_FIELD).unwrap(), &FieldValue::String(STATE_STARTING.to_string()) @@ -384,20 +399,8 @@ async fn test_instance_watcher_metrics( .iter() .find(|t| t.name() == "virtual_machine:check") .expect("missing virtual_machine:check"); - let ts1 = checks - .timeseries() - .find(|ts| { - ts.fields.get(INSTANCE_ID_FIELD).unwrap() - == &FieldValue::Uuid(instance1_uuid) - }) - .expect("missing timeseries for instance1 checks"); - let ts2 = checks - .timeseries() - .find(|ts| { - ts.fields.get(INSTANCE_ID_FIELD).unwrap() - == &FieldValue::Uuid(instance2_uuid) - }) - .expect("missing timeseries for instance2 checks"); + let ts1 = timeseries_for_instance(&checks, instance1_uuid); + let ts2 = timeseries_for_instance(&checks, instance2_uuid); assert_eq!( ts1.fields.get(STATE_FIELD).unwrap(), &FieldValue::String(STATE_STARTING.to_string()) From 40d553b88382c3de9af5a6164de0d50abdca352c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 16:16:02 -0700 Subject: [PATCH 55/69] WELL IF IT ISN'T THE CONSEQUENCES OF MY OWN PRS --- nexus/tests/integration_tests/metrics.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index b383ce355a..9fb88fe075 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -383,7 +383,7 @@ async fn test_instance_watcher_metrics( let ts = timeseries_for_instance(&checks, instance1_uuid); assert_eq!( ts.fields.get(STATE_FIELD).unwrap(), - &FieldValue::String(STATE_STARTING.to_string()) + &FieldValue::from(STATE_STARTING) ); // okay, make another instance @@ -403,11 +403,11 @@ async fn test_instance_watcher_metrics( let ts2 = timeseries_for_instance(&checks, instance2_uuid); assert_eq!( ts1.fields.get(STATE_FIELD).unwrap(), - &FieldValue::String(STATE_STARTING.to_string()) + &FieldValue::from(STATE_STARTING) ); assert_eq!( ts2.fields.get(STATE_FIELD).unwrap(), - &FieldValue::String(STATE_STARTING.to_string()) + &FieldValue::from(STATE_STARTING) ); } From f355b757adbcb590901e59ad827198094b29deaa Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 16:16:18 -0700 Subject: [PATCH 56/69] fix schema diff --- schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql b/schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql index d280ff1e05..7f9262e4fe 100644 --- a/schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql +++ b/schema/crdb/add-lookup-vmm-by-sled-id-index/up.sql @@ -1,3 +1,3 @@ -CREATE UNIQUE INDEX IF NOT EXISTS lookup_vmms_by_sled_id ON omicron.public.vmm ( +CREATE INDEX IF NOT EXISTS lookup_vmms_by_sled_id ON omicron.public.vmm ( sled_id ) WHERE time_deleted IS NULL; From d4fdfffd673669885a826546fc06d39c2dcfb674 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 16:18:52 -0700 Subject: [PATCH 57/69] also update omdb tests --- dev-tools/omdb/tests/successes.out | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index b4dfac712c..d7711610bd 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -407,8 +407,8 @@ task: "instance_watcher" started at (s ago) and ran for ms total instances checked: 0 checks completed: 0 - -> successful checks: 0 - -> failed checks: 0 + successful checks: 0 + failed checks: 0 checks that could not be completed: 0 stale instance metrics pruned: 0 From 96d7b71cb1e75976ef197ca5acda52e3971f3222 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 16:29:01 -0700 Subject: [PATCH 58/69] allocate way fewer strings with `Cow` --- nexus/src/app/background/instance_watcher.rs | 91 ++++++++++---------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 02efbb09ea..d536ce9d52 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -17,8 +17,8 @@ use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::SledInstanceState; use oximeter::types::ProducerRegistry; use sled_agent_client::Client as SledAgentClient; +use std::borrow::Cow; use std::collections::BTreeMap; -use std::fmt; use std::future::Future; use std::net::IpAddr; use std::num::NonZeroU32; @@ -237,6 +237,22 @@ enum CheckOutcome { Failure(Failure), } +impl CheckOutcome { + fn state_str(&self) -> Cow<'static, str> { + match self { + Self::Success(state) => state.label().into(), + Self::Failure(reason) => InstanceState::Failed.label().into(), + } + } + + fn reason_str(&self) -> Cow<'static, str> { + match self { + Self::Success(_) => "success".into(), + Self::Failure(reason) => reason.as_str(), + } + } +} + #[derive( Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, )] @@ -255,6 +271,16 @@ enum Failure { NoSuchInstance, } +impl Failure { + fn as_str(&self) -> Cow<'static, str> { + match self { + Self::SledAgentUnreachable => "unreachable".into(), + Self::SledAgentResponse(status) => status.to_string().into(), + Self::NoSuchInstance => "no_such_instance".into(), + } + } +} + #[derive( Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, )] @@ -277,34 +303,13 @@ enum Incomplete { UpdateFailed, } -impl fmt::Display for CheckOutcome { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl Incomplete { + fn as_str(&self) -> Cow<'static, str> { match self { - Self::Success(state) => write!(f, "{state}"), - Self::Failure(reason) => write!(f, "{reason}"), - } - } -} - -impl fmt::Display for Failure { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::SledAgentUnreachable => f.write_str("unreachable"), - Self::SledAgentResponse(status) => { - write!(f, "{status}") - } - Self::NoSuchInstance => f.write_str("no_such_instance"), - } - } -} - -impl fmt::Display for Incomplete { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::ClientHttpError(status) => write!(f, "{status}"), - Self::ClientError => f.write_str("client_error"), - Self::InstanceNotFound => f.write_str("instance_not_found"), - Self::UpdateFailed => f.write_str("update_failed"), + Self::ClientHttpError(status) => status.to_string().into(), + Self::ClientError => "client_error".into(), + Self::InstanceNotFound => "instance_not_found".into(), + Self::UpdateFailed => "update_failed".into(), } } } @@ -420,13 +425,13 @@ impl BackgroundTask for 
InstanceWatcher { .or_default() += 1; } CheckOutcome::Failure(reason) => { - *check_failures.entry(reason.to_string()).or_default() += 1; + *check_failures.entry(reason.as_str().to_owned()).or_default() += 1; } } } if let Err(reason) = result { metric.check_error(reason); - *check_errors.entry(reason.to_string()).or_default() += 1; + *check_errors.entry(reason.as_str().to_owned()).or_default() += 1; } } @@ -450,11 +455,12 @@ impl BackgroundTask for InstanceWatcher { } mod metrics { - use super::{CheckOutcome, Incomplete, InstanceState, VirtualMachine}; + use super::{CheckOutcome, Incomplete, VirtualMachine}; use oximeter::types::Cumulative; use oximeter::Metric; use oximeter::MetricsError; use oximeter::Sample; + use std::borrow::Cow; use std::collections::BTreeMap; use std::sync::Arc; use std::sync::Mutex; @@ -512,17 +518,10 @@ mod metrics { pub(super) fn completed(&mut self, outcome: CheckOutcome) { self.checks .entry(outcome) - .or_insert_with(|| match outcome { - CheckOutcome::Failure(failure) => Check { - state: InstanceState::Failed.to_string(), - reason: failure.to_string(), - datum: Cumulative::default(), - }, - CheckOutcome::Success(state) => Check { - state: state.to_string(), - reason: "success".to_string(), - datum: Cumulative::default(), - }, + .or_insert_with(|| Check { + state: outcome.state_str(), + reason: outcome.reason_str(), + datum: Cumulative::default(), }) .datum += 1; @@ -533,7 +532,7 @@ mod metrics { self.check_errors .entry(reason) .or_insert_with(|| IncompleteCheck { - reason: reason.to_string(), + reason: reason.as_str(), datum: Cumulative::default(), }) .datum += 1; @@ -564,7 +563,7 @@ mod metrics { struct Check { /// The string representation of the instance's state as understood by /// the VMM. If the check failed, this will generally be "failed". - state: String, + state: Cow<'static, str>, /// `Why the instance was marked as being in this state. /// /// If an instance was marked as "failed" due to a check failure, this @@ -575,7 +574,7 @@ mod metrics { /// sled-agent, and the *sled-agent* reported that the instance has /// failed --- which is distinct from the instance watcher marking an /// instance as failed due to a failed check. - reason: String, + reason: Cow<'static, str>, /// this will be a string representation of the failure reason. datum: Cumulative, } @@ -586,7 +585,7 @@ mod metrics { /// The reason why the check was unsuccessful. /// /// This is generated from the [`Incomplete`] enum's `Display` implementation. - reason: String, + reason: Cow<'static, str>, /// The number of failed checks for this instance and sled agent. datum: Cumulative, } From bd373300a293ab21cc8102cf8c0603aea3422411 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 16:29:37 -0700 Subject: [PATCH 59/69] fixup stray comment --- nexus/src/app/background/instance_watcher.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index d536ce9d52..f38b5280ec 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -575,7 +575,8 @@ mod metrics { /// failed --- which is distinct from the instance watcher marking an /// instance as failed due to a failed check. reason: Cow<'static, str>, - /// this will be a string representation of the failure reason. + /// The number of checks for this instance and sled agent which recorded + /// this state for this reason. 
datum: Cumulative, } From 23c01d36b62cd41c1aa281b2fd19df23af99ba31 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 8 May 2024 16:55:27 -0700 Subject: [PATCH 60/69] blegh --- nexus/src/app/background/instance_watcher.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index f38b5280ec..efdec7292d 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -241,7 +241,7 @@ impl CheckOutcome { fn state_str(&self) -> Cow<'static, str> { match self { Self::Success(state) => state.label().into(), - Self::Failure(reason) => InstanceState::Failed.label().into(), + Self::Failure(_) => InstanceState::Failed.label().into(), } } @@ -425,13 +425,13 @@ impl BackgroundTask for InstanceWatcher { .or_default() += 1; } CheckOutcome::Failure(reason) => { - *check_failures.entry(reason.as_str().to_owned()).or_default() += 1; + *check_failures.entry(reason.as_str().into_owned()).or_default() += 1; } } } if let Err(reason) = result { metric.check_error(reason); - *check_errors.entry(reason.as_str().to_owned()).or_default() += 1; + *check_errors.entry(reason.as_str().into_owned()).or_default() += 1; } } From c96c7d724407b20060bf0de766bdead86f65f7c9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 9 May 2024 08:32:46 -0700 Subject: [PATCH 61/69] clean up target construction slightly --- nexus/src/app/background/instance_watcher.rs | 44 ++++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index efdec7292d..23da1974b4 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -7,7 +7,10 @@ use super::common::BackgroundTask; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; +use nexus_db_model::Instance; +use nexus_db_model::Project; use nexus_db_model::Sled; +use nexus_db_model::Vmm; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; @@ -206,6 +209,26 @@ struct VirtualMachine { sled_agent_port: u16, } +impl VirtualMachine { + fn new( + sled: &Sled, + instance: &Instance, + vmm: &Vmm, + project: &Project, + ) -> Self { + let addr = sled.address(); + Self { + instance_id: instance.id(), + silo_id: project.silo_id, + project_id: project.id(), + vmm_id: vmm.id, + sled_agent_id: sled.id(), + sled_agent_ip: (*addr.ip()).into(), + sled_agent_port: addr.port(), + } + } +} + struct Check { target: VirtualMachine, @@ -363,15 +386,7 @@ impl BackgroundTask for InstanceWatcher { let mut batch = batch.into_iter(); if let Some((mut curr_sled, instance, vmm, project)) = batch.next() { let mut client = mk_client(&curr_sled); - let target = VirtualMachine { - instance_id: instance.id(), - silo_id: project.silo_id, - project_id: project.id(), - vmm_id: vmm.id, - sled_agent_id: curr_sled.id(), - sled_agent_ip: (*curr_sled.address().ip()).into(), - sled_agent_port: curr_sled.address().port(), - }; + let target = VirtualMachine::new(&curr_sled, &instance, &vmm, &project); tasks.spawn(self.check_instance(opctx, &client, target)); for (sled, instance, vmm, project) in batch { @@ -380,15 +395,8 @@ impl BackgroundTask for InstanceWatcher { client = mk_client(&sled); curr_sled = sled; } - let target = VirtualMachine { - instance_id: instance.id(), - silo_id: project.silo_id, - project_id: 
project.id(), - vmm_id: vmm.id, - sled_agent_id: curr_sled.id(), - sled_agent_ip: (*curr_sled.address().ip()).into(), - sled_agent_port: curr_sled.address().port(), - }; + + let target = VirtualMachine::new(&curr_sled, &instance, &vmm, &project); tasks.spawn(self.check_instance(opctx, &client, target)); } } From d70388591c3e4ef5f512ca98a3665a0a8dbfd2c8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 9 May 2024 08:45:36 -0700 Subject: [PATCH 62/69] prune instances only *after* checks have completed --- nexus/src/app/background/instance_watcher.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 23da1974b4..239d9b8aac 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -402,10 +402,6 @@ impl BackgroundTask for InstanceWatcher { } } - // All requests fired off! While we wait for them to come back, - // let's prune old instances. - let pruned = self.metrics.lock().unwrap().prune(); - // Now, wait for the check results to come back. let mut total: usize = 0; let mut instance_states: BTreeMap = @@ -443,6 +439,12 @@ impl BackgroundTask for InstanceWatcher { } } + // All requests completed! Prune any old instance metrics for + // instances that we didn't check --- if we didn't spawn a check for + // something, that means it wasn't present in the most recent + // database query. + let pruned = self.metrics.lock().unwrap().prune(); + slog::info!(opctx.log, "all instance checks complete"; "total_instances" => total, "total_completed" => instance_states.len() + check_failures.len(), From d1d3fe2f58cc14ae7f6487f925dc76c12ded1e19 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 9 May 2024 08:52:03 -0700 Subject: [PATCH 63/69] clone slightly less stuff --- nexus/src/app/background/instance_watcher.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 239d9b8aac..96d1eaad35 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -30,7 +30,6 @@ use std::sync::Mutex; use uuid::Uuid; /// Background task that periodically checks instance states. -#[derive(Clone)] pub(crate) struct InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, @@ -61,7 +60,8 @@ impl InstanceWatcher { client: &SledAgentClient, target: VirtualMachine, ) -> impl Future + Send + 'static { - let watcher = self.clone(); + let datastore = self.datastore.clone(); + let resolver = self.resolver.clone(); let opctx = opctx.child( std::iter::once(( @@ -73,7 +73,6 @@ impl InstanceWatcher { let client = client.clone(); async move { - let InstanceWatcher { datastore, resolver, .. 
} = watcher; slog::trace!(opctx.log, "checking on instance..."); let rsp = client.instance_get_state(&target.instance_id).await; let mut check = Check { target, outcome: None, result: Ok(()) }; From 8be9a8d1ec264b112536bf00f4887330b58bbde5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 9 May 2024 10:07:55 -0700 Subject: [PATCH 64/69] less flaky way of activating instance watcher in test --- Cargo.lock | 1 + nexus/Cargo.toml | 1 + nexus/tests/integration_tests/metrics.rs | 65 ++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a362c64eef..0a6dd178e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5392,6 +5392,7 @@ dependencies = [ "itertools 0.12.1", "macaddr", "mg-admin-client", + "nexus-client", "nexus-config", "nexus-db-model", "nexus-db-queries", diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 5222d104c3..c68a567044 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -39,6 +39,7 @@ itertools.workspace = true macaddr.workspace = true # Not under "dev-dependencies"; these also need to be implemented for # integration tests. +nexus-client.workspace = true nexus-config.workspace = true nexus-networking.workspace = true nexus-test-interface.workspace = true diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 9fb88fe075..1c2a88e9ac 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -322,7 +322,42 @@ async fn test_instance_watcher_metrics( let client = &cptestctx.external_client; let internal_client = &cptestctx.internal_client; - let kick_instance_watcher = || async { + // TODO(eliza): consider factoring this out to a generic + // `activate_background_task` function in `nexus-test-utils` eventually? + let activate_instance_watcher = || async { + use nexus_client::types::BackgroundTask; + use nexus_client::types::CurrentStatus; + use nexus_client::types::CurrentStatusRunning; + use nexus_client::types::LastResult; + use nexus_client::types::LastResultCompleted; + + fn most_recent_start_time( + task: &BackgroundTask, + ) -> Option> { + match task.current { + CurrentStatus::Idle => match task.last { + LastResult::Completed(LastResultCompleted { + start_time, + .. + }) => Some(start_time), + LastResult::NeverCompleted => None, + }, + CurrentStatus::Running(CurrentStatusRunning { + start_time, + .. + }) => Some(start_time), + } + } + + eprintln!("\n --- activating instance watcher ---\n"); + let task = NexusRequest::object_get( + internal_client, + "/bgtasks/view/instance_watcher", + ) + .execute_and_parse_unwrap::() + .await; + let last_start = most_recent_start_time(&task); + internal_client .make_request( http::Method::POST, @@ -334,8 +369,28 @@ async fn test_instance_watcher_metrics( ) .await .unwrap(); - // bleh... 
- tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + // Wait for the instance watcher task to finish + wait_for_condition( + || async { + let task = NexusRequest::object_get( + internal_client, + "/bgtasks/view/instance_watcher", + ) + .execute_and_parse_unwrap::() + .await; + if matches!(&task.current, CurrentStatus::Idle) + && most_recent_start_time(&task) > last_start + { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_millis(500), + &Duration::from_secs(60), + ) + .await + .unwrap(); }; #[track_caller] @@ -372,7 +427,7 @@ async fn test_instance_watcher_metrics( let instance1_uuid = instance1.identity.id; // activate the instance watcher background task. - kick_instance_watcher().await; + activate_instance_watcher().await; let metrics = dbg!(timeseries_query(&cptestctx, "get virtual_machine:check").await); @@ -391,7 +446,7 @@ async fn test_instance_watcher_metrics( let instance2_uuid = instance2.identity.id; // activate the instance watcher background task. - kick_instance_watcher().await; + activate_instance_watcher().await; let metrics = dbg!(timeseries_query(&cptestctx, "get virtual_machine:check").await); From 406e69d4333358ac926a6f2251989c34bb3ab759 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 9 May 2024 11:33:45 -0700 Subject: [PATCH 65/69] simulate more instance state transitions in test --- nexus/tests/integration_tests/instances.rs | 4 +- nexus/tests/integration_tests/metrics.rs | 169 ++++++++++++++++----- 2 files changed, 136 insertions(+), 37 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 08cf195384..0b5947ef7e 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -116,7 +116,9 @@ fn default_vpc_subnets_url() -> String { format!("/v1/vpc-subnets?{}&vpc=default", get_project_selector()) } -async fn create_project_and_pool(client: &ClientTestContext) -> views::Project { +pub async fn create_project_and_pool( + client: &ClientTestContext, +) -> views::Project { create_default_ip_pool(client).await; create_project(client, PROJECT_NAME).await } diff --git a/nexus/tests/integration_tests/metrics.rs b/nexus/tests/integration_tests/metrics.rs index 1c2a88e9ac..ec44c3747a 100644 --- a/nexus/tests/integration_tests/metrics.rs +++ b/nexus/tests/integration_tests/metrics.rs @@ -4,6 +4,9 @@ use std::time::Duration; +use crate::integration_tests::instances::{ + create_project_and_pool, instance_post, instance_simulate, InstanceOp, +}; use chrono::Utc; use dropshot::test_util::ClientTestContext; use dropshot::ResultsPage; @@ -288,7 +291,7 @@ pub async fn timeseries_query( let body = nexus_types::external_api::params::TimeseriesQuery { query: query.to_string(), }; - let query = dbg!(&body.query); + let query = &body.query; let rsp = NexusRequest::new( nexus_test_utils::http_testing::RequestBuilder::new( &cptestctx.external_client, @@ -303,9 +306,10 @@ pub async fn timeseries_query( .unwrap_or_else(|e| { panic!("timeseries query failed: {e:?}\nquery: {query}") }); - dbg!(rsp).parsed_body().unwrap_or_else(|e| { + rsp.parsed_body().unwrap_or_else(|e| { panic!( - "could not parse timeseries query response: {e:?}\nquery: {query}" + "could not parse timeseries query response: {e:?}\n\ + query: {query}\nresponse: {rsp:#?}" ); }) } @@ -318,9 +322,13 @@ async fn test_instance_watcher_metrics( const INSTANCE_ID_FIELD: &str = "instance_id"; const STATE_FIELD: &str = "state"; const STATE_STARTING: &str = "starting"; + 
const STATE_RUNNING: &str = "running"; + const STATE_STOPPING: &str = "stopping"; + const OXQL_QUERY: &str = "get virtual_machine:check"; let client = &cptestctx.external_client; let internal_client = &cptestctx.internal_client; + let nexus = &cptestctx.server.apictx().nexus; // TODO(eliza): consider factoring this out to a generic // `activate_background_task` function in `nexus-test-utils` eventually? @@ -394,76 +402,165 @@ async fn test_instance_watcher_metrics( }; #[track_caller] - fn timeseries_for_instance( + fn count_state( table: &oximeter_db::oxql::Table, instance_id: Uuid, - ) -> &oximeter_db::oxql::Timeseries { + state: &'static str, + ) -> i64 { + use oximeter_db::oxql::point::ValueArray; let uuid = FieldValue::Uuid(instance_id); - let mut timeserieses = table - .timeseries() - .filter(move |ts| ts.fields.get(INSTANCE_ID_FIELD) == Some(&uuid)); + let state = FieldValue::String(state.into()); + let mut timeserieses = table.timeseries().filter(|ts| { + ts.fields.get(INSTANCE_ID_FIELD) == Some(&uuid) + && ts.fields.get(STATE_FIELD) == Some(&state) + }); let Some(timeseries) = timeserieses.next() else { - panic!("missing timeseries for instance {instance_id}") + panic!( + "missing timeseries for instance {instance_id}, state {state}\n\ + found: {table:#?}" + ) }; if let Some(timeseries) = timeserieses.next() { panic!( - "multiple timeseries for instance {instance_id}: \ - {timeseries:?}, {timeseries:?}, ..." + "multiple timeseries for instance {instance_id}, state {state}: \ + {timeseries:?}, {timeseries:?}, ...\n\ + found: {table:#?}" ) } - timeseries + match timeseries.points.values(0) { + Some(ValueArray::Integer(ref vals)) => { + vals.iter().filter_map(|&v| v).sum() + } + x => panic!( + "expected timeseries for instance {instance_id}, \ + state {state} to be an integer, but found: {x:?}" + ), + } } - create_default_ip_pool(&client).await; // needed for instance create to work - // Wait until Nexus registers as a producer with Oximeter. + // N.B. that we've gotta use the project name that this function hardcodes + // if we're going to use the `instance_post` test helper later. + let project = create_project_and_pool(&client).await; + let project_name = project.identity.name.as_str(); + // Wait until Nexus registers as a producer with Oximeter. wait_for_producer( &cptestctx.oximeter, cptestctx.server.apictx().nexus.id(), ) .await; - create_project(&client, "p-1").await; - let instance1 = create_instance(&client, "p-1", "i-1").await; + eprintln!("--- creating instance 1 ---"); + let instance1 = create_instance(&client, project_name, "i-1").await; let instance1_uuid = instance1.identity.id; // activate the instance watcher background task. activate_instance_watcher().await; - let metrics = - dbg!(timeseries_query(&cptestctx, "get virtual_machine:check").await); + let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") .expect("missing virtual_machine:check"); - let ts = timeseries_for_instance(&checks, instance1_uuid); - assert_eq!( - ts.fields.get(STATE_FIELD).unwrap(), - &FieldValue::from(STATE_STARTING) - ); + let ts = dbg!(count_state(&checks, instance1_uuid, STATE_STARTING)); + assert_eq!(ts, 1); // okay, make another instance - let instance2 = create_instance(&client, "p-1", "i-2").await; + eprintln!("--- creating instance 2 ---"); + let instance2 = create_instance(&client, project_name, "i-2").await; let instance2_uuid = instance2.identity.id; // activate the instance watcher background task. 
activate_instance_watcher().await; - let metrics = - dbg!(timeseries_query(&cptestctx, "get virtual_machine:check").await); + let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; let checks = metrics .iter() .find(|t| t.name() == "virtual_machine:check") .expect("missing virtual_machine:check"); - let ts1 = timeseries_for_instance(&checks, instance1_uuid); - let ts2 = timeseries_for_instance(&checks, instance2_uuid); - assert_eq!( - ts1.fields.get(STATE_FIELD).unwrap(), - &FieldValue::from(STATE_STARTING) - ); - assert_eq!( - ts2.fields.get(STATE_FIELD).unwrap(), - &FieldValue::from(STATE_STARTING) - ); + let ts1 = dbg!(count_state(&checks, instance1_uuid, STATE_STARTING)); + let ts2 = dbg!(count_state(&checks, instance2_uuid, STATE_STARTING)); + assert_eq!(ts1, 2); + assert_eq!(ts2, 1); + + // poke instance 1 to get it into the running state + eprintln!("--- starting instance 1 ---"); + instance_simulate(nexus, &instance1_uuid).await; + + // activate the instance watcher background task. + activate_instance_watcher().await; + + let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let checks = metrics + .iter() + .find(|t| t.name() == "virtual_machine:check") + .expect("missing virtual_machine:check"); + let ts1_starting = + dbg!(count_state(&checks, instance1_uuid, STATE_STARTING)); + let ts1_running = dbg!(count_state(&checks, instance1_uuid, STATE_RUNNING)); + let ts2 = dbg!(count_state(&checks, instance2_uuid, STATE_STARTING)); + assert_eq!(ts1_starting, 2); + assert_eq!(ts1_running, 1); + assert_eq!(ts2, 2); + + // poke instance 2 to get it into the Running state. + eprintln!("--- starting instance 2 ---"); + instance_simulate(nexus, &instance2_uuid).await; + // stop instance 1 + eprintln!("--- start stopping instance 1 ---"); + instance_simulate(nexus, &instance1_uuid).await; + instance_post(&client, &instance1.identity.name.as_str(), InstanceOp::Stop) + .await; + + // activate the instance watcher background task. + activate_instance_watcher().await; + + let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let checks = metrics + .iter() + .find(|t| t.name() == "virtual_machine:check") + .expect("missing virtual_machine:check"); + + let ts1_starting = + dbg!(count_state(&checks, instance1_uuid, STATE_STARTING)); + let ts1_running = dbg!(count_state(&checks, instance1_uuid, STATE_RUNNING)); + let ts1_stopping = + dbg!(count_state(&checks, instance1_uuid, STATE_STOPPING)); + let ts2_starting = + dbg!(count_state(&checks, instance2_uuid, STATE_STARTING)); + let ts2_running = dbg!(count_state(&checks, instance2_uuid, STATE_RUNNING)); + assert_eq!(ts1_starting, 2); + assert_eq!(ts1_running, 1); + assert_eq!(ts1_stopping, 1); + assert_eq!(ts2_starting, 2); + assert_eq!(ts2_running, 1); + + // simulate instance 1 completing its stop, which will remove it from the + // set of active instances in CRDB. now, it won't be checked again. + + eprintln!("--- finish stopping instance 1 ---"); + instance_simulate(nexus, &instance1_uuid).await; + + // activate the instance watcher background task. 
+ activate_instance_watcher().await; + + let metrics = timeseries_query(&cptestctx, OXQL_QUERY).await; + let checks = metrics + .iter() + .find(|t| t.name() == "virtual_machine:check") + .expect("missing virtual_machine:check"); + let ts1_starting = + dbg!(count_state(&checks, instance1_uuid, STATE_STARTING)); + let ts1_running = dbg!(count_state(&checks, instance1_uuid, STATE_RUNNING)); + let ts1_stopping = + dbg!(count_state(&checks, instance1_uuid, STATE_STOPPING)); + let ts2_starting = + dbg!(count_state(&checks, instance2_uuid, STATE_STARTING)); + let ts2_running = dbg!(count_state(&checks, instance2_uuid, STATE_RUNNING)); + assert_eq!(ts1_starting, 2); + assert_eq!(ts1_running, 1); + assert_eq!(ts1_stopping, 1); + assert_eq!(ts2_starting, 2); + assert_eq!(ts2_running, 2); } /// Wait until a producer is registered with Oximeter. From 08e01adccc0c5a028c5dcca2554ee234ec76fc2c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 10 May 2024 10:58:07 -0700 Subject: [PATCH 66/69] add nexus UUID to check metrics --- nexus/src/app/background/instance_watcher.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 96d1eaad35..2220672c78 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -34,6 +34,7 @@ pub(crate) struct InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, metrics: Arc>, + nexus_id: Uuid, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -46,12 +47,13 @@ impl InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, producer_registry: &ProducerRegistry, + nexus_id: Uuid, ) -> Self { let metrics = Arc::new(Mutex::new(metrics::Metrics::default())); producer_registry .register_producer(metrics::Producer(metrics.clone())) .unwrap(); - Self { datastore, resolver, metrics } + Self { datastore, resolver, metrics, nexus_id } } fn check_instance( @@ -192,6 +194,8 @@ impl InstanceWatcher { Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, oximeter::Target, )] struct VirtualMachine { + /// The ID of the Nexus process which performed the health check. + nexus_id: Uuid, /// The instance's ID. instance_id: Uuid, /// The silo ID of the instance's silo. 
@@ -210,6 +214,7 @@ struct VirtualMachine { impl VirtualMachine { fn new( + nexus_id: Uuid, sled: &Sled, instance: &Instance, vmm: &Vmm, @@ -217,6 +222,7 @@ impl VirtualMachine { ) -> Self { let addr = sled.address(); Self { + nexus_id, instance_id: instance.id(), silo_id: project.silo_id, project_id: project.id(), @@ -385,7 +391,7 @@ impl BackgroundTask for InstanceWatcher { let mut batch = batch.into_iter(); if let Some((mut curr_sled, instance, vmm, project)) = batch.next() { let mut client = mk_client(&curr_sled); - let target = VirtualMachine::new(&curr_sled, &instance, &vmm, &project); + let target = VirtualMachine::new(self.nexus_id, &curr_sled, &instance, &vmm, &project); tasks.spawn(self.check_instance(opctx, &client, target)); for (sled, instance, vmm, project) in batch { @@ -395,7 +401,7 @@ impl BackgroundTask for InstanceWatcher { curr_sled = sled; } - let target = VirtualMachine::new(&curr_sled, &instance, &vmm, &project); + let target = VirtualMachine::new(self.nexus_id, &curr_sled, &instance, &vmm, &project); tasks.spawn(self.check_instance(opctx, &client, target)); } } From c909fc79ecb29aad8f036ac48791faf26671749d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 10 May 2024 11:13:45 -0700 Subject: [PATCH 67/69] emit `state: "unknown"` for incomplete checks --- nexus/src/app/background/instance_watcher.rs | 124 +++++++++---------- 1 file changed, 59 insertions(+), 65 deletions(-) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 2220672c78..6043bc4aa9 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -77,7 +77,8 @@ impl InstanceWatcher { async move { slog::trace!(opctx.log, "checking on instance..."); let rsp = client.instance_get_state(&target.instance_id).await; - let mut check = Check { target, outcome: None, result: Ok(()) }; + let mut check = + Check { target, outcome: Default::default(), result: Ok(()) }; let state = match rsp { Ok(rsp) => rsp.into_inner(), Err(ClientError::ErrorResponse(rsp)) => { @@ -89,9 +90,8 @@ impl InstanceWatcher { slog::info!(opctx.log, "instance is wayyyyy gone"); // TODO(eliza): eventually, we should attempt to put the // instance in the `Failed` state here. - check.outcome = Some(CheckOutcome::Failure( - Failure::NoSuchInstance, - )); + check.outcome = + CheckOutcome::Failure(Failure::NoSuchInstance); return check; } if status.is_client_error() { @@ -104,9 +104,9 @@ impl InstanceWatcher { "status" => ?status, "error" => ?rsp.into_inner()); } - check.outcome = Some(CheckOutcome::Failure( + check.outcome = CheckOutcome::Failure( Failure::SledAgentResponse(status.as_u16()), - )); + ); return check; } Err(ClientError::CommunicationError(e)) => { @@ -121,9 +121,8 @@ impl InstanceWatcher { // unreachable. We should start doing that here at some // point. 
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e); - check.outcome = Some(CheckOutcome::Failure( - Failure::SledAgentUnreachable, - )); + check.outcome = + CheckOutcome::Failure(Failure::SledAgentUnreachable); return check; } Err(e) => { @@ -140,7 +139,7 @@ impl InstanceWatcher { let new_runtime_state: SledInstanceState = state.into(); check.outcome = - Some(CheckOutcome::Success(new_runtime_state.vmm_state.state)); + CheckOutcome::Success(new_runtime_state.vmm_state.state); slog::debug!( opctx.log, "updating instance state"; @@ -244,7 +243,7 @@ struct Check { /// /// If we were not able to perform the request at all due to an error on /// *our* end, this will be `None`. - outcome: Option, + outcome: CheckOutcome, /// `Some` if the instance check was unsuccessful. /// @@ -259,24 +258,31 @@ struct Check { result: Result<(), Incomplete>, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)] enum CheckOutcome { Success(InstanceState), Failure(Failure), + #[default] + Unknown, } -impl CheckOutcome { +impl Check { fn state_str(&self) -> Cow<'static, str> { - match self { - Self::Success(state) => state.label().into(), - Self::Failure(_) => InstanceState::Failed.label().into(), + match self.outcome { + CheckOutcome::Success(state) => state.label().into(), + CheckOutcome::Failure(_) => InstanceState::Failed.label().into(), + CheckOutcome::Unknown => "unknown".into(), } } fn reason_str(&self) -> Cow<'static, str> { - match self { - Self::Success(_) => "success".into(), - Self::Failure(reason) => reason.as_str(), + match self.outcome { + CheckOutcome::Success(_) => "success".into(), + CheckOutcome::Failure(reason) => reason.as_str(), + CheckOutcome::Unknown => match self.result { + Ok(()) => "unknown".into(), // this shouldn't happen, but there's no way to prevent it from happening, + Err(e) => e.as_str(), + }, } } } @@ -416,32 +422,28 @@ impl BackgroundTask for InstanceWatcher { let mut check_errors: BTreeMap = BTreeMap::new(); while let Some(result) = tasks.join_next().await { total += 1; - let Check { target, outcome, result } = result.expect( + let check = result.expect( "a `JoinError` is returned if a spawned task \ panics, or if the task is aborted. we never abort \ tasks on this `JoinSet`, and nexus is compiled with \ `panic=\"abort\"`, so neither of these cases should \ ever occur", ); - let mut metrics = self.metrics.lock().unwrap(); - let metric = metrics.instance(target); - if let Some(outcome) = outcome { - metric.completed(outcome); - match outcome { - CheckOutcome::Success(state) => { - *instance_states - .entry(state.to_string()) - .or_default() += 1; - } - CheckOutcome::Failure(reason) => { - *check_failures.entry(reason.as_str().into_owned()).or_default() += 1; - } + match check.outcome { + CheckOutcome::Success(state) => { + *instance_states + .entry(state.to_string()) + .or_default() += 1; + } + CheckOutcome::Failure(reason) => { + *check_failures.entry(reason.as_str().into_owned()).or_default() += 1; } + CheckOutcome::Unknown => {} } - if let Err(reason) = result { - metric.check_error(reason); + if let Err(ref reason) = check.result { *check_errors.entry(reason.as_str().into_owned()).or_default() += 1; } + self.metrics.lock().unwrap().record_check(check); } // All requests completed! 
Prune any old instance metrics for @@ -489,18 +491,34 @@ mod metrics { pub(super) struct Producer(pub(super) Arc>); #[derive(Debug, Default)] - pub(super) struct Instance { + struct Instance { checks: BTreeMap, check_errors: BTreeMap, touched: bool, } impl Metrics { - pub(crate) fn instance( - &mut self, - instance: VirtualMachine, - ) -> &mut Instance { - self.instances.entry(instance).or_default() + pub(crate) fn record_check(&mut self, check: super::Check) { + let instance = self.instances.entry(check.target).or_default(); + instance + .checks + .entry(check.outcome) + .or_insert_with(|| Check { + state: check.state_str(), + reason: check.reason_str(), + datum: Cumulative::default(), + }) + .datum += 1; + if let Err(error) = check.result { + instance + .check_errors + .entry(error) + .or_insert_with(|| IncompleteCheck { + reason: error.as_str(), + datum: Cumulative::default(), + }) + .datum += 1; + } } pub(super) fn prune(&mut self) -> usize { @@ -530,30 +548,6 @@ mod metrics { } impl Instance { - pub(super) fn completed(&mut self, outcome: CheckOutcome) { - self.checks - .entry(outcome) - .or_insert_with(|| Check { - state: outcome.state_str(), - reason: outcome.reason_str(), - datum: Cumulative::default(), - }) - .datum += 1; - - self.touched = true; - } - - pub(super) fn check_error(&mut self, reason: Incomplete) { - self.check_errors - .entry(reason) - .or_insert_with(|| IncompleteCheck { - reason: reason.as_str(), - datum: Cumulative::default(), - }) - .datum += 1; - self.touched = true; - } - fn len(&self) -> usize { self.checks.len() + self.check_errors.len() } From d6fe22aca530643c174dac25bba24d897458e39f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 10 May 2024 11:43:00 -0700 Subject: [PATCH 68/69] may as well throw in the rack ID too... --- nexus/src/app/background/init.rs | 1 + nexus/src/app/background/instance_watcher.rs | 26 +++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index e44672036a..9d9a65c23b 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -357,6 +357,7 @@ impl BackgroundTasks { datastore.clone(), resolver.clone(), producer_registry, + instance_watcher::WatcherIdentity { nexus_id, rack_id }, ); driver.register( "instance_watcher".to_string(), diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index 6043bc4aa9..cac069454b 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -34,7 +34,7 @@ pub(crate) struct InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, metrics: Arc>, - nexus_id: Uuid, + id: WatcherIdentity, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -47,13 +47,13 @@ impl InstanceWatcher { datastore: Arc, resolver: internal_dns::resolver::Resolver, producer_registry: &ProducerRegistry, - nexus_id: Uuid, + id: WatcherIdentity, ) -> Self { let metrics = Arc::new(Mutex::new(metrics::Metrics::default())); producer_registry .register_producer(metrics::Producer(metrics.clone())) .unwrap(); - Self { datastore, resolver, metrics, nexus_id } + Self { datastore, resolver, metrics, id } } fn check_instance( @@ -189,10 +189,23 @@ impl InstanceWatcher { } } +/// The identity of the process performing the health check, for distinguishing +/// health check metrics emitted by different Nexus instances. 
+/// +/// This is a struct just to ensure that the two UUIDs are named arguments +/// (rather than positional arguments) and can't be swapped accidentally. +#[derive(Copy, Clone)] +pub struct WatcherIdentity { + pub nexus_id: Uuid, + pub rack_id: Uuid, +} + #[derive( Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, oximeter::Target, )] struct VirtualMachine { + /// The rack ID of the Nexus process which performed the health check. + rack_id: Uuid, /// The ID of the Nexus process which performed the health check. nexus_id: Uuid, /// The instance's ID. @@ -213,7 +226,7 @@ struct VirtualMachine { impl VirtualMachine { fn new( - nexus_id: Uuid, + WatcherIdentity { rack_id, nexus_id }: WatcherIdentity, sled: &Sled, instance: &Instance, vmm: &Vmm, @@ -221,6 +234,7 @@ impl VirtualMachine { ) -> Self { let addr = sled.address(); Self { + rack_id, nexus_id, instance_id: instance.id(), silo_id: project.silo_id, @@ -397,7 +411,7 @@ impl BackgroundTask for InstanceWatcher { let mut batch = batch.into_iter(); if let Some((mut curr_sled, instance, vmm, project)) = batch.next() { let mut client = mk_client(&curr_sled); - let target = VirtualMachine::new(self.nexus_id, &curr_sled, &instance, &vmm, &project); + let target = VirtualMachine::new(self.id, &curr_sled, &instance, &vmm, &project); tasks.spawn(self.check_instance(opctx, &client, target)); for (sled, instance, vmm, project) in batch { @@ -407,7 +421,7 @@ impl BackgroundTask for InstanceWatcher { curr_sled = sled; } - let target = VirtualMachine::new(self.nexus_id, &curr_sled, &instance, &vmm, &project); + let target = VirtualMachine::new(self.id, &curr_sled, &instance, &vmm, &project); tasks.spawn(self.check_instance(opctx, &client, target)); } } From 6e9155ef51a322f04303dbfaa8479b0e99d21713 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 10 May 2024 13:16:54 -0700 Subject: [PATCH 69/69] whoops i forgot to touch the instances --- nexus/src/app/background/instance_watcher.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/src/app/background/instance_watcher.rs b/nexus/src/app/background/instance_watcher.rs index cac069454b..4cdca3c4b7 100644 --- a/nexus/src/app/background/instance_watcher.rs +++ b/nexus/src/app/background/instance_watcher.rs @@ -533,6 +533,7 @@ mod metrics { }) .datum += 1; } + instance.touched = true; } pub(super) fn prune(&mut self) -> usize {
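With `nexus_id` (patch 66) and `rack_id` (patch 68) folded into the `virtual_machine:check` target, check metrics emitted by different Nexus replicas can be told apart when the timeseries is queried. As a rough sketch only, using the same OxQL query language the integration test above drives through `timeseries_query`, and with an assumed filter syntax and a placeholder UUID rather than anything these patches define, an operator could narrow the data to failed checks reported by a single Nexus like so:

    get virtual_machine:check
        | filter nexus_id == "00000000-0000-0000-0000-000000000001" && state == "failed"

Because patch 67 gives every recorded check a concrete `state` label ("failed" for failure outcomes, "unknown" when the check produced no outcome at all), a filter on `state` now cleanly separates instances that failed a health check from those whose check never completed.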