[nexus] add RPW for proactively pulling instance state from sled-agents #5611

Merged
merged 73 commits from eliza/instance-watch-rpw into main on May 10, 2024
Changes from 63 commits

Commits (73)
b4b7d7a
initial plumbing for looking up VMMs by sled agent
hawkw Apr 12, 2024
d011d6a
stub sled agent plumbing
hawkw Apr 12, 2024
8e5f758
more wip
hawkw Apr 17, 2024
c4f9654
wip
hawkw Apr 22, 2024
a36b877
wip
hawkw Apr 23, 2024
7422676
wip
hawkw Apr 24, 2024
464c4c0
tear apart the entire `instance_network` module
hawkw Apr 24, 2024
4823392
plumbing
hawkw Apr 24, 2024
ff64c08
oh that's how you're supposed to make child opctxs
hawkw Apr 24, 2024
78fd6fa
add config
hawkw Apr 24, 2024
bedf76e
mv instance_state.rs instance_watcher.rs
hawkw Apr 24, 2024
79d2af2
mv instance_state.rs instance_watcher.rs
hawkw Apr 24, 2024
1c6372b
sketch out retry stuff
hawkw Apr 24, 2024
dfde7c3
rm unused import
hawkw Apr 24, 2024
de3303b
use the `sled` table instead
hawkw Apr 25, 2024
4c68fce
whoopsie bad rebase
hawkw Apr 25, 2024
aa354e7
actually use sleds table correctly
hawkw Apr 25, 2024
eacb233
wip
hawkw Apr 25, 2024
0df165d
new thing
hawkw Apr 25, 2024
06950fe
remove unused import
hawkw Apr 25, 2024
c49780d
rm max retries
hawkw Apr 29, 2024
5ed818f
fix config tests
hawkw Apr 29, 2024
fa5fd02
wip metrics
hawkw Apr 30, 2024
9263bcb
more metric plumbing
hawkw Apr 30, 2024
ddb5792
redo metrics
hawkw Apr 30, 2024
5d18a69
keep lock alive
hawkw Apr 30, 2024
bbc4fd0
whoops
hawkw Apr 30, 2024
dcc9fe9
docs etc
hawkw Apr 30, 2024
30fd552
omdb test update
hawkw Apr 30, 2024
4407df9
additional renamening
hawkw May 1, 2024
3efa0ad
add nicer omdb support
hawkw May 1, 2024
d840e30
Merge branch 'main' into eliza/instance-watch-rpw
hawkw May 1, 2024
224c66a
redo metric structure per @bnaecker's feedback
hawkw May 2, 2024
03de700
update omdb
hawkw May 2, 2024
3f85ebe
most of @bnaecker and @smklein's style suggestions
hawkw May 2, 2024
610acea
back out unneeded changes
hawkw May 2, 2024
6ff3d97
Merge branch 'main' into eliza/instance-watch-rpw
hawkw May 2, 2024
0e6ed21
include project and silo IDs; metrics tweaks
hawkw May 3, 2024
fc7899d
minor style embetterments
hawkw May 3, 2024
49976d2
use fleet authz in instance/vmm query
hawkw May 6, 2024
70b8485
docs
hawkw May 6, 2024
6f0216d
separate instance state and reason in metrics
hawkw May 6, 2024
c1661e9
rm unused code
hawkw May 6, 2024
6caf120
Merge branch 'main' into eliza/instance-watch-rpw
hawkw May 6, 2024
70f2b3f
make instance-watcher query errors more obvious
hawkw May 6, 2024
59e1bc3
add indices so that the vmms-by-sled query works
hawkw May 6, 2024
44ef386
serde keys must be strings :/
hawkw May 6, 2024
23eb079
actually populate sled-agent-sim methods
hawkw May 6, 2024
6b7832a
actually populate sled-agent-sim methods
hawkw May 6, 2024
a207c62
fix wrong counts in omdb output
hawkw May 6, 2024
1aa3cc1
prettify OMDB output
hawkw May 6, 2024
e831a05
put back eaten instance_watcher period configs
hawkw May 8, 2024
10bc937
oh, so *that's* how you add to the schema
hawkw May 8, 2024
065b149
add a test for instance watcher timeseries
hawkw May 8, 2024
75dac39
whoops i meant to fix that one too
hawkw May 8, 2024
8392b12
unbreak expected metric fields
hawkw May 8, 2024
c3694fc
nicer code for finding instance timeserieses
hawkw May 8, 2024
9f83322
Merge branch 'main' into eliza/instance-watch-rpw
hawkw May 8, 2024
40d553b
WELL IF IT ISN'T THE CONSEQUENCES OF MY OWN PRS
hawkw May 8, 2024
f355b75
fix schema diff
hawkw May 8, 2024
d4fdfff
also update omdb tests
hawkw May 8, 2024
96d7b71
allocate way fewer strings with `Cow`
hawkw May 8, 2024
bd37330
fixup stray comment
hawkw May 8, 2024
23c01d3
blegh
hawkw May 8, 2024
c96c7d7
clean up target construction slightly
hawkw May 9, 2024
d703885
prune instances only *after* checks have completed
hawkw May 9, 2024
d1d3fe2
clone slightly less stuff
hawkw May 9, 2024
8be9a8d
less flaky way of activating instance watcher in test
hawkw May 9, 2024
406e69d
simulate more instance state transitions in test
hawkw May 9, 2024
08e01ad
add nexus UUID to check metrics
hawkw May 10, 2024
c909fc7
emit `state: "unknown"` for incomplete checks
hawkw May 10, 2024
d6fe22a
may as well throw in the rack ID too...
hawkw May 10, 2024
6e9155e
whoops i forgot to touch the instances
hawkw May 10, 2024
93 changes: 88 additions & 5 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
@@ -889,8 +889,95 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
);
}
};
} else if name == "instance_watcher" {
#[derive(Deserialize)]
struct TaskSuccess {
/// total number of instances checked
total_instances: usize,

/// number of stale instance metrics that were deleted
pruned_instances: usize,

/// instance states from completed checks.
///
/// this is a mapping of stringified instance states to the number
/// of instances in that state. these stringified states correspond
/// to the `state` field recorded by the instance watcher's
/// `virtual_machine:check` timeseries with the `healthy` field set
/// to `true`. any changes to the instance state type which cause it
/// to print differently will be counted as a distinct state.
instance_states: BTreeMap<String, usize>,

/// instance check failures.
///
/// this is a mapping of stringified instance check failure reasons
/// to the number of instances with checks that failed for that
/// reason. these stringified failure reasons correspond to the
/// `state` field recorded by the instance watcher's
/// `virtual_machine:check` timeseries with the `healthy` field set
/// to `false`. any changes to the instance state type which cause
/// it to print differently will be counted as a distinct failure
/// reason.
failed_checks: BTreeMap<String, usize>,

/// instance checks that could not be completed successfully.
///
/// this is a mapping of stringified instance check errors
/// to the number of instance checks that were not completed due to
/// that error. these stringified errors correspond to the `reason`
/// field recorded by the instance watcher's
/// `virtual_machine:incomplete_check` timeseries. any changes to
/// the check error type which cause it to print
/// differently will be counted as a distinct check error.
incomplete_checks: BTreeMap<String, usize>,
}

match serde_json::from_value::<TaskSuccess>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(TaskSuccess {
total_instances,
pruned_instances,
instance_states,
failed_checks,
incomplete_checks,
}) => {
let total_successes: usize = instance_states.values().sum();
let total_failures: usize = failed_checks.values().sum();
let total_incomplete: usize = incomplete_checks.values().sum();
println!(" total instances checked: {total_instances}",);
println!(
" checks completed: {}",
total_successes + total_failures
);
println!(" successful checks: {total_successes}",);
for (state, count) in &instance_states {
println!(" -> {count} instances {state}")
}

println!(" failed checks: {total_failures}");
for (failure, count) in &failed_checks {
println!(" -> {count} {failure}")
}
println!(
" checks that could not be completed: {total_incomplete}",
);
for (error, count) in &incomplete_checks {
println!(" -> {count} {error} errors")
}
println!(
" stale instance metrics pruned: {pruned_instances}"
);
}
};
} else if name == "service_firewall_rule_propagation" {
match serde_json::from_value::<serde_json::Value>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(serde_json::Value::Object(map)) => {
if !map.is_empty() {
eprintln!(
@@ -902,11 +989,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
Ok(val) => {
eprintln!(" unexpected return value from task: {:?}", val)
}
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
}
};
} else {
println!(
"warning: unknown background task: {:?} \
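
For reference, a minimal sketch of the kind of `details` payload this new branch deserializes. The field names come from the `TaskSuccess` struct above; the specific state names, failure reasons, error strings, and counts are illustrative assumptions, not captured task output:

```rust
// Illustrative only: a details payload shaped like the TaskSuccess struct
// above. The state/reason/error strings here are assumptions.
let details = serde_json::json!({
    "total_instances": 5,
    "pruned_instances": 1,
    "instance_states": { "running": 2, "stopped": 1 },
    "failed_checks": { "instance not found": 1 },
    "incomplete_checks": { "sled agent unreachable": 1 }
});
let parsed: TaskSuccess = serde_json::from_value(details).unwrap();
assert_eq!(parsed.total_instances, 5);
assert_eq!(parsed.instance_states["running"], 2);
```
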
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
@@ -72,6 +72,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

@@ -179,6 +183,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

@@ -273,6 +281,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

16 changes: 16 additions & 0 deletions dev-tools/omdb/tests/successes.out
@@ -249,6 +249,10 @@ task: "external_endpoints"
on each one


task: "instance_watcher"
periodically checks instance states


task: "inventory_collection"
collects hardware and software inventory data from the whole system

@@ -396,6 +400,18 @@ task: "external_endpoints"

TLS certificates: 0

task: "instance_watcher"
configured period: every 30s
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by an explicit signal
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total instances checked: 0
checks completed: 0
successful checks: 0
failed checks: 0
checks that could not be completed: 0
stale instance metrics pruned: 0

task: "inventory_collection"
configured period: every 10m
currently executing: no
15 changes: 15 additions & 0 deletions nexus-config/src/nexus_config.rs
@@ -375,6 +375,8 @@ pub struct BackgroundTaskConfig {
pub switch_port_settings_manager: SwitchPortSettingsManagerConfig,
/// configuration for region replacement task
pub region_replacement: RegionReplacementConfig,
/// configuration for instance watcher task
pub instance_watcher: InstanceWatcherConfig,
/// configuration for service VPC firewall propagation task
pub service_firewall_propagation: ServiceFirewallPropagationConfig,
}
@@ -521,6 +523,14 @@ pub struct RegionReplacementConfig {
pub period_secs: Duration,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct InstanceWatcherConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ServiceFirewallPropagationConfig {
@@ -765,6 +775,7 @@ mod test {
sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
[default_region_allocation_strategy]
type = "random"
@@ -894,6 +905,9 @@ mod test {
region_replacement: RegionReplacementConfig {
period_secs: Duration::from_secs(30),
},
instance_watcher: InstanceWatcherConfig {
period_secs: Duration::from_secs(30),
},
service_firewall_propagation:
ServiceFirewallPropagationConfig {
period_secs: Duration::from_secs(300),
@@ -964,6 +978,7 @@ mod test {
sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300
[default_region_allocation_strategy]
type = "random"
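
As a quick aside, a minimal sketch of what the `serde_as`/`DurationSeconds<u64>` annotation buys: the plain integer in the TOML becomes a `std::time::Duration`. This snippet is not part of the PR and assumes the `toml` crate's documented behavior:

```rust
use std::time::Duration;

// Hypothetical standalone check, assuming serde_with's documented behavior:
// DurationSeconds<u64> turns an integer count of seconds into a Duration.
let cfg: InstanceWatcherConfig =
    toml::from_str("period_secs = 30").expect("config should parse");
assert_eq!(cfg.period_secs, Duration::from_secs(30));
```
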
2 changes: 2 additions & 0 deletions nexus/db-model/src/schema.rs
@@ -1770,3 +1770,5 @@ allow_tables_to_appear_in_same_query!(volume, virtual_provisioning_resource);
allow_tables_to_appear_in_same_query!(ssh_key, instance_ssh_key, instance);
joinable!(instance_ssh_key -> ssh_key (ssh_key_id));
joinable!(instance_ssh_key -> instance (instance_id));

allow_tables_to_appear_in_same_query!(sled, sled_instance);
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(59, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(60, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(60, "add-lookup-vmm-by-sled-id-index"),
KnownVersion::new(59, "enforce-first-as-default"),
KnownVersion::new(58, "insert-default-allowlist"),
KnownVersion::new(57, "add-allowed-source-ips"),
6 changes: 6 additions & 0 deletions nexus/db-model/src/sled_instance.rs
@@ -41,3 +41,9 @@ impl From<SledInstance> for views::SledInstance {
}
}
}

impl SledInstance {
pub fn instance_id(&self) -> Uuid {
self.identity.id
}
}
56 changes: 56 additions & 0 deletions nexus/db-queries/src/db/datastore/instance.rs
@@ -22,18 +22,22 @@ use crate::db::model::Instance;
use crate::db::model::InstanceRuntimeState;
use crate::db::model::Name;
use crate::db::model::Project;
use crate::db::model::Sled;
use crate::db::model::Vmm;
use crate::db::pagination::paginated;
use crate::db::update_and_check::UpdateAndCheck;
use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use nexus_db_model::ApplySledFilterExt;
use nexus_db_model::Disk;
use nexus_db_model::VmmRuntimeState;
use nexus_types::deployment::SledFilter;
use omicron_common::api;
use omicron_common::api::external::http_pagination::PaginatedBy;
use omicron_common::api::external::CreateResult;
use omicron_common::api::external::DataPageParams;
use omicron_common::api::external::DeleteResult;
use omicron_common::api::external::Error;
use omicron_common::api::external::ListResultVec;
@@ -385,6 +389,58 @@ impl DataStore {
Ok((instance_updated, vmm_updated))
}

/// Lists all instances on in-service sleds with active Propolis VMM
/// processes, returning the instance along with the VMM on which it's
/// running, the sled on which the VMM is running, and the project that owns
/// the instance.
///
/// The query performed by this function is paginated by the sled's UUID.
pub async fn instance_and_vmm_list_by_sled_agent(
&self,
opctx: &OpContext,
pagparams: &DataPageParams<'_, Uuid>,
) -> ListResultVec<(Sled, Instance, Vmm, Project)> {
use crate::db::schema::{
instance::dsl as instance_dsl, project::dsl as project_dsl,
sled::dsl as sled_dsl, vmm::dsl as vmm_dsl,
};
opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
let conn = self.pool_connection_authorized(opctx).await?;

let result = paginated(sled_dsl::sled, sled_dsl::id, pagparams)
.filter(sled_dsl::time_deleted.is_null())
.sled_filter(SledFilter::InService)
.inner_join(
vmm_dsl::vmm
.on(vmm_dsl::sled_id
.eq(sled_dsl::id)
.and(vmm_dsl::time_deleted.is_null()))
.inner_join(
instance_dsl::instance
.on(instance_dsl::id
.eq(vmm_dsl::instance_id)
.and(instance_dsl::time_deleted.is_null()))
.inner_join(
project_dsl::project.on(project_dsl::id
.eq(instance_dsl::project_id)
.and(project_dsl::time_deleted.is_null())),
),
),
)
.sled_filter(SledFilter::InService)
.select((
Sled::as_select(),
Instance::as_select(),
Vmm::as_select(),
Project::as_select(),
))
.load_async::<(Sled, Instance, Vmm, Project)>(&*conn)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;

Ok(result)
}

pub async fn project_delete_instance(
&self,
opctx: &OpContext,
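
For context, a sketch of how a background task might page through this query. The `Paginator` helper, its method names, and the `Sled::id()` accessor are assumptions modeled on patterns used elsewhere in omicron, not code from this diff:

```rust
// Hypothetical caller; Paginator, current_pagparams, found_batch, and
// Sled::id() are assumed helpers. The batch size of 100 is arbitrary.
let mut paginator = Paginator::new(NonZeroU32::new(100).unwrap());
while let Some(p) = paginator.next() {
    let batch = datastore
        .instance_and_vmm_list_by_sled_agent(opctx, &p.current_pagparams())
        .await?;
    paginator = p.found_batch(&batch, &|(sled, _, _, _)| sled.id());
    for (_sled, _instance, _vmm, _project) in batch {
        // each tuple pairs an instance with the VMM running it, the sled
        // hosting that VMM, and the project that owns the instance
    }
}
```
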
2 changes: 2 additions & 0 deletions nexus/examples/config.toml
@@ -113,6 +113,8 @@ blueprints.period_secs_execute = 60
sync_service_zone_nat.period_secs = 30
switch_port_settings_manager.period_secs = 30
region_replacement.period_secs = 30
# How frequently to query the status of active instances.
instance_watcher.period_secs = 30
service_firewall_propagation.period_secs = 300

[default_region_allocation_strategy]