From 1eec1a3c077d0a3ec08cf69c58cb268a72e4ab1d Mon Sep 17 00:00:00 2001
From: Luqman Aden
Date: Wed, 19 Jul 2023 01:14:00 -0700
Subject: [PATCH] Sled Agent service initialization cleanup (#3712)

Roughly 3 changes:

1. Skip initializing any zones we already know are running (both from
   sled-agent's perspective and as reflected via `zoneadm`).
2. Log any errors `services_put` would return on the server side.
3. Bump file-based zpools to 15G.

Lack of space was causing service initialization to fail on single-machine
deployments using the `create_virtual_hardware` script. (3) should fix this.

Because of the `ENOSPC` errors, we were running into some future-cancellation
issues, which #3707 addressed. But actually determining the underlying cause
was a bit difficult, as RSS was timing out (*) on the `services_put` call and
so never got back the error. (2) should hopefully make this easier to catch in
the future, e.g.:

```
04:12:31.312Z ERRO SledAgent: failed to init services: Failed to install zone 'oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127' from '/opt/oxide/clickhouse.tar.gz': Failed to execute zoneadm command 'Install' for zone 'oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127': Failed to parse command output: exit code 1
stdout:
A ZFS file system has been created for this zone.
INFO: omicron: installing zone oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127 @ "/pool/ext/24b4dc87-ab46-49fb-a4b4-d361ae214c03/crypt/zone/oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127"...
INFO: omicron: replicating /usr tree...
INFO: omicron: replicating /lib tree...
INFO: omicron: replicating /sbin tree...
INFO: omicron: pruning SMF manifests...
INFO: omicron: pruning global-only files...
INFO: omicron: unpacking baseline archive...
INFO: omicron: unpacking image "/opt/oxide/clickhouse.tar.gz"...
stderr:
Error: No space left on device (os error 28)

    sled_id = 8fd66238-6bb3-4f08-89bf-f88fc2320d83
```

(*) Speaking of which, do we want to increase that timeout from just 60s? It's
not like any subsequent requests will work, considering they'll be blocked on
a lock held by the initial request. And in failure cases like this, the
subsequent requests will time out as well, leaving a bunch of tasks in
sled-agent all waiting on the same lock to try initializing. It might be worth
switching to the single-processing-task model that rack/sled initialization
uses (a rough sketch follows below).

Finally, (1) addresses the case where we try to initialize a zone that's
already running. That's sometimes fine, as we'll eventually realize the zone
already exists. But in the case of a zone with an OPTE port, we'll run into
errors like:

```
error_message_internal = Failed to create OPTE port for service nexus: Failure interacting with the OPTE ioctl(2) interface: command CreateXde failed: MacExists { port: "opte12", vni: Vni { inner: 100 }, mac: MacAddr { inner: A8:40:25:FF:EC:09 } }
```

This happens because the [check](https://github.com/oxidecomputer/omicron/blob/ebd3db27f6bbd08af3194cae84b0efc6b6e7248d/illumos-utils/src/zone.rs#L282C27-L282C27) for
"is the zone running" comes after we do all the work necessary to create the
zone (e.g. creating an OPTE port). (1) updates `initialize_services_locked` to
cross-reference sled-agent's and the system's views of which zones are
running, so that we don't do that unnecessary work.
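
For illustration only, here is a minimal sketch of that single-processing-task
model. It is not part of this change: `ServicesRequest`, `respond_to`, and
`spawn_services_worker` are hypothetical names, and `Error` stands in for
sled-agent's error type; only `SledAgent`, `ServiceEnsureBody`, and
`services_ensure` refer to existing items.

```
// Hypothetical sketch (not in this PR): funnel services_put requests through
// one worker task so only a single services_ensure() runs at a time and
// callers that give up early don't cancel the in-flight work.
use tokio::sync::{mpsc, oneshot};

// Assumed request envelope; the real types would come from sled-agent.
struct ServicesRequest {
    body: ServiceEnsureBody,
    respond_to: oneshot::Sender<Result<(), Error>>,
}

fn spawn_services_worker(
    sa: SledAgent,
    mut rx: mpsc::Receiver<ServicesRequest>,
) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        // Strictly one request at a time; later requests wait in the channel
        // instead of piling up on the ServiceManager lock.
        while let Some(req) = rx.recv().await {
            let result = sa.services_ensure(req.body).await;
            // The caller may have timed out and dropped its receiver; that's
            // fine, the work still ran to completion.
            let _ = req.respond_to.send(result);
        }
    })
}
```

The HTTP handler would then just send a `ServicesRequest` on the channel and
await the oneshot reply, so a client timeout never cancels the actual
initialization.
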
---------

Co-authored-by: Sean Klein
---
 sled-agent/src/http_entrypoints.rs | 14 ++++-
 sled-agent/src/services.rs         | 96 ++++++++++++++++++++----------
 sled-agent/src/sled_agent.rs       |  6 ++
 tools/create_virtual_hardware.sh   |  2 +-
 tools/destroy_virtual_hardware.sh  |  2 +-
 5 files changed, 85 insertions(+), 35 deletions(-)

diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs
index 6e2e800110..3b528b495a 100644
--- a/sled-agent/src/http_entrypoints.rs
+++ b/sled-agent/src/http_entrypoints.rs
@@ -167,8 +167,18 @@ async fn services_put(
     // times out) could result in leaving zones partially configured and the
     // in-memory state of the service manager invalid. See:
     // oxidecomputer/omicron#3098.
-    match tokio::spawn(async move { sa.services_ensure(body_args).await }).await
-    {
+    let handler = async move {
+        match sa.services_ensure(body_args).await {
+            Ok(()) => Ok(()),
+            Err(e) => {
+                // Log the error here to make things clear even if the client
+                // has already disconnected.
+                error!(sa.logger(), "failed to initialize services: {e}");
+                Err(e)
+            }
+        }
+    };
+    match tokio::spawn(handler).await {
         Ok(result) => result.map_err(|e| Error::from(e))?,

         Err(e) => {
diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs
index 174b71701b..f7583fc6dd 100644
--- a/sled-agent/src/services.rs
+++ b/sled-agent/src/services.rs
@@ -84,6 +84,7 @@ use sled_hardware::underlay::BOOTSTRAP_PREFIX;
 use sled_hardware::Baseboard;
 use sled_hardware::SledMode;
 use slog::Logger;
+use std::collections::BTreeMap;
 use std::collections::BTreeSet;
 use std::collections::HashSet;
 use std::io::Cursor;
@@ -362,7 +363,7 @@ pub struct ServiceManagerInner {
     switch_zone_maghemite_links: Vec<PhysicalLink>,
     sidecar_revision: SidecarRevision,
     // Zones representing running services
-    zones: Mutex<Vec<RunningZone>>,
+    zones: Mutex<BTreeMap<String, RunningZone>>,
     underlay_vnic_allocator: VnicAllocator<Etherstub>,
     underlay_vnic: EtherstubVnic,
     bootstrap_vnic_allocator: VnicAllocator<Etherstub>,
@@ -431,7 +432,7 @@ impl ServiceManager {
                 time_synced: AtomicBool::new(false),
                 sidecar_revision,
                 switch_zone_maghemite_links,
-                zones: Mutex::new(vec![]),
+                zones: Mutex::new(BTreeMap::new()),
                 underlay_vnic_allocator: VnicAllocator::new(
                     "Service",
                     underlay_etherstub,
@@ -1908,7 +1909,7 @@ impl ServiceManager {
     // Populates `existing_zones` according to the requests in `services`.
     async fn initialize_services_locked(
         &self,
-        existing_zones: &mut Vec<RunningZone>,
+        existing_zones: &mut BTreeMap<String, RunningZone>,
         requests: &Vec<ZoneRequest>,
     ) -> Result<(), Error> {
         if let Some(name) = requests
@@ -1923,16 +1924,17 @@ impl ServiceManager {
             });
         }

-        // We initialize all the zones we can, but only return the first error.
+        // We initialize all the zones we can, but only return one error, if
+        // any.
         let local_existing_zones = Arc::new(Mutex::new(existing_zones));
-        let first_err = Arc::new(Mutex::new(None));
+        let last_err = Arc::new(Mutex::new(None));
         stream::iter(requests)
             // WARNING: Do not use "try_for_each_concurrent" here -- if you do,
             // it's possible that the future will cancel other ongoing requests
             // to "initialize_zone".
             .for_each_concurrent(None, |request| {
                 let local_existing_zones = local_existing_zones.clone();
-                let first_err = first_err.clone();
+                let last_err = last_err.clone();
                 async move {
                     match self
                         .initialize_zone(
@@ -1943,20 +1945,20 @@ impl ServiceManager {
                         .await
                     {
                         Ok(running_zone) => {
-                            local_existing_zones
-                                .lock()
-                                .await
-                                .push(running_zone);
+                            local_existing_zones.lock().await.insert(
+                                running_zone.name().to_string(),
+                                running_zone,
+                            );
                         }
                         Err(err) => {
-                            *first_err.lock().await = Some(err);
+                            *last_err.lock().await = Some(err);
                         }
                     }
                 }
             })
             .await;

-        if let Some(err) = Arc::into_inner(first_err)
+        if let Some(err) = Arc::into_inner(last_err)
             .expect("Should have last reference")
             .into_inner()
         {
@@ -2245,9 +2247,7 @@ impl ServiceManager {
                 .map_err(Error::from);
             }
         }
-        if let Some(zone) =
-            self.inner.zones.lock().await.iter().find(|z| z.name() == name)
-        {
+        if let Some(zone) = self.inner.zones.lock().await.get(name) {
             return self
                 .create_zone_bundle_impl(zone)
                 .await
@@ -2424,9 +2424,14 @@
         {
             zone_names.push(String::from(zone.name()))
         }
-        for zone in self.inner.zones.lock().await.iter() {
-            zone_names.push(String::from(zone.name()));
-        }
+        zone_names.extend(
+            self.inner
+                .zones
+                .lock()
+                .await
+                .values()
+                .map(|zone| zone.name().to_string()),
+        );
         zone_names.sort();
         Ok(zone_names)
     }
@@ -2480,7 +2485,7 @@ impl ServiceManager {
     // re-instantiated on boot.
     async fn ensure_all_services(
         &self,
-        existing_zones: &mut MutexGuard<'_, Vec<RunningZone>>,
+        existing_zones: &mut MutexGuard<'_, BTreeMap<String, RunningZone>>,
         old_request: &AllZoneRequests,
         request: ServiceEnsureBody,
     ) -> Result<AllZoneRequests, Error> {
@@ -2502,11 +2507,7 @@ impl ServiceManager {
         // Destroy zones that should not be running
         for zone in zones_to_be_removed {
            let expected_zone_name = zone.zone_name();
-            if let Some(idx) = existing_zones
-                .iter()
-                .position(|z| z.name() == expected_zone_name)
-            {
-                let mut zone = existing_zones.remove(idx);
+            if let Some(mut zone) = existing_zones.remove(&expected_zone_name) {
                 if let Err(e) = zone.stop().await {
                     error!(log, "Failed to stop zone {}: {e}", zone.name());
                 }
@@ -2520,6 +2521,36 @@ impl ServiceManager {
         let all_u2_roots =
             self.inner.storage.all_u2_mountpoints(ZONE_DATASET).await;
         for zone in zones_to_be_added {
+            // Check if we think the zone should already be running
+            let name = zone.zone_name();
+            if existing_zones.contains_key(&name) {
+                // Make sure the zone actually exists in the right state too
+                match Zones::find(&name).await {
+                    Ok(Some(zone)) if zone.state() == zone::State::Running => {
+                        info!(log, "skipping running zone"; "zone" => &name);
+                        continue;
+                    }
+                    _ => {
+                        // Mismatch between SA's view and reality, let's try to
+                        // clean up any remnants and try to initialize it again
+                        warn!(
+                            log,
+                            "expected to find existing zone in running state";
+                            "zone" => &name,
+                        );
+                        if let Err(e) =
+                            existing_zones.remove(&name).unwrap().stop().await
+                        {
+                            error!(
+                                log,
+                                "Failed to stop zone";
+                                "zone" => &name,
+                                "error" => %e,
+                            );
+                        }
+                    }
+                }
+            }
             // For each new zone request, we pick an arbitrary U.2 to store
             // the zone filesystem. Note: This isn't known to Nexus right now,
             // so it's a local-to-sled decision.
@@ -2552,7 +2583,7 @@ impl ServiceManager {
     pub async fn cockroachdb_initialize(&self) -> Result<(), Error> {
         let log = &self.inner.log;
         let dataset_zones = self.inner.zones.lock().await;
-        for zone in dataset_zones.iter() {
+        for zone in dataset_zones.values() {
             // TODO: We could probably store the ZoneKind in the running zone to
             // make this "comparison to existing zones by name" mechanism a bit
             // safer.
@@ -2608,7 +2639,10 @@ impl ServiceManager {
         Ok(())
     }

-    pub fn boottime_rewrite(&self, zones: &Vec<RunningZone>) {
+    pub fn boottime_rewrite<'a>(
+        &self,
+        zones: impl Iterator<Item = &'a RunningZone>,
+    ) {
         if self
             .inner
             .time_synced
@@ -2626,7 +2660,6 @@
         info!(self.inner.log, "Setting boot time to {:?}", now);

         let files: Vec<Utf8PathBuf> = zones
-            .iter()
             .map(|z| z.root())
             .chain(iter::once(Utf8PathBuf::from("/")))
             .flat_map(|r| [r.join("var/adm/utmpx"), r.join("var/adm/wtmpx")])
@@ -2655,7 +2688,7 @@ impl ServiceManager {

         if let Some(true) = self.inner.skip_timesync {
             info!(self.inner.log, "Configured to skip timesync checks");
-            self.boottime_rewrite(&existing_zones);
+            self.boottime_rewrite(existing_zones.values());
             return Ok(TimeSync { sync: true, skew: 0.00, correction: 0.00 });
         };

@@ -2664,8 +2697,9 @@ impl ServiceManager {

         let ntp_zone = existing_zones
             .iter()
-            .find(|z| z.name().starts_with(&ntp_zone_name))
-            .ok_or_else(|| Error::NtpZoneNotReady)?;
+            .find(|(name, _)| name.starts_with(&ntp_zone_name))
+            .ok_or_else(|| Error::NtpZoneNotReady)?
+            .1;

         // XXXNTP - This could be replaced with a direct connection to the
         // daemon using a patched version of the chrony_candm crate to allow
@@ -2687,7 +2721,7 @@ impl ServiceManager {
             && correction.abs() <= 0.05;

         if sync {
-            self.boottime_rewrite(&existing_zones);
+            self.boottime_rewrite(existing_zones.values());
         }

         Ok(TimeSync { sync, skew, correction })
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 7ee5402f4c..5b38db58a7 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -210,6 +210,7 @@ impl SledAgentInner {
 #[derive(Clone)]
 pub struct SledAgent {
     inner: Arc<SledAgentInner>,
+    log: Logger,
 }

 impl SledAgent {
@@ -351,6 +352,7 @@ impl SledAgent {
                 // Also, we could maybe de-dup some of the backoff code in the request queue?
                 nexus_request_queue: NexusRequestQueue::new(),
             }),
+            log: log.clone(),
         };

         // We immediately add a notification to the request queue about our
@@ -475,6 +477,10 @@ impl SledAgent {
         self.inner.id
     }

+    pub fn logger(&self) -> &Logger {
+        &self.log
+    }
+
     // Sends a request to Nexus informing it that the current sled exists.
     fn notify_nexus_about_self(&self, log: &Logger) {
         let sled_id = self.inner.id;
diff --git a/tools/create_virtual_hardware.sh b/tools/create_virtual_hardware.sh
index ff2904c1d4..40356c2397 100755
--- a/tools/create_virtual_hardware.sh
+++ b/tools/create_virtual_hardware.sh
@@ -47,7 +47,7 @@ function ensure_zpools {
         echo "Zpool: [$ZPOOL]"
         VDEV_PATH="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev"
         if ! [[ -f "$VDEV_PATH" ]]; then
-            dd if=/dev/zero of="$VDEV_PATH" bs=1 count=0 seek=10G
+            dd if=/dev/zero of="$VDEV_PATH" bs=1 count=0 seek=15G
         fi
         success "ZFS vdev $VDEV_PATH exists"
         if [[ -z "$(zpool list -o name | grep $ZPOOL)" ]]; then
diff --git a/tools/destroy_virtual_hardware.sh b/tools/destroy_virtual_hardware.sh
index a5b0b78ac4..250eb542f4 100755
--- a/tools/destroy_virtual_hardware.sh
+++ b/tools/destroy_virtual_hardware.sh
@@ -108,7 +108,7 @@ function try_destroy_zpools {
     for ZPOOL in "${ZPOOLS[@]}"; do
         VDEV_FILE="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev"
         zfs destroy -r "$ZPOOL" && \
-            zfs unmount "$ZPOOL" && \
+            (zfs unmount "$ZPOOL" || true) && \
             zpool destroy "$ZPOOL" && \
             rm -f "$VDEV_FILE" || \
             warn "Failed to remove ZFS pool and vdev: $ZPOOL"