From 1eec1a3c077d0a3ec08cf69c58cb268a72e4ab1d Mon Sep 17 00:00:00 2001
From: Luqman Aden
Date: Wed, 19 Jul 2023 01:14:00 -0700
Subject: [PATCH] Sled Agent service initialization cleanup (#3712)

Roughly 3 changes:

1. Skip initializing any zones we already know are running (both from
   sled-agent's perspective and as reflected via `zoneadm`).
2. Log any errors `services_put` would return on the server side.
3. Bump file-based zpools to 15G.

Lack of space was causing service initialization to fail on single-machine
deployments using the `create_virtual_hardware` script. (3) should fix this.

Because of the `ENOSPC` errors, we were running into some future-cancellation
issues, which #3707 addressed. But actually determining the underlying cause
was a bit difficult, as RSS was timing out (*) on the `services_put` call and
so never got back the error. (2) should hopefully make this easier to catch in
the future, e.g.:

```
04:12:31.312Z ERRO SledAgent: failed to init services: Failed to install zone 'oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127' from '/opt/oxide/clickhouse.tar.gz': Failed to execute zoneadm command 'Install' for zone 'oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127': Failed to parse command output: exit code 1
stdout:
A ZFS file system has been created for this zone.
INFO: omicron: installing zone oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127 @ "/pool/ext/24b4dc87-ab46-49fb-a4b4-d361ae214c03/crypt/zone/oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127"...
INFO: omicron: replicating /usr tree...
INFO: omicron: replicating /lib tree...
INFO: omicron: replicating /sbin tree...
INFO: omicron: pruning SMF manifests...
INFO: omicron: pruning global-only files...
INFO: omicron: unpacking baseline archive...
INFO: omicron: unpacking image "/opt/oxide/clickhouse.tar.gz"...
stderr:
Error: No space left on device (os error 28)

    sled_id = 8fd66238-6bb3-4f08-89bf-f88fc2320d83
```

(*) Speaking of which, do we want to increase that timeout from just 60s? It's
not like any subsequent requests will work, considering they'll be blocked on
a lock held by the initial request. And in failure cases like this, the
subsequent requests will time out as well, leaving a bunch of tasks in
sled-agent all waiting on the same lock to try initializing. It might be worth
switching to the single-processing-task model that rack/sled initialization
uses (a rough sketch follows below).

Finally, (1) addresses the case where we try to initialize a zone that's
already running. That's sometimes fine, as we'll eventually realize the zone
already exists. But in the case of a zone with an OPTE port, we'll run into
errors like:

```
error_message_internal = Failed to create OPTE port for service nexus: Failure interacting with the OPTE ioctl(2) interface: command CreateXde failed: MacExists { port: "opte12", vni: Vni { inner: 100 }, mac: MacAddr { inner: A8:40:25:FF:EC:09 } }
```

This happens because the [check](https://github.com/oxidecomputer/omicron/blob/ebd3db27f6bbd08af3194cae84b0efc6b6e7248d/illumos-utils/src/zone.rs#L282C27-L282C27) for
"is the zone running" comes after we do all the work necessary to create the
zone (e.g. creating an OPTE port). (1) updates `initialize_services_locked` to
cross-reference sled-agent's and the system's views of which zones are
running, so that we don't do that unnecessary work.
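
For illustration only, here is a minimal sketch of that single-processing-task
model. It is not part of this change: `ServicesRequest`, `respond_to`, and
`spawn_services_worker` are hypothetical names, and `Error` stands in for
sled-agent's error type; only `SledAgent`, `ServiceEnsureBody`, and
`services_ensure` refer to existing items.

```
// Hypothetical sketch (not in this PR): funnel services_put requests through
// one worker task so only a single services_ensure() runs at a time and
// callers that give up early don't cancel the in-flight work.
use tokio::sync::{mpsc, oneshot};

// Assumed request envelope; the real types would come from sled-agent.
struct ServicesRequest {
    body: ServiceEnsureBody,
    respond_to: oneshot::Sender<Result<(), Error>>,
}

fn spawn_services_worker(
    sa: SledAgent,
    mut rx: mpsc::Receiver<ServicesRequest>,
) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        // Strictly one request at a time; later requests wait in the channel
        // instead of piling up on the ServiceManager lock.
        while let Some(req) = rx.recv().await {
            let result = sa.services_ensure(req.body).await;
            // The caller may have timed out and dropped its receiver; that's
            // fine, the work still ran to completion.
            let _ = req.respond_to.send(result);
        }
    })
}
```

The HTTP handler would then just send a `ServicesRequest` on the channel and
await the oneshot reply, so a client timeout never cancels the actual
initialization.
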
---------

Co-authored-by: Sean Klein
---
 sled-agent/src/http_entrypoints.rs | 14 ++++-
 sled-agent/src/services.rs         | 96 ++++++++++++++++++++----------
 sled-agent/src/sled_agent.rs       |  6 ++
 tools/create_virtual_hardware.sh   |  2 +-
 tools/destroy_virtual_hardware.sh  |  2 +-
 5 files changed, 85 insertions(+), 35 deletions(-)

diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs
index 6e2e800110..3b528b495a 100644
--- a/sled-agent/src/http_entrypoints.rs
+++ b/sled-agent/src/http_entrypoints.rs
@@ -167,8 +167,18 @@ async fn services_put(
     // times out) could result in leaving zones partially configured and the
     // in-memory state of the service manager invalid. See:
     // oxidecomputer/omicron#3098.
-    match tokio::spawn(async move { sa.services_ensure(body_args).await }).await
-    {
+    let handler = async move {
+        match sa.services_ensure(body_args).await {
+            Ok(()) => Ok(()),
+            Err(e) => {
+                // Log the error here to make things clear even if the client
+                // has already disconnected.
+                error!(sa.logger(), "failed to initialize services: {e}");
+                Err(e)
+            }
+        }
+    };
+    match tokio::spawn(handler).await {
         Ok(result) => result.map_err(|e| Error::from(e))?,

         Err(e) => {
diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs
index 174b71701b..f7583fc6dd 100644
--- a/sled-agent/src/services.rs
+++ b/sled-agent/src/services.rs
@@ -84,6 +84,7 @@ use sled_hardware::underlay::BOOTSTRAP_PREFIX;
 use sled_hardware::Baseboard;
 use sled_hardware::SledMode;
 use slog::Logger;
+use std::collections::BTreeMap;
 use std::collections::BTreeSet;
 use std::collections::HashSet;
 use std::io::Cursor;
@@ -362,7 +363,7 @@ pub struct ServiceManagerInner {
     switch_zone_maghemite_links: Vec<PhysicalLink>,
     sidecar_revision: SidecarRevision,
     // Zones representing running services
-    zones: Mutex<Vec<RunningZone>>,
+    zones: Mutex<BTreeMap<String, RunningZone>>,
     underlay_vnic_allocator: VnicAllocator<Etherstub>,
     underlay_vnic: EtherstubVnic,
     bootstrap_vnic_allocator: VnicAllocator<Etherstub>,
@@ -431,7 +432,7 @@ impl ServiceManager {
                 time_synced: AtomicBool::new(false),
                 sidecar_revision,
                 switch_zone_maghemite_links,
-                zones: Mutex::new(vec![]),
+                zones: Mutex::new(BTreeMap::new()),
                 underlay_vnic_allocator: VnicAllocator::new(
                     "Service",
                     underlay_etherstub,
@@ -1908,7 +1909,7 @@ impl ServiceManager {
     // Populates `existing_zones` according to the requests in `services`.
     async fn initialize_services_locked(
         &self,
-        existing_zones: &mut Vec<RunningZone>,
+        existing_zones: &mut BTreeMap<String, RunningZone>,
         requests: &Vec<ZoneRequest>,
     ) -> Result<(), Error> {
         if let Some(name) = requests
@@ -1923,16 +1924,17 @@ impl ServiceManager {
             });
         }

-        // We initialize all the zones we can, but only return the first error.
+        // We initialize all the zones we can, but only return one error, if
+        // any.
         let local_existing_zones = Arc::new(Mutex::new(existing_zones));
-        let first_err = Arc::new(Mutex::new(None));
+        let last_err = Arc::new(Mutex::new(None));
         stream::iter(requests)
             // WARNING: Do not use "try_for_each_concurrent" here -- if you do,
             // it's possible that the future will cancel other ongoing requests
             // to "initialize_zone".
             .for_each_concurrent(None, |request| {
                 let local_existing_zones = local_existing_zones.clone();
-                let first_err = first_err.clone();
+                let last_err = last_err.clone();
                 async move {
                     match self
                         .initialize_zone(
@@ -1943,20 +1945,20 @@ impl ServiceManager {
                         .await
                     {
                         Ok(running_zone) => {
-                            local_existing_zones
-                                .lock()
-                                .await
-                                .push(running_zone);
+                            local_existing_zones.lock().await.insert(
+                                running_zone.name().to_string(),
+                                running_zone,
+                            );
                         }
                         Err(err) => {
-                            *first_err.lock().await = Some(err);
+                            *last_err.lock().await = Some(err);
                         }
                     }
                 }
             })
             .await;

-        if let Some(err) = Arc::into_inner(first_err)
+        if let Some(err) = Arc::into_inner(last_err)
             .expect("Should have last reference")
             .into_inner()
         {
@@ -2245,9 +2247,7 @@ impl ServiceManager {
                 .map_err(Error::from);
             }
         }
-        if let Some(zone) =
-            self.inner.zones.lock().await.iter().find(|z| z.name() == name)
-        {
+        if let Some(zone) = self.inner.zones.lock().await.get(name) {
             return self
                 .create_zone_bundle_impl(zone)
                 .await
@@ -2424,9 +2424,14 @@
         {
             zone_names.push(String::from(zone.name()))
         }
-        for zone in self.inner.zones.lock().await.iter() {
-            zone_names.push(String::from(zone.name()));
-        }
+        zone_names.extend(
+            self.inner
+                .zones
+                .lock()
+                .await
+                .values()
+                .map(|zone| zone.name().to_string()),
+        );
         zone_names.sort();
         Ok(zone_names)
     }
@@ -2480,7 +2485,7 @@ impl ServiceManager {
     // re-instantiated on boot.
     async fn ensure_all_services(
         &self,
-        existing_zones: &mut MutexGuard<'_, Vec<RunningZone>>,
+        existing_zones: &mut MutexGuard<'_, BTreeMap<String, RunningZone>>,
         old_request: &AllZoneRequests,
         request: ServiceEnsureBody,
     ) -> Result<AllZoneRequests, Error> {
@@ -2502,11 +2507,7 @@ impl ServiceManager {
         // Destroy zones that should not be running
         for zone in zones_to_be_removed {
            let expected_zone_name = zone.zone_name();
-            if let Some(idx) = existing_zones
-                .iter()
-                .position(|z| z.name() == expected_zone_name)
-            {
-                let mut zone = existing_zones.remove(idx);
+            if let Some(mut zone) = existing_zones.remove(&expected_zone_name) {
                 if let Err(e) = zone.stop().await {
                     error!(log, "Failed to stop zone {}: {e}", zone.name());
                 }
@@ -2520,6 +2521,36 @@ impl ServiceManager {
         let all_u2_roots =
             self.inner.storage.all_u2_mountpoints(ZONE_DATASET).await;
         for zone in zones_to_be_added {
+            // Check if we think the zone should already be running
+            let name = zone.zone_name();
+            if existing_zones.contains_key(&name) {
+                // Make sure the zone actually exists in the right state too
+                match Zones::find(&name).await {
+                    Ok(Some(zone)) if zone.state() == zone::State::Running => {
+                        info!(log, "skipping running zone"; "zone" => &name);
+                        continue;
+                    }
+                    _ => {
+                        // Mismatch between SA's view and reality, let's try to
+                        // clean up any remnants and try to initialize it again
+                        warn!(
+                            log,
+                            "expected to find existing zone in running state";
+                            "zone" => &name,
+                        );
+                        if let Err(e) =
+                            existing_zones.remove(&name).unwrap().stop().await
+                        {
+                            error!(
+                                log,
+                                "Failed to stop zone";
+                                "zone" => &name,
+                                "error" => %e,
+                            );
+                        }
+                    }
+                }
+            }
             // For each new zone request, we pick an arbitrary U.2 to store
             // the zone filesystem. Note: This isn't known to Nexus right now,
             // so it's a local-to-sled decision.
@@ -2552,7 +2583,7 @@ impl ServiceManager {
     pub async fn cockroachdb_initialize(&self) -> Result<(), Error> {
         let log = &self.inner.log;
         let dataset_zones = self.inner.zones.lock().await;
-        for zone in dataset_zones.iter() {
+        for zone in dataset_zones.values() {
             // TODO: We could probably store the ZoneKind in the running zone to
             // make this "comparison to existing zones by name" mechanism a bit
             // safer.
@@ -2608,7 +2639,10 @@ impl ServiceManager {
         Ok(())
     }

-    pub fn boottime_rewrite(&self, zones: &Vec<RunningZone>) {
+    pub fn boottime_rewrite<'a>(
+        &self,
+        zones: impl Iterator<Item = &'a RunningZone>,
+    ) {
         if self
             .inner
             .time_synced
@@ -2626,7 +2660,6 @@
         info!(self.inner.log, "Setting boot time to {:?}", now);

         let files: Vec<Utf8PathBuf> = zones
-            .iter()
             .map(|z| z.root())
             .chain(iter::once(Utf8PathBuf::from("/")))
             .flat_map(|r| [r.join("var/adm/utmpx"), r.join("var/adm/wtmpx")])
@@ -2655,7 +2688,7 @@ impl ServiceManager {

         if let Some(true) = self.inner.skip_timesync {
             info!(self.inner.log, "Configured to skip timesync checks");
-            self.boottime_rewrite(&existing_zones);
+            self.boottime_rewrite(existing_zones.values());
             return Ok(TimeSync { sync: true, skew: 0.00, correction: 0.00 });
         };

@@ -2664,8 +2697,9 @@ impl ServiceManager {

         let ntp_zone = existing_zones
             .iter()
-            .find(|z| z.name().starts_with(&ntp_zone_name))
-            .ok_or_else(|| Error::NtpZoneNotReady)?;
+            .find(|(name, _)| name.starts_with(&ntp_zone_name))
+            .ok_or_else(|| Error::NtpZoneNotReady)?
+            .1;

         // XXXNTP - This could be replaced with a direct connection to the
         // daemon using a patched version of the chrony_candm crate to allow
@@ -2687,7 +2721,7 @@ impl ServiceManager {
             && correction.abs() <= 0.05;

         if sync {
-            self.boottime_rewrite(&existing_zones);
+            self.boottime_rewrite(existing_zones.values());
         }

         Ok(TimeSync { sync, skew, correction })
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 7ee5402f4c..5b38db58a7 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -210,6 +210,7 @@ impl SledAgentInner {
 #[derive(Clone)]
 pub struct SledAgent {
     inner: Arc<SledAgentInner>,
+    log: Logger,
 }

 impl SledAgent {
@@ -351,6 +352,7 @@ impl SledAgent {
                 // Also, we could maybe de-dup some of the backoff code in the request queue?
                 nexus_request_queue: NexusRequestQueue::new(),
             }),
+            log: log.clone(),
         };

         // We immediately add a notification to the request queue about our
@@ -475,6 +477,10 @@ impl SledAgent {
         self.inner.id
     }

+    pub fn logger(&self) -> &Logger {
+        &self.log
+    }
+
     // Sends a request to Nexus informing it that the current sled exists.
     fn notify_nexus_about_self(&self, log: &Logger) {
         let sled_id = self.inner.id;
diff --git a/tools/create_virtual_hardware.sh b/tools/create_virtual_hardware.sh
index ff2904c1d4..40356c2397 100755
--- a/tools/create_virtual_hardware.sh
+++ b/tools/create_virtual_hardware.sh
@@ -47,7 +47,7 @@ function ensure_zpools {
         echo "Zpool: [$ZPOOL]"
         VDEV_PATH="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev"
         if ! [[ -f "$VDEV_PATH" ]]; then
-            dd if=/dev/zero of="$VDEV_PATH" bs=1 count=0 seek=10G
+            dd if=/dev/zero of="$VDEV_PATH" bs=1 count=0 seek=15G
         fi
         success "ZFS vdev $VDEV_PATH exists"
         if [[ -z "$(zpool list -o name | grep $ZPOOL)" ]]; then
diff --git a/tools/destroy_virtual_hardware.sh b/tools/destroy_virtual_hardware.sh
index a5b0b78ac4..250eb542f4 100755
--- a/tools/destroy_virtual_hardware.sh
+++ b/tools/destroy_virtual_hardware.sh
@@ -108,7 +108,7 @@ function try_destroy_zpools {
     for ZPOOL in "${ZPOOLS[@]}"; do
         VDEV_FILE="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev"
         zfs destroy -r "$ZPOOL" && \
-            zfs unmount "$ZPOOL" && \
+            (zfs unmount "$ZPOOL" || true) && \
             zpool destroy "$ZPOOL" && \
             rm -f "$VDEV_FILE" || \
             warn "Failed to remove ZFS pool and vdev: $ZPOOL"