diff --git a/docs/how-to-run.adoc b/docs/how-to-run.adoc index e6da695685..1494907939 100644 --- a/docs/how-to-run.adoc +++ b/docs/how-to-run.adoc @@ -172,7 +172,7 @@ The rest of these instructions assume that you're building and running Omicron o The Sled Agent supports operation on both: * a Gimlet (i.e., real Oxide hardware), and -* an ordinary PC that's been set up to look like a Gimlet using the `./tools_create_virtual_hardware.sh` script. +* an ordinary PC that's been set up to look like a Gimlet using the `./tools/create_virtual_hardware.sh` script. This script also sets up a "softnpu" zone to implement Boundary Services. SoftNPU simulates the Tofino device that's used in real systems. Just like Tofino, it can implement sled-to-sled networking, but that's beyond the scope of this doc. @@ -373,7 +373,9 @@ $ dig recovery.sys.oxide.test @192.168.1.20 +short 192.168.1.21 ---- -Where did 192.168.1.20 come from? That's the external address of the external DNS server. We knew that only because it's the first address in the "internal services" IP pool in config-rss.toml. +Where did 192.168.1.20 come from? That's the external address of the external +DNS server. We knew that because it's listed in the `external_dns_ips` entry of +the `config-rss.toml` file we're using. Having looked this up, the easiest thing will be to use `http://192.168.1.21` for your URL (replacing with `https` if you used a certificate, and replacing that IP if needed). If you've set up networking right, you should be able to reach this from your web browser. You may have to instruct the browser to accept a self-signed TLS certificate. See also <<_connecting_securely_with_tls_using_the_cli>>. @@ -392,12 +394,19 @@ An IP pool is needed to provide external connectivity to Instances. The address [source,console] ---- -$ oxide api /v1/system/ip-pools/default/ranges/add --method POST --input - < Result, ServiceError> { - const NEEDLES: [&str; 2] = ["/oxide", "/system/illumos"]; let output = self.run_cmd(&["svcs", "-H", "-o", "fmri"])?; Ok(output .lines() - .filter(|line| NEEDLES.iter().any(|needle| line.contains(needle))) + .filter(|line| is_oxide_smf_log_file(line)) .map(|line| line.trim().to_string()) .collect()) } @@ -1191,3 +1190,11 @@ impl InstalledZone { path } } + +/// Return true if the named file appears to be a log file for an Oxide SMF +/// service. 
+pub fn is_oxide_smf_log_file(name: impl AsRef<str>) -> bool { + const SMF_SERVICE_PREFIXES: [&str; 2] = ["/oxide", "/system/illumos"]; + let name = name.as_ref(); + SMF_SERVICE_PREFIXES.iter().any(|needle| name.contains(needle)) +} diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 79ae3f645e..2fe887f2fe 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -10,6 +10,34 @@ "version": "0.0.1" }, "paths": { + "/all-zone-bundles": { + "get": { + "summary": "List all zone bundles that exist, even for now-deleted zones.", + "operationId": "zone_bundle_list_all", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_ZoneBundleMetadata", + "type": "array", + "items": { + "$ref": "#/components/schemas/ZoneBundleMetadata" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/cockroachdb": { "post": { "summary": "Initializes a CockroachDB cluster", @@ -528,7 +556,7 @@ }, "/zones/{zone_name}/bundles": { "get": { - "summary": "List the zone bundles that are current available for a zone.", + "summary": "List the zone bundles that are available for a running zone.", "operationId": "zone_bundle_list", "parameters": [ { @@ -639,6 +667,42 @@ "$ref": "#/components/responses/Error" } } + }, + "delete": { + "summary": "Delete a zone bundle.", + "operationId": "zone_bundle_delete", + "parameters": [ + { + "in": "path", + "name": "bundle_id", + "description": "The ID for this bundle itself.", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + }, + { + "in": "path", + "name": "zone_name", + "description": "The name of the zone this bundle is derived from.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "204": { + "description": "successful deletion" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } } }, "/zpools": { @@ -2654,6 +2718,39 @@ "vni" ] }, + "ZoneBundleCause": { + "description": "The reason or cause for a zone bundle, i.e., why it was created.", + "oneOf": [ + { + "description": "Generated in response to an explicit request to the sled agent.", + "type": "string", + "enum": [ + "explicit_request" + ] + }, + { + "description": "A zone bundle taken when a sled agent finds a zone that it does not expect to be running.", + "type": "string", + "enum": [ + "unexpected_zone" + ] + }, + { + "description": "An instance zone was terminated.", + "type": "string", + "enum": [ + "terminated_instance" + ] + }, + { + "description": "Some other, unspecified reason.", + "type": "string", + "enum": [ + "other" + ] + } + ] + }, "ZoneBundleId": { "description": "An identifier for a zone bundle.", "type": "object", @@ -2677,6 +2774,14 @@ "description": "Metadata about a zone bundle.", "type": "object", "properties": { + "cause": { + "description": "The reason or cause a bundle was created.", + "allOf": [ + { + "$ref": "#/components/schemas/ZoneBundleCause" + } + ] + }, "id": { "description": "Identifier for this zone bundle", "allOf": [ @@ -2689,11 +2794,19 @@ "description": "The time at which this zone bundle was created.", "type": "string", "format": "date-time" + }, + "version": { + "description": "A version number for this zone bundle.", + "type": "integer", + "format": "uint8", + "minimum": 0 } }, "required": [ + "cause", "id", - "time_created" + "time_created", +
"version" ] }, "ZoneType": { diff --git a/schema/zone-bundle-metadata.json b/schema/zone-bundle-metadata.json index 83e8faafac..561d5f42d3 100644 --- a/schema/zone-bundle-metadata.json +++ b/schema/zone-bundle-metadata.json @@ -4,10 +4,20 @@ "description": "Metadata about a zone bundle.", "type": "object", "required": [ + "cause", "id", - "time_created" + "time_created", + "version" ], "properties": { + "cause": { + "description": "The reason or cause a bundle was created.", + "allOf": [ + { + "$ref": "#/definitions/ZoneBundleCause" + } + ] + }, "id": { "description": "Identifier for this zone bundle", "allOf": [ @@ -20,9 +30,48 @@ "description": "The time at which this zone bundle was created.", "type": "string", "format": "date-time" + }, + "version": { + "description": "A version number for this zone bundle.", + "type": "integer", + "format": "uint8", + "minimum": 0.0 } }, "definitions": { + "ZoneBundleCause": { + "description": "The reason or cause for a zone bundle, i.e., why it was created.", + "oneOf": [ + { + "description": "Generated in response to an explicit request to the sled agent.", + "type": "string", + "enum": [ + "explicit_request" + ] + }, + { + "description": "A zone bundle taken when a sled agent finds a zone that it does not expect to be running.", + "type": "string", + "enum": [ + "unexpected_zone" + ] + }, + { + "description": "An instance zone was terminated.", + "type": "string", + "enum": [ + "terminated_instance" + ] + }, + { + "description": "Some other, unspecified reason.", + "type": "string", + "enum": [ + "other" + ] + } + ] + }, "ZoneBundleId": { "description": "An identifier for a zone bundle.", "type": "object", diff --git a/sled-agent/src/bin/zone-bundle.rs b/sled-agent/src/bin/zone-bundle.rs index d99bbf06b7..f1492a7c52 100644 --- a/sled-agent/src/bin/zone-bundle.rs +++ b/sled-agent/src/bin/zone-bundle.rs @@ -46,15 +46,54 @@ struct Cli { cmd: Cmd, } +#[derive(Clone, Copy, Debug, clap::ValueEnum)] +enum ListFields { + ZoneName, + BundleId, + TimeCreated, + Cause, + Version, +} + +impl std::fmt::Display for ListFields { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + use ListFields::*; + match self { + ZoneName => write!(f, "zone-name"), + BundleId => write!(f, "bundle-id"), + TimeCreated => write!(f, "time-created"), + Cause => write!(f, "cause"), + Version => write!(f, "version"), + } + } +} + +impl ListFields { + fn all() -> Vec { + use ListFields::*; + vec![ZoneName, BundleId, TimeCreated] + } +} + #[derive(Clone, Debug, Subcommand)] enum Cmd { /// List the zones available for collecting bundles from. ListZones, - /// List existing bundles for a zone. + /// List existing bundles for a zone or all zones. #[clap(visible_alias = "ls")] List { - /// The name of the zone to list bundles for. - zone_name: String, + /// A filter for the zones whose bundles should be listed. + /// + /// If provided, this is used to filter the existing zones. Any zone + /// with a name containing the provided substring will be used, and its + /// zone bundles listed. + filter: Option, + /// Generate parseable output. + #[arg(long, short, default_value_t = false)] + parseable: bool, + /// Fields to print. + #[arg(long, short = 'o', default_values_t = ListFields::all(), value_delimiter = ',')] + fields: Vec, }, /// Request the sled agent create a new zone bundle. Create { @@ -66,10 +105,10 @@ enum Cmd { /// The name of the zone to fetch the bundle for. zone_name: String, /// The ID of the bundle to fetch. 
- #[arg(long, group = "id")] + #[arg(long, group = "id", required = true)] bundle_id: Option, /// Create a new bundle, and then fetch it. - #[arg(long, group = "id")] + #[arg(long, group = "id", required = true)] create: bool, /// The output file. /// @@ -77,6 +116,14 @@ enum Cmd { #[arg(long)] output: Option, }, + /// Delete a zone bundle. + #[clap(visible_aliases = ["del", "rm"])] + Delete { + /// The name of the zone to delete a bundle for. + zone_name: String, + /// The ID of the bundle to delete. + bundle_id: Uuid, + }, } #[tokio::main] @@ -87,7 +134,7 @@ async fn main() -> anyhow::Result<()> { let drain = FullFormat::new(decorator).build().fuse(); let drain = slog_async::Async::new(drain).build().fuse(); let drain = LevelFilter::new(drain, args.log_level).fuse(); - let log = Logger::root(drain, slog::o!("unit" => "zb")); + let log = Logger::root(drain, slog::o!("unit" => "zone-bundle")); let client = Client::new(&addr, log); match args.cmd { Cmd::ListZones => { @@ -100,19 +147,94 @@ async fn main() -> anyhow::Result<()> { println!("{zone}"); } } - Cmd::List { zone_name } => { + Cmd::List { filter, parseable, fields } => { let bundles = client - .zone_bundle_list(&zone_name) + .zone_bundle_list_all() .await .context("failed to list zone bundles")? - .into_inner(); - for bundle in bundles { - println!( - "{}/{} {}", - bundle.id.zone_name, - bundle.id.bundle_id, - bundle.time_created - ); + .into_inner() + .into_iter() + .filter(|bundle| { + if let Some(filter) = &filter { + bundle.id.zone_name.contains(filter) + } else { + true + } + }) + .collect::>(); + if bundles.is_empty() { + return Ok(()); + } + if parseable { + for bundle in bundles { + let line = fields + .iter() + .map(|field| match field { + ListFields::ZoneName => bundle.id.zone_name.clone(), + ListFields::BundleId => { + bundle.id.bundle_id.to_string() + } + ListFields::TimeCreated => { + bundle.time_created.to_rfc3339() + } + ListFields::Cause => format!("{:?}", bundle.cause), + ListFields::Version => bundle.version.to_string(), + }) + .collect::>() + .join(","); + println!("{line}"); + } + } else { + const ZONE_NAME_WIDTH: usize = 64; + const BUNDLE_ID_WIDTH: usize = 36; + const TIMESTAMP_WIDTH: usize = 34; + const CAUSE_WIDTH: usize = 20; + const VERSION_WIDTH: usize = 7; + for field in fields.iter() { + match field { + ListFields::ZoneName => { + print!("{:ZONE_NAME_WIDTH$} ", "Zone") + } + ListFields::BundleId => { + print!("{:BUNDLE_ID_WIDTH$} ", "Bundle ID") + } + ListFields::TimeCreated => { + print!("{:TIMESTAMP_WIDTH$} ", "Created") + } + ListFields::Cause => { + print!("{:CAUSE_WIDTH$} ", "Cause") + } + ListFields::Version => { + print!("{:VERSION_WIDTH$} ", "Version") + } + } + } + println!(); + for bundle in bundles { + for field in fields.iter() { + match field { + ListFields::ZoneName => print!( + "{:ZONE_NAME_WIDTH$} ", + bundle.id.zone_name + ), + ListFields::BundleId => print!( + "{:BUNDLE_ID_WIDTH$} ", + bundle.id.bundle_id + ), + ListFields::TimeCreated => print!( + "{:TIMESTAMP_WIDTH$} ", + bundle.time_created + ), + ListFields::Cause => { + print!("{:CAUSE_WIDTH$?} ", bundle.cause,) + } + ListFields::Version => { + print!("{:VERSION_WIDTH$} ", bundle.version,) + } + } + } + println!(); + } } } Cmd::Create { zone_name } => { @@ -165,6 +287,12 @@ async fn main() -> anyhow::Result<()> { .context("failed to write bundle data")?; } } + Cmd::Delete { zone_name, bundle_id } => { + client + .zone_bundle_delete(&zone_name, &bundle_id) + .await + .context("failed to delete zone bundle")?; + } } Ok(()) } diff 
--git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 3b528b495a..2b6551f899 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -12,8 +12,8 @@ use crate::params::{ }; use dropshot::{ endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseCreated, - HttpResponseHeaders, HttpResponseOk, HttpResponseUpdatedNoContent, Path, - RequestContext, TypedBody, + HttpResponseDeleted, HttpResponseHeaders, HttpResponseOk, + HttpResponseUpdatedNoContent, Path, RequestContext, TypedBody, }; use illumos_utils::opte::params::SetVirtualNetworkInterfaceHost; use omicron_common::api::external::Error; @@ -41,8 +41,10 @@ pub fn api() -> SledApiDescription { api.register(services_put)?; api.register(zones_list)?; api.register(zone_bundle_list)?; + api.register(zone_bundle_list_all)?; api.register(zone_bundle_create)?; api.register(zone_bundle_get)?; + api.register(zone_bundle_delete)?; api.register(sled_role_get)?; api.register(set_v2p)?; api.register(del_v2p)?; @@ -67,7 +69,22 @@ struct ZonePathParam { zone_name: String, } -/// List the zone bundles that are current available for a zone. +/// List all zone bundles that exist, even for now-deleted zones. +#[endpoint { + method = GET, + path = "/all-zone-bundles", +}] +async fn zone_bundle_list_all( + rqctx: RequestContext<SledAgent>, +) -> Result<HttpResponseOk<Vec<ZoneBundleMetadata>>, HttpError> { + let sa = rqctx.context(); + sa.list_all_zone_bundles() + .await + .map(HttpResponseOk) + .map_err(HttpError::from) +} + +/// List the zone bundles that are available for a running zone. #[endpoint { method = GET, path = "/zones/{zone_name}/bundles", }] @@ -139,6 +156,35 @@ async fn zone_bundle_get( Ok(response) } +/// Delete a zone bundle. +#[endpoint { + method = DELETE, + path = "/zones/{zone_name}/bundles/{bundle_id}", +}] +async fn zone_bundle_delete( + rqctx: RequestContext<SledAgent>, + params: Path<ZoneBundleId>, +) -> Result<HttpResponseDeleted, HttpError> { + let params = params.into_inner(); + let zone_name = params.zone_name; + let bundle_id = params.bundle_id; + let sa = rqctx.context(); + let Some(path) = sa.get_zone_bundle_path(&zone_name, &bundle_id) + .await + .map_err(HttpError::from)? else { + return Err(HttpError::for_not_found( + None, + format!("No zone bundle for zone '{}' with ID '{}'", zone_name, bundle_id))); + }; + tokio::fs::remove_file(&path).await.map(|_| HttpResponseDeleted()).map_err( + |e| { + HttpError::for_internal_error(format!( + "Failed to delete zone bundle: {e}" + )) + }, + ) +} + /// List the zones that are currently managed by the sled agent. #[endpoint { method = GET, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 8cb21140d7..84bc33babf 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -34,6 +34,10 @@ use omicron_common::api::internal::shared::{ }; use omicron_common::backoff; //use propolis_client::generated::DiskRequest; +use crate::params::ZoneBundleCause; +use crate::params::ZoneBundleMetadata; +use crate::zone_bundle; +use crate::zone_bundle::BundleError; use propolis_client::Client as PropolisClient; use rand::prelude::SliceRandom; use rand::SeedableRng; @@ -151,7 +155,9 @@ fn fmri_name() -> String { format!("{}:default", service_name()) } -fn propolis_zone_name(id: &Uuid) -> String { +/// Return the expected name of a Propolis zone managing an instance with the +/// provided ID.
+pub fn propolis_zone_name(id: &Uuid) -> String { format!("{}{}", PROPOLIS_ZONE_PREFIX, id) } @@ -526,12 +532,47 @@ impl InstanceInner { /// This routine is safe to call even if the instance's zone was never /// started. It is also safe to call multiple times on a single instance. async fn terminate(&mut self) -> Result<(), Error> { + let zname = propolis_zone_name(self.propolis_id()); + + // First fetch the running state. + // + // If there is nothing here, then there is no `RunningZone`, and so + // there's no zone or resources to clean up at all. + let mut running_state = if let Some(state) = self.running_state.take() { + state + } else { + debug!( + self.log, + "Instance::terminate() called with no running state" + ); + return Ok(()); + }; + + // Take a zone bundle whenever this instance stops. + let context = self + .storage + .zone_bundle_context(&zname, ZoneBundleCause::TerminatedInstance) + .await; + if let Err(e) = zone_bundle::create( + &self.log, + &running_state.running_zone, + &context, + ) + .await + { + error!( + self.log, + "Failed to take zone bundle for terminated instance"; + "zone_name" => &zname, + "reason" => ?e, + ); + } + // Ensure that no zone exists. This succeeds even if no zone was ever // created. // NOTE: we call`Zones::halt_and_remove_logged` directly instead of // `RunningZone::stop` in case we're called between creating the // zone and assigning `running_state`. - let zname = propolis_zone_name(self.propolis_id()); warn!(self.log, "Halting and removing zone: {}", zname); Zones::halt_and_remove_logged(&self.log, &zname).await.unwrap(); @@ -539,12 +580,7 @@ impl InstanceInner { self.instance_ticket.terminate(); // See if there are any runtime objects to clean up. - let mut running_state = if let Some(state) = self.running_state.take() { - state - } else { - return Ok(()); - }; - + // // We already removed the zone above but mark it as stopped running_state.running_zone.stop().await.unwrap(); @@ -627,6 +663,33 @@ impl Instance { Ok(Instance { inner }) } + /// Create bundle from an instance zone. + pub async fn request_zone_bundle( + &self, + ) -> Result { + let inner = self.inner.lock().await; + let name = propolis_zone_name(inner.propolis_id()); + match &*inner { + InstanceInner { running_state: None, .. } => { + Err(BundleError::Unavailable { name }) + } + InstanceInner { + ref log, + running_state: Some(RunningState { ref running_zone, .. }), + .. + } => { + let context = inner + .storage + .zone_bundle_context( + &name, + ZoneBundleCause::ExplicitRequest, + ) + .await; + zone_bundle::create(log, running_zone, &context).await + } + } + } + pub async fn current_state(&self) -> InstanceRuntimeState { let inner = self.inner.lock().await; inner.state.current().clone() diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 6b21abf831..2be8223bce 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -4,13 +4,16 @@ //! API for controlling multiple instances on a sled. 
+use crate::instance::propolis_zone_name; use crate::instance::Instance; use crate::nexus::NexusClientWithResolver; +use crate::params::ZoneBundleMetadata; use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, }; use crate::storage_manager::StorageResources; +use crate::zone_bundle::BundleError; use illumos_utils::dladm::Etherstub; use illumos_utils::link::VnicAllocator; use illumos_utils::opte::PortManager; @@ -38,6 +41,9 @@ pub enum Error { #[error("Cannot find data link: {0}")] Underlay(#[from] sled_hardware::underlay::Error), + + #[error("Zone bundle error")] + ZoneBundle(#[from] BundleError), } struct InstanceManagerInternal { @@ -320,6 +326,33 @@ impl InstanceManager { .await .map_err(Error::from) } + + /// Create a zone bundle from a named instance zone, if it exists. + pub async fn create_zone_bundle( + &self, + name: &str, + ) -> Result { + // We need to find the instance and take its lock, but: + // + // 1. The instance-map lock is sync, and + // 2. we don't want to hold the instance-map lock for the entire + // bundling duration. + // + // Instead, we cheaply clone the instance through its `Arc` around the + // `InstanceInner`, which is ultimately what we want. + let Some((_propolis_id, instance)) = self + .inner + .instances + .lock() + .unwrap() + .values() + .find(|(propolis_id, _instance)| name == propolis_zone_name(propolis_id)) + .cloned() + else { + return Err(BundleError::NoSuchZone { name: name.to_string() }); + }; + instance.request_zone_bundle().await + } } /// Represents membership of an instance in the [`InstanceManager`]. diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index cb2f02ccf2..5c4dbd8310 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -34,6 +34,7 @@ pub(crate) mod storage; mod storage_manager; mod swap_device; mod updates; +mod zone_bundle; #[cfg(test)] mod fakes; diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 381c20d7fe..64a3bf520a 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -863,6 +863,36 @@ pub struct ZoneBundleId { pub bundle_id: Uuid, } +/// The reason or cause for a zone bundle, i.e., why it was created. +#[derive( + Clone, + Copy, + Debug, + Default, + Deserialize, + Eq, + Hash, + JsonSchema, + Ord, + PartialEq, + PartialOrd, + Serialize, +)] +#[serde(rename_all = "snake_case")] +#[non_exhaustive] +pub enum ZoneBundleCause { + /// Generated in response to an explicit request to the sled agent. + ExplicitRequest, + /// A zone bundle taken when a sled agent finds a zone that it does not + /// expect to be running. + UnexpectedZone, + /// An instance zone was terminated. + TerminatedInstance, + /// Some other, unspecified reason. + #[default] + Other, +} + /// Metadata about a zone bundle. #[derive( Clone, @@ -881,17 +911,25 @@ pub struct ZoneBundleMetadata { pub id: ZoneBundleId, /// The time at which this zone bundle was created. pub time_created: DateTime, + /// A version number for this zone bundle. + pub version: u8, + /// The reason or cause a bundle was created. + pub cause: ZoneBundleCause, } impl ZoneBundleMetadata { + const VERSION: u8 = 0; + /// Create a new set of metadata for the provided zone. 
- pub(crate) fn new(zone_name: &str) -> Self { + pub(crate) fn new(zone_name: &str, cause: ZoneBundleCause) -> Self { Self { id: ZoneBundleId { zone_name: zone_name.to_string(), bundle_id: Uuid::new_v4(), }, time_created: Utc::now(), + version: Self::VERSION, + cause, } } } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 13922af2f6..670a7d7b69 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -31,18 +31,20 @@ use crate::bootstrap::early_networking::{ use crate::config::SidecarRevision; use crate::params::{ DendriteAsic, ServiceEnsureBody, ServiceType, ServiceZoneRequest, - ServiceZoneService, TimeSync, ZoneBundleMetadata, ZoneType, + ServiceZoneService, TimeSync, ZoneBundleCause, ZoneBundleMetadata, + ZoneType, }; use crate::profile::*; use crate::smf_helper::Service; use crate::smf_helper::SmfHelper; use crate::storage_manager::StorageResources; +use crate::zone_bundle; +use crate::zone_bundle::BundleError; use anyhow::anyhow; use camino::{Utf8Path, Utf8PathBuf}; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; use dpd_client::{types as DpdTypes, Client as DpdClient, Error as DpdError}; use dropshot::HandlerTaskMode; -use flate2::bufread::GzDecoder; use illumos_utils::addrobj::AddrObject; use illumos_utils::addrobj::IPV6_LINK_LOCAL_NAME; use illumos_utils::dladm::{ @@ -91,10 +93,8 @@ use sled_hardware::underlay::BOOTSTRAP_PREFIX; use sled_hardware::Baseboard; use sled_hardware::SledMode; use slog::Logger; -use std::collections::BTreeSet; use std::collections::HashSet; use std::collections::{BTreeMap, HashMap}; -use std::io::Cursor; use std::iter; use std::iter::FromIterator; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; @@ -102,9 +102,6 @@ use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; -use tar::Archive; -use tar::Builder; -use tar::Header; use tokio::io::AsyncWriteExt; use tokio::sync::oneshot; use tokio::sync::Mutex; @@ -215,9 +212,6 @@ pub enum Error { #[error("Sidecar revision error")] SidecarRevision(#[from] anyhow::Error), - #[error("Zone bundle error")] - Bundle(#[from] BundleError), - #[error("Early networking setup error")] EarlyNetworkSetupError(#[from] EarlyNetworkSetupError), @@ -257,30 +251,6 @@ fn display_zone_init_errors(errors: &[(String, Box)]) -> String { output } -#[derive(Debug, thiserror::Error)] -pub enum BundleError { - #[error("I/O error")] - Io(#[from] std::io::Error), - - #[error("TOML serialization failure")] - Serialization(#[from] toml::ser::Error), - - #[error("TOML deserialization failure")] - Deserialization(#[from] toml::de::Error), - - #[error("No zone named '{name}' is available for bundling")] - NoSuchZone { name: String }, - - #[error("No storage available for bundles")] - NoStorage, - - #[error("Failed to join zone bundling task")] - Task(#[from] tokio::task::JoinError), - - #[error("Failed to create bundle")] - BundleFailed(#[from] anyhow::Error), -} - /// Configuration parameters which modify the [`ServiceManager`]'s behavior. pub struct Config { /// Identifies the sled being configured @@ -299,15 +269,6 @@ impl Config { // The filename of the ledger, within the provided directory. const SERVICES_LEDGER_FILENAME: &str = "services.json"; -// The directory within the debug dataset in which bundles are created. -const BUNDLE_DIRECTORY: &str = "bundle"; - -// The directory for zone bundles. -const ZONE_BUNDLE_DIRECTORY: &str = "zone"; - -// The name for zone bundle metadata files. 
-const ZONE_BUNDLE_METADATA_FILENAME: &str = "metadata.toml"; - // A wrapper around `ZoneRequest`, which allows it to be serialized // to a JSON file. #[derive(Clone, serde::Serialize, serde::Deserialize, schemars::JsonSchema)] @@ -503,32 +464,6 @@ impl ServiceManager { self.inner.switch_zone_bootstrap_address } - // Return the directories for storing debug information. - async fn all_debug_directories(&self) -> Vec { - self.inner - .storage - .all_m2_mountpoints(sled_hardware::disk::DEBUG_DATASET) - .await - } - - // Return the directories for storing all service bundles. - async fn all_service_bundle_directories(&self) -> Vec { - self.all_debug_directories() - .await - .into_iter() - .map(|p| p.join(BUNDLE_DIRECTORY)) - .collect() - } - - // Return the directories for storing zone service bundles. - async fn all_zone_bundle_directories(&self) -> Vec { - self.all_service_bundle_directories() - .await - .into_iter() - .map(|p| p.join(ZONE_BUNDLE_DIRECTORY)) - .collect() - } - async fn all_service_ledgers(&self) -> Vec { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(SERVICES_LEDGER_FILENAME)]; @@ -2074,472 +2009,39 @@ impl ServiceManager { Ok(()) } - // Create a zone bundle for the named running zone. - async fn create_zone_bundle_impl( - &self, - zone: &RunningZone, - ) -> Result { - // Fetch the directory into which we'll store data, and ensure it - // exists. - let log = &self.inner.log; - let directories = self.all_zone_bundle_directories().await; - if directories.is_empty() { - warn!(log, "no directories available for zone bundles"); - return Err(BundleError::NoStorage); - } - info!( - log, - "creating zone bundle"; - "zone" => zone.name(), - "directories" => ?directories, - ); - let mut zone_bundle_dirs = Vec::with_capacity(directories.len()); - for dir in directories.iter() { - let bundle_dir = dir.join(zone.name()); - debug!(log, "creating bundle directory"; "dir" => %bundle_dir); - tokio::fs::create_dir_all(&bundle_dir).await?; - zone_bundle_dirs.push(bundle_dir); - } - - // Create metadata and the tarball writer. - // - // We'll write the contents of the bundle into a gzipped tar archive, - // including metadata and a file for the output of each command we run - // in the zone. - let zone_metadata = ZoneBundleMetadata::new(zone.name()); - let filename = format!("{}.tar.gz", zone_metadata.id.bundle_id); - let full_path = zone_bundle_dirs[0].join(&filename); - let file = match tokio::fs::OpenOptions::new() - .read(true) - .write(true) - .create(true) - .open(&full_path) - .await - { - Ok(f) => f.into_std().await, - Err(e) => { - error!( - log, - "failed to create bundle file"; - "zone" => zone.name(), - "file" => %full_path, - "error" => ?e, - ); - return Err(BundleError::from(e)); - } - }; - debug!( - log, - "created bundle tarball file"; - "zone" => zone.name(), - "path" => %full_path - ); - let gz = flate2::GzBuilder::new() - .filename(filename.as_str()) - .write(file, flate2::Compression::best()); - let mut builder = Builder::new(gz); - - // Helper function to write an array of bytes into the tar archive, with - // the provided name. - fn insert_data( - builder: &mut Builder, - name: &str, - contents: &[u8], - ) -> Result<(), BundleError> { - let mtime = SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .map_err(|e| anyhow::anyhow!("failed to compute mtime: {e}"))? 
- .as_secs(); - - let mut hdr = Header::new_ustar(); - hdr.set_size(contents.len().try_into().unwrap()); - hdr.set_mode(0o444); - hdr.set_mtime(mtime); - hdr.set_entry_type(tar::EntryType::Regular); - // NOTE: This internally sets the path and checksum. - builder - .append_data(&mut hdr, name, Cursor::new(contents)) - .map_err(BundleError::from) - } - - // Write the metadata file itself, in TOML format. - let contents = toml::to_string(&zone_metadata)?; - insert_data( - &mut builder, - ZONE_BUNDLE_METADATA_FILENAME, - contents.as_bytes(), - )?; - debug!( - log, - "wrote zone bundle metadata"; - "zone" => zone.name(), - ); - - // The set of zone-wide commands, which don't require any details about - // the processes we've launched in the zone. - const ZONE_WIDE_COMMANDS: [&[&str]; 6] = [ - &["ptree"], - &["uptime"], - &["last"], - &["who"], - &["svcs", "-p"], - &["netstat", "-an"], - ]; - for cmd in ZONE_WIDE_COMMANDS { - debug!( - log, - "running zone bundle command"; - "zone" => zone.name(), - "command" => ?cmd, - ); - let output = match zone.run_cmd(cmd) { - Ok(s) => s, - Err(e) => format!("{}", e), - }; - let contents = - format!("Command: {:?}\n{}", cmd, output).into_bytes(); - if let Err(e) = insert_data(&mut builder, cmd[0], &contents) { - error!( - log, - "failed to save zone bundle command output"; - "zone" => zone.name(), - "command" => ?cmd, - "error" => ?e, - ); - } - } - - // Debugging commands run on the specific processes this zone defines. - const ZONE_PROCESS_COMMANDS: [&str; 3] = [ - "pfiles", "pstack", - "pargs", - // TODO-completeness: We may want `gcore`, since that encompasses - // the above commands and much more. It seems like overkill now, - // however. - ]; - let procs = match zone.service_processes() { - Ok(p) => { - debug!( - log, - "enumerated service processes"; - "zone" => zone.name(), - "procs" => ?p, - ); - p - } - Err(e) => { - error!( - log, - "failed to enumerate zone service processes"; - "zone" => zone.name(), - "error" => ?e, - ); - let err = anyhow::anyhow!( - "failed to enumerate zone service processes: {e}" - ); - return Err(BundleError::from(err)); - } - }; - for svc in procs.into_iter() { - let pid_s = svc.pid.to_string(); - for cmd in ZONE_PROCESS_COMMANDS { - let args = &[cmd, &pid_s]; - debug!( - log, - "running zone bundle command"; - "zone" => zone.name(), - "command" => ?args, - ); - let output = match zone.run_cmd(args) { - Ok(s) => s, - Err(e) => format!("{}", e), - }; - let contents = - format!("Command: {:?}\n{}", args, output).into_bytes(); - - // There may be multiple Oxide service processes for which we - // want to capture the command output. Name each output after - // the command and PID to disambiguate. - let filename = format!("{}.{}", cmd, svc.pid); - if let Err(e) = insert_data(&mut builder, &filename, &contents) - { - error!( - log, - "failed to save zone bundle command output"; - "zone" => zone.name(), - "command" => ?args, - "error" => ?e, - ); - } - } - - // Copy any log files, current and rotated, into the tarball as - // well. - // - // Safety: This pathbuf was retrieved by locating an existing file - // on the filesystem, so we're sure it has a name and the unwrap is - // safe. 
- debug!( - log, - "appending current log file to zone bundle"; - "zone" => zone.name(), - "log_file" => %svc.log_file, - ); - if let Err(e) = builder.append_path_with_name( - &svc.log_file, - svc.log_file.file_name().unwrap(), - ) { - error!( - log, - "failed to append current log file to zone bundle"; - "zone" => zone.name(), - "log_file" => %svc.log_file, - "error" => ?e, - ); - return Err(e.into()); - } - for f in svc.rotated_log_files.iter() { - debug!( - log, - "appending rotated log file to zone bundle"; - "zone" => zone.name(), - "log_file" => %svc.log_file, - ); - if let Err(e) = - builder.append_path_with_name(f, f.file_name().unwrap()) - { - error!( - log, - "failed to append current log file to zone bundle"; - "zone" => zone.name(), - "log_file" => %svc.log_file, - "error" => ?e, - ); - return Err(e.into()); - } - } - } - - // Finish writing out the tarball itself. - builder - .into_inner() - .map_err(|e| anyhow::anyhow!("Failed to build bundle: {e}"))?; - - // Copy the bundle to the other locations. We really want the bundles to - // be duplicates, not an additional, new bundle. - for other_dir in zone_bundle_dirs[1..].iter() { - let to = other_dir.join(&filename); - debug!(log, "copying bundle"; "from" => %full_path, "to" => %to); - tokio::fs::copy(&full_path, to).await?; - } - - info!(log, "finished zone bundle"; "metadata" => ?zone_metadata); - Ok(zone_metadata) - } - /// Create a zone bundle for the provided zone. pub async fn create_zone_bundle( &self, name: &str, - ) -> Result { + ) -> Result { // Search for the named zone. if let SledLocalZone::Running { zone, .. } = &*self.inner.switch_zone.lock().await { if zone.name() == name { - return self - .create_zone_bundle_impl(zone) - .await - .map_err(Error::from); + let context = self + .inner + .storage + .zone_bundle_context(name, ZoneBundleCause::ExplicitRequest) + .await; + return crate::zone_bundle::create( + &self.inner.log, + zone, + &context, + ) + .await; } } if let Some(zone) = self.inner.zones.lock().await.get(name) { - return self - .create_zone_bundle_impl(zone) - .await - .map_err(Error::from); - } - Err(Error::from(BundleError::NoSuchZone { name: name.to_string() })) - } - - fn extract_zone_bundle_metadata( - path: &std::path::PathBuf, - ) -> Result { - // Build a reader for the whole archive. - let reader = std::fs::File::open(path).map_err(BundleError::from)?; - let buf_reader = std::io::BufReader::new(reader); - let gz = GzDecoder::new(buf_reader); - let mut archive = Archive::new(gz); - - // Find the metadata entry, if it exists. - let entries = archive.entries()?; - let Some(md_entry) = entries - // The `Archive::entries` iterator - // returns a result, so filter to those - // that are OK first. - .filter_map(Result::ok) - .find(|entry| { - entry - .path() - .map(|p| p.to_str() == Some(ZONE_BUNDLE_METADATA_FILENAME)) - .unwrap_or(false) - }) - else { - return Err(BundleError::from( - anyhow::anyhow!("Zone bundle is missing metadata file") - )); - }; - - // Extract its contents and parse as metadata. - let contents = std::io::read_to_string(md_entry)?; - toml::from_str(&contents).map_err(BundleError::from) - } - - /// List the bundles available for the zone of the provided name. - pub async fn list_zone_bundles( - &self, - name: &str, - ) -> Result, Error> { - let log = &self.inner.log; - - // The zone bundles are replicated in several places, so we'll use a set - // to collect them all, to avoid duplicating. 
- let mut bundles = BTreeSet::new(); - - for path in self.all_zone_bundle_directories().await { - info!(log, "searching zone bundle directory"; "directory" => ?path); - let zone_bundle_dir = path.join(name); - if zone_bundle_dir.is_dir() { - let mut dir = tokio::fs::read_dir(zone_bundle_dir) - .await - .map_err(BundleError::from)?; - while let Some(zone_bundle) = - dir.next_entry().await.map_err(BundleError::from)? - { - let bundle_path = zone_bundle.path(); - info!( - log, - "checking possible zone bundle"; - "bundle_path" => %bundle_path.display(), - ); - - // Zone bundles _should_ be named like: - // - // .../bundle/zone//.tar.gz. - // - // However, really a zone bundle is any tarball with the - // right metadata file, which contains a TOML-serialized - // `ZoneBundleMetadata` file. Try to create an archive out - // of each file we find in this directory, and parse out a - // metadata file. - let tarball = bundle_path.to_owned(); - let task = tokio::task::spawn_blocking(move || { - Self::extract_zone_bundle_metadata(&tarball) - }); - let metadata = match task.await { - Ok(Ok(md)) => md, - Ok(Err(e)) => { - error!( - log, - "failed to read zone bundle metadata"; - "error" => ?e, - ); - return Err(Error::from(e)); - } - Err(e) => { - error!( - log, - "failed to join zone bundle metadata read task"; - "error" => ?e, - ); - return Err(Error::from(BundleError::from(e))); - } - }; - info!(log, "found zone bundle"; "metadata" => ?metadata); - bundles.insert(metadata); - } - } - } - Ok(bundles.into_iter().collect()) - } - - /// Get the path to a zone bundle, if it exists. - pub async fn get_zone_bundle_path( - &self, - zone_name: &str, - id: &Uuid, - ) -> Result, Error> { - let log = &self.inner.log; - for path in self.all_zone_bundle_directories().await { - info!(log, "searching zone bundle directory"; "directory" => ?path); - let zone_bundle_dir = path.join(zone_name); - if zone_bundle_dir.is_dir() { - let mut dir = tokio::fs::read_dir(zone_bundle_dir) - .await - .map_err(BundleError::from)?; - while let Some(zone_bundle) = - dir.next_entry().await.map_err(BundleError::from)? - { - let path = zone_bundle.path(); - let task = tokio::task::spawn_blocking(move || { - Self::extract_zone_bundle_metadata(&path) - }); - let metadata = match task.await { - Ok(Ok(md)) => md, - Ok(Err(e)) => { - error!( - log, - "failed to read zone bundle metadata"; - "error" => ?e, - ); - return Err(Error::from(e)); - } - Err(e) => { - error!( - log, - "failed to join zone bundle metadata read task"; - "error" => ?e, - ); - return Err(Error::from(BundleError::from(e))); - } - }; - let bundle_id = &metadata.id; - if bundle_id.zone_name == zone_name - && bundle_id.bundle_id == *id - { - let path = Utf8PathBuf::try_from(zone_bundle.path()) - .map_err(|_| { - BundleError::from(anyhow::anyhow!( - "Non-UTF-8 path name: {}", - zone_bundle.path().display() - )) - })?; - return Ok(Some(path)); - } - } - } - } - Ok(None) - } - - /// List all zones that are currently managed. - pub async fn list_all_zones(&self) -> Result, Error> { - let mut zone_names = vec![]; - if let SledLocalZone::Running { zone, .. 
} = - &*self.inner.switch_zone.lock().await - { - zone_names.push(String::from(zone.name())) + let context = self + .inner + .storage + .zone_bundle_context(name, ZoneBundleCause::ExplicitRequest) + .await; + return crate::zone_bundle::create(&self.inner.log, zone, &context) + .await; } - zone_names.extend( - self.inner - .zones - .lock() - .await - .values() - .map(|zone| zone.name().to_string()), - ); - zone_names.sort(); - Ok(zone_names) + Err(BundleError::NoSuchZone { name: name.to_string() }) } /// Ensures that particular services should be initialized. @@ -2615,6 +2117,28 @@ impl ServiceManager { for zone in zones_to_be_removed { let expected_zone_name = zone.zone_name(); if let Some(mut zone) = existing_zones.remove(&expected_zone_name) { + debug!( + log, + "removing an existing zone"; + "zone_name" => &expected_zone_name, + ); + let context = self + .inner + .storage + .zone_bundle_context( + &expected_zone_name, + ZoneBundleCause::UnexpectedZone, + ) + .await; + if let Err(e) = zone_bundle::create(log, &zone, &context).await + { + error!( + log, + "Failed to take bundle of unexpected zone"; + "zone_name" => &expected_zone_name, + "reason" => ?e, + ); + } if let Err(e) = zone.stop().await { error!(log, "Failed to stop zone {}: {e}", zone.name()); } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 4adef62700..ec8df1259f 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -20,11 +20,15 @@ use crate::params::{ use crate::services::{self, ServiceManager}; use crate::storage_manager::{self, StorageManager}; use crate::updates::{ConfigUpdates, UpdateManager}; +use crate::zone_bundle; +use crate::zone_bundle::BundleError; use bootstore::schemes::v0 as bootstore; use camino::Utf8PathBuf; use dropshot::HttpError; use illumos_utils::opte::params::SetVirtualNetworkInterfaceHost; use illumos_utils::opte::PortManager; +use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; +use illumos_utils::zone::ZONE_PREFIX; use omicron_common::address::{ get_sled_address, get_switch_zone_address, Ipv6Subnet, SLED_PREFIX, }; @@ -41,6 +45,7 @@ use omicron_common::backoff::{ use sled_hardware::underlay; use sled_hardware::HardwareManager; use slog::Logger; +use std::collections::BTreeSet; use std::net::{Ipv6Addr, SocketAddrV6}; use std::sync::Arc; use uuid::Uuid; @@ -108,6 +113,9 @@ pub enum Error { #[error("Failed to deserialize early network config: {0}")] EarlyNetworkDeserialize(serde_json::Error), + + #[error("Zone bundle error: {0}")] + ZoneBundle(#[from] BundleError), } impl From for omicron_common::api::external::Error { @@ -159,13 +167,11 @@ impl From for dropshot::HttpError { e => HttpError::for_internal_error(e.to_string()), } } - crate::sled_agent::Error::Services( - crate::services::Error::Bundle(ref inner), - ) => match inner { - crate::services::BundleError::NoStorage => { + crate::sled_agent::Error::ZoneBundle(ref inner) => match inner { + BundleError::NoStorage | BundleError::Unavailable { .. } => { HttpError::for_unavail(None, inner.to_string()) } - crate::services::BundleError::NoSuchZone { .. } => { + BundleError::NoSuchZone { .. } => { HttpError::for_not_found(None, inner.to_string()) } _ => HttpError::for_internal_error(err.to_string()), @@ -182,6 +188,9 @@ struct SledAgentInner { // ID of the Sled id: Uuid, + // Logger used for generic sled agent operations, e.g., zone bundles. + log: Logger, + // Subnet of the Sled's underlay. // // The Sled Agent's address can be derived from this value. 
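The `From<Error> for HttpError` conversion in this hunk is what turns bundle failures into meaningful status codes: missing storage or a zone that cannot currently be bundled becomes 503, an unknown zone or bundle becomes 404, and everything else falls through to 500. Below is a minimal, self-contained sketch of that mapping, using stand-in types rather than the real dropshot `HttpError` (the `thiserror` derive mirrors how `BundleError` is declared later in this diff; the variants and messages are copied from it, but treat this as illustrative):

```rust
use thiserror::Error;

/// Stand-in for the sled agent's `BundleError`, reduced to the variants
/// the HTTP mapping in this diff distinguishes.
#[derive(Debug, Error)]
enum BundleError {
    #[error("No storage available for bundles")]
    NoStorage,
    #[error("Zone '{name}' cannot currently be bundled")]
    Unavailable { name: String },
    #[error("No zone named '{name}' is available for bundling")]
    NoSuchZone { name: String },
    #[error("Failed to create bundle: {0}")]
    BundleFailed(String),
}

/// Mirrors the diff's match arms: choose an HTTP status per variant.
fn status_for(err: &BundleError) -> u16 {
    match err {
        // Transient conditions: the client may usefully retry later.
        BundleError::NoStorage | BundleError::Unavailable { .. } => 503,
        // The named zone or bundle simply does not exist.
        BundleError::NoSuchZone { .. } => 404,
        // Anything else is an internal error.
        _ => 500,
    }
}

fn main() {
    // Hypothetical zone name, purely for illustration.
    let err = BundleError::NoSuchZone { name: "oxz_example".to_string() };
    assert_eq!(status_for(&err), 404);
    println!("{err} => HTTP {}", status_for(&err));
}
```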
@@ -199,7 +208,7 @@ struct SledAgentInner { // Component of Sled Agent responsible for managing updates. updates: UpdateManager, - /// Component of Sled Agent responsible for managing OPTE ports. + // Component of Sled Agent responsible for managing OPTE ports. port_manager: PortManager, // Other Oxide-controlled services running on this Sled. @@ -395,6 +404,7 @@ impl SledAgent { let sled_agent = SledAgent { inner: Arc::new(SledAgentInner { id: request.id, + log: log.clone(), subnet: request.subnet, storage, instances, @@ -661,12 +671,104 @@ impl SledAgent { }); } + /// List all zone bundles on the system, for any zones, live or dead. + pub async fn list_all_zone_bundles( + &self, + ) -> Result<Vec<ZoneBundleMetadata>, Error> { + let mut bundles = BTreeSet::new(); + let log = &self.inner.log; + for path in + self.inner.storage.resources().all_zone_bundle_directories().await + { + debug!(log, "searching zone bundle directory"; "directory" => ?path); + // It's possible that the debug directories themselves do not exist, + // since we create them when we create the first bundles. Return an + // empty set in this case. + let mut entries = match tokio::fs::read_dir(path).await { + Ok(ent) => ent, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => continue, + Err(e) => { + return Err(Error::from(BundleError::from( + anyhow::anyhow!("failed to read bundle directory: {e}"), + ))); + } + }; + + // First iterate over all the possible zone-names here. + loop { + let Some(zone_name) = entries + .next_entry() + .await + .map_err(|e| BundleError::from(anyhow::anyhow!("failed to read zone bundle dir entry: {e}")))? else { + break; + }; + + // Enumerate and iterate over all the contained entries, which + // _should_ all be zone bundles. + let mut bundle_entries = tokio::fs::read_dir(zone_name.path()) + .await + .map_err(|e| { + BundleError::from(anyhow::anyhow!( + "failed to read zone directory: {e}" + )) + })?; + loop { + let Some(bundle) = bundle_entries + .next_entry() + .await + .map_err(|e| BundleError::from(anyhow::anyhow!("failed to read zone bundle dir entry: {e}")))? else { + break; + }; + match zone_bundle::extract_zone_bundle_metadata( + bundle.path(), + ) + .await + { + Ok(metadata) => { + debug!( + log, + "found zone bundle"; + "zone_name" => &metadata.id.zone_name, + "id" => %&metadata.id.bundle_id, + "path" => ?bundle.path(), + ); + bundles.insert(metadata); + } + Err(e) => warn!( + log, + "found file in zone bundle directory which doesn't \ appear to be a valid zone bundle"; + "path" => ?bundle.path(), + "err" => ?e, + ), + } + } + } + } + Ok(bundles.into_iter().collect()) + } + /// List zone bundles for the provided zone. pub async fn list_zone_bundles( &self, name: &str, ) -> Result<Vec<ZoneBundleMetadata>, Error> { - self.inner.services.list_zone_bundles(name).await.map_err(Error::from) + // The zone bundles are replicated in several places, so we'll use a set + // to collect them all, to avoid duplicating. + let mut bundles = BTreeSet::new(); + let log = &self.inner.log; + for path in + self.inner.storage.resources().all_zone_bundle_directories().await + { + debug!(log, "searching zone bundle directory"; "directory" => ?path); + bundles.extend( + zone_bundle::list_bundles_for_zone(log, &path, name) + .await? + .into_iter() + .map(|(_path, bdl)| bdl), + ); + } + Ok(bundles.into_iter().collect()) } /// Create a zone bundle for the provided zone.
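Both listing methods above collect into a `BTreeSet` rather than a `Vec` because each bundle is deliberately copied to every M.2 debug dataset: the set collapses those mirrored duplicates and yields a deterministic, sorted result (the metadata type derives `Ord`, per the `params.rs` hunk earlier). A toy illustration of why that works, using a simplified stand-in for `ZoneBundleMetadata`:

```rust
use std::collections::BTreeSet;

// Simplified stand-in for `ZoneBundleMetadata`; the real type derives
// `Ord` (among others), which is what makes `BTreeSet` usable here.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Metadata {
    zone_name: String,
    bundle_id: u32, // a `Uuid` in the real code
}

fn main() {
    // The same two bundles discovered in two mirrored storage directories.
    let from_disk_a = vec![
        Metadata { zone_name: "oxz_switch".into(), bundle_id: 1 },
        Metadata { zone_name: "oxz_switch".into(), bundle_id: 2 },
    ];
    let from_disk_b = from_disk_a.clone();

    let mut bundles = BTreeSet::new();
    bundles.extend(from_disk_a);
    bundles.extend(from_disk_b);

    // Duplicates collapse; iteration order is sorted and deterministic.
    assert_eq!(bundles.len(), 2);
    let unique: Vec<_> = bundles.into_iter().collect();
    println!("{unique:?}");
}
```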
@@ -674,7 +776,21 @@ impl SledAgent { &self, name: &str, ) -> Result<ZoneBundleMetadata, Error> { - self.inner.services.create_zone_bundle(name).await.map_err(Error::from) + if name.starts_with(PROPOLIS_ZONE_PREFIX) { + self.inner + .instances + .create_zone_bundle(name) + .await + .map_err(Error::from) + } else if name.starts_with(ZONE_PREFIX) { + self.inner + .services + .create_zone_bundle(name) + .await + .map_err(Error::from) + } else { + Err(Error::from(BundleError::NoSuchZone { name: name.to_string() })) + } } /// Fetch the path to a zone bundle. @@ -683,16 +799,33 @@ impl SledAgent { name: &str, id: &Uuid, ) -> Result<Option<Utf8PathBuf>, Error> { - self.inner - .services - .get_zone_bundle_path(name, id) - .await - .map_err(Error::from) + zone_bundle::get_zone_bundle_path( + &self.inner.log, + &self.inner.storage.resources().all_zone_bundle_directories().await, + name, + id, + ) + .await + .map_err(Error::from) } /// List the zones that the sled agent is currently managing. pub async fn zones_list(&self) -> Result<Vec<String>, Error> { - self.inner.services.list_all_zones().await.map_err(Error::from) + Zones::get() + .await + .map(|zones| { + zones + .into_iter() + .filter_map(|zone| { + if matches!(zone.state(), zone::State::Running) { + Some(String::from(zone.name())) + } else { + None + } + }) + .collect() + }) + .map_err(|e| Error::from(BundleError::from(e))) } /// Ensures that particular services should be initialized. diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index 97c02f2d0f..7153bd33df 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -5,8 +5,10 @@ //! Management of sled-local storage. use crate::nexus::NexusClientWithResolver; +use crate::params::ZoneBundleCause; use crate::storage::dataset::DatasetName; use crate::storage::dump_setup::DumpSetup; +use crate::zone_bundle::ZoneBundleContext; use camino::Utf8PathBuf; use derive_more::From; use futures::stream::FuturesOrdered; @@ -233,6 +235,12 @@ pub struct StorageResources { pools: Arc<Mutex<HashMap<Uuid, Pool>>>, } +// The directory within the debug dataset in which bundles are created. +const BUNDLE_DIRECTORY: &str = "bundle"; + +// The directory for zone bundles. +const ZONE_BUNDLE_DIRECTORY: &str = "zone"; + impl StorageResources { /// Creates a fabricated view of storage resources. /// @@ -334,6 +342,41 @@ impl StorageResources { }) .collect() } + + /// Return the directories for storing zone service bundles. + pub async fn all_zone_bundle_directories(&self) -> Vec<Utf8PathBuf> { + self.all_m2_mountpoints(sled_hardware::disk::M2_DEBUG_DATASET) + .await + .into_iter() + .map(|p| p.join(BUNDLE_DIRECTORY).join(ZONE_BUNDLE_DIRECTORY)) + .collect() + } + + /// Return context for storing zone bundles. + /// + /// See [`ZoneBundleContext`] for details. + pub async fn zone_bundle_context( + &self, + zone_name: &str, + cause: ZoneBundleCause, + ) -> ZoneBundleContext { + // As of #3713, rotated log files are moved out of their original home, + // and onto longer-term storage on some U.2s. Which one houses them is + // effectively random. Add the U.2 debug datasets into the + // `extra_log_dirs` field for search during the zone bundle process. + let extra_log_dirs = self + .all_u2_mountpoints(sled_hardware::disk::U2_DEBUG_DATASET) + .await + .into_iter() + .map(|p| p.join(zone_name)) + .collect(); + ZoneBundleContext { + cause, + storage_dirs: self.all_zone_bundle_directories().await, + extra_log_dirs, + ..Default::default() + } + } } /// Describes the access to the underlay used by the StorageManager.
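The new `create_zone_bundle` body above routes each request by zone-name prefix: Propolis (instance) zones go to the instance manager, any other Oxide-managed zone to the service manager, and anything else is `NoSuchZone`. Because a Propolis zone's name also begins with the general Oxide zone prefix, the more specific test must come first. A sketch of that ordering follows; the prefix values are assumptions mirroring what `illumos_utils::zone` appears to export, not verified constants:

```rust
// Assumed values, for illustration only.
const ZONE_PREFIX: &str = "oxz_";
const PROPOLIS_ZONE_PREFIX: &str = "oxz_propolis-server_";

#[derive(Debug, PartialEq)]
enum Target {
    InstanceManager,
    ServiceManager,
    NoSuchZone,
}

// Mirrors the routing in `SledAgent::create_zone_bundle`: the Propolis
// check must run first, since those names also start with `oxz_`.
fn route(zone_name: &str) -> Target {
    if zone_name.starts_with(PROPOLIS_ZONE_PREFIX) {
        Target::InstanceManager
    } else if zone_name.starts_with(ZONE_PREFIX) {
        Target::ServiceManager
    } else {
        Target::NoSuchZone
    }
}

fn main() {
    assert_eq!(
        route("oxz_propolis-server_some-uuid"),
        Target::InstanceManager
    );
    assert_eq!(route("oxz_ntp"), Target::ServiceManager);
    assert_eq!(route("global"), Target::NoSuchZone);
}
```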
diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs new file mode 100644 index 0000000000..f26ecd4c76 --- /dev/null +++ b/sled-agent/src/zone_bundle.rs @@ -0,0 +1,608 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2023 Oxide Computer Company + +//! Tools for collecting and inspecting service bundles for zones. + +use crate::params::ZoneBundleCause; +use crate::params::ZoneBundleMetadata; +use camino::FromPathBufError; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use flate2::bufread::GzDecoder; +use illumos_utils::running_zone::is_oxide_smf_log_file; +use illumos_utils::running_zone::RunningZone; +use illumos_utils::zone::AdmError; +use slog::Logger; +use std::io::Cursor; +use std::path::PathBuf; +use std::time::SystemTime; +use tar::Archive; +use tar::Builder; +use tar::Header; +use uuid::Uuid; + +/// Context for creating a bundle of a specified zone. +#[derive(Debug, Default)] +pub struct ZoneBundleContext { + /// The directories into which the zone bundles are written. + pub storage_dirs: Vec<Utf8PathBuf>, + /// The reason or cause for creating a zone bundle. + pub cause: ZoneBundleCause, + /// Extra directories searched for log files for the named zone. + /// + /// Logs are periodically archived out of their original location, and onto + /// one or more U.2 drives. This field is used to specify that archive + /// location, so that rotated logs for the zone's services may be found. + pub extra_log_dirs: Vec<Utf8PathBuf>, + /// Any zone-specific commands that will be part of the zone bundle. + /// + /// These should be specified as a list of strings, as passed into + /// `RunningZone::run_cmd()`. + pub zone_specific_commands: Vec<Vec<String>>, +} + +// The set of zone-wide commands, which don't require any details about the +// processes we've launched in the zone. +const ZONE_WIDE_COMMANDS: [&[&str]; 6] = [ + &["ptree"], + &["uptime"], + &["last"], + &["who"], + &["svcs", "-p"], + &["netstat", "-an"], +]; + +// The name for zone bundle metadata files. +const ZONE_BUNDLE_METADATA_FILENAME: &str = "metadata.toml"; + +/// Errors related to managing service zone bundles. +#[derive(Debug, thiserror::Error)] +pub enum BundleError { + #[error("I/O error")] + Io(#[from] std::io::Error), + + #[error("TOML serialization failure")] + Serialization(#[from] toml::ser::Error), + + #[error("TOML deserialization failure")] + Deserialization(#[from] toml::de::Error), + + #[error("No zone named '{name}' is available for bundling")] + NoSuchZone { name: String }, + + #[error("No storage available for bundles")] + NoStorage, + + #[error("Failed to join zone bundling task")] + Task(#[from] tokio::task::JoinError), + + #[error("Failed to create bundle: {0}")] + BundleFailed(#[from] anyhow::Error), + + #[error("Zone error")] + Zone(#[from] AdmError), + + #[error(transparent)] + PathBuf(#[from] FromPathBufError), + + #[error("Zone '{name}' cannot currently be bundled")] + Unavailable { name: String }, +} + +/// Create a service bundle for the provided zone. +/// +/// This runs a series of debugging commands in the zone, to collect data about +/// the state of the zone and any Oxide service processes running inside. The +/// data is packaged into a tarball, and placed in the provided output +/// directories.
+pub async fn create( + log: &Logger, + zone: &RunningZone, + context: &ZoneBundleContext, +) -> Result<ZoneBundleMetadata, BundleError> { + // Fetch the directory into which we'll store data, and ensure it exists. + if context.storage_dirs.is_empty() { + warn!(log, "no directories available for zone bundles"); + return Err(BundleError::NoStorage); + } + info!(log, "creating zone bundle"; "zone" => zone.name()); + let mut zone_bundle_dirs = Vec::with_capacity(context.storage_dirs.len()); + for dir in context.storage_dirs.iter() { + let bundle_dir = dir.join(zone.name()); + debug!(log, "creating bundle directory"; "dir" => %bundle_dir); + tokio::fs::create_dir_all(&bundle_dir).await?; + zone_bundle_dirs.push(bundle_dir); + } + + // Create metadata and the tarball writer. + // + // We'll write the contents of the bundle into a gzipped tar archive, + // including metadata and a file for the output of each command we run in + // the zone. + let zone_metadata = ZoneBundleMetadata::new(zone.name(), context.cause); + let filename = format!("{}.tar.gz", zone_metadata.id.bundle_id); + let full_path = zone_bundle_dirs[0].join(&filename); + let file = match tokio::fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&full_path) + .await + { + Ok(f) => f.into_std().await, + Err(e) => { + error!( + log, + "failed to create bundle file"; + "zone" => zone.name(), + "file" => %full_path, + "error" => ?e, + ); + return Err(BundleError::from(e)); + } + }; + debug!( + log, + "created bundle tarball file"; + "zone" => zone.name(), + "path" => %full_path + ); + let gz = flate2::GzBuilder::new() + .filename(filename.as_str()) + .write(file, flate2::Compression::best()); + let mut builder = Builder::new(gz); + + // Helper function to write an array of bytes into the tar archive, with + // the provided name. + fn insert_data<W: std::io::Write>( + builder: &mut Builder<W>, + name: &str, + contents: &[u8], + ) -> Result<(), BundleError> { + let mtime = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| anyhow::anyhow!("failed to compute mtime: {e}"))? + .as_secs(); + + let mut hdr = Header::new_ustar(); + hdr.set_size(contents.len().try_into().unwrap()); + hdr.set_mode(0o444); + hdr.set_mtime(mtime); + hdr.set_entry_type(tar::EntryType::Regular); + // NOTE: This internally sets the path and checksum. + builder + .append_data(&mut hdr, name, Cursor::new(contents)) + .map_err(BundleError::from) + } + + // Write the metadata file itself, in TOML format. + let contents = toml::to_string(&zone_metadata)?; + insert_data( + &mut builder, + ZONE_BUNDLE_METADATA_FILENAME, + contents.as_bytes(), + )?; + debug!( + log, + "wrote zone bundle metadata"; + "zone" => zone.name(), + ); + for cmd in ZONE_WIDE_COMMANDS { + debug!( + log, + "running zone bundle command"; + "zone" => zone.name(), + "command" => ?cmd, + ); + let output = match zone.run_cmd(cmd) { + Ok(s) => s, + Err(e) => format!("{}", e), + }; + let contents = format!("Command: {:?}\n{}", cmd, output).into_bytes(); + if let Err(e) = insert_data(&mut builder, cmd[0], &contents) { + error!( + log, + "failed to save zone bundle command output"; + "zone" => zone.name(), + "command" => ?cmd, + "error" => ?e, + ); + } + } + + // Run any caller-requested zone-specific commands.
+ for (i, cmd) in context.zone_specific_commands.iter().enumerate() { + if cmd.is_empty() { + continue; + } + debug!( + log, + "running user-requested zone bundle command"; + "zone" => zone.name(), + "command" => ?cmd, + ); + let output = match zone.run_cmd(cmd) { + Ok(s) => s, + Err(e) => format!("{}", e), + }; + let contents = format!("Command: {:?}\n{}", cmd, output).into_bytes(); + + // We'll insert the index into the filename as well, since it's + // plausible that users will run multiple executions of the same + // command. + let filename = format!("zone-specific-{}-{}", i, &cmd[0]); + if let Err(e) = insert_data(&mut builder, &filename, &contents) { + error!( + log, + "failed to save zone bundle command output"; + "zone" => zone.name(), + "command" => ?cmd, + "error" => ?e, + ); + } + } + + // Debugging commands run on the specific processes this zone defines. + const ZONE_PROCESS_COMMANDS: [&str; 3] = [ + "pfiles", "pstack", + "pargs", + // TODO-completeness: We may want `gcore`, since that encompasses + // the above commands and much more. It seems like overkill now, + // however. + ]; + let procs = match zone.service_processes() { + Ok(p) => { + debug!( + log, + "enumerated service processes"; + "zone" => zone.name(), + "procs" => ?p, + ); + p + } + Err(e) => { + error!( + log, + "failed to enumerate zone service processes"; + "zone" => zone.name(), + "error" => ?e, + ); + let err = anyhow::anyhow!( + "failed to enumerate zone service processes: {e}" + ); + return Err(BundleError::from(err)); + } + }; + for svc in procs.into_iter() { + let pid_s = svc.pid.to_string(); + for cmd in ZONE_PROCESS_COMMANDS { + let args = &[cmd, &pid_s]; + debug!( + log, + "running zone bundle command"; + "zone" => zone.name(), + "command" => ?args, + ); + let output = match zone.run_cmd(args) { + Ok(s) => s, + Err(e) => format!("{}", e), + }; + let contents = + format!("Command: {:?}\n{}", args, output).into_bytes(); + + // There may be multiple Oxide service processes for which we + // want to capture the command output. Name each output after + // the command and PID to disambiguate. + let filename = format!("{}.{}", cmd, svc.pid); + if let Err(e) = insert_data(&mut builder, &filename, &contents) { + error!( + log, + "failed to save zone bundle command output"; + "zone" => zone.name(), + "command" => ?args, + "error" => ?e, + ); + } + } + + // We may need to extract log files that have been archived out of the + // zone filesystem itself. See `crate::dump_setup` for the logic which + // does this. + let archived_log_files = find_archived_log_files( + log, + zone.name(), + &svc.service_name, + &context.extra_log_dirs, + ) + .await; + + // Copy any log files, current and rotated, into the tarball as + // well. + // + // Safety: This pathbuf was retrieved by locating an existing file + // on the filesystem, so we're sure it has a name and the unwrap is + // safe. 
+
+    // Finish writing out the tarball itself.
+    builder
+        .into_inner()
+        .map_err(|e| anyhow::anyhow!("Failed to build bundle: {e}"))?;
+
+    // Copy the bundle to the other locations. We really want the bundles to
+    // be duplicates, not an additional, new bundle.
+    for other_dir in zone_bundle_dirs[1..].iter() {
+        let to = other_dir.join(&filename);
+        debug!(log, "copying bundle"; "from" => %full_path, "to" => %to);
+        tokio::fs::copy(&full_path, to).await?;
+    }
+
+    info!(log, "finished zone bundle"; "metadata" => ?zone_metadata);
+    Ok(zone_metadata)
+}
+
+// Find log files for the specified zone / SMF service, which may have been
+// archived out to a U.2 dataset.
+//
+// Note that errors are logged, rather than failing the whole function, so that
+// one failed listing does not prevent collecting any other log files.
+async fn find_archived_log_files(
+    log: &Logger,
+    zone_name: &str,
+    svc_name: &str,
+    dirs: &[Utf8PathBuf],
+) -> Vec<Utf8PathBuf> {
+    // The `dirs` should be things like
+    // `/pool/ext/<ZPOOL_UUID>/crypt/debug/<ZONE_NAME>`, but it's really up to
+    // the caller to verify these exist and possibly contain what they expect.
+    //
+    // Within that, we'll just look for things that appear to be Oxide-managed
+    // SMF service log files.
+    let mut files = Vec::new();
+    for dir in dirs.iter() {
+        if dir.exists() {
+            let mut rd = match tokio::fs::read_dir(&dir).await {
+                Ok(rd) => rd,
+                Err(e) => {
+                    error!(
+                        log,
+                        "failed to read zone debug directory";
+                        "directory" => ?dir,
+                        "reason" => ?e,
+                    );
+                    continue;
+                }
+            };
+            loop {
+                match rd.next_entry().await {
+                    Ok(None) => break,
+                    Ok(Some(entry)) => {
+                        let Ok(path) = Utf8PathBuf::try_from(entry.path()) else {
+                            error!(
+                                log,
+                                "skipping possible archived log file with \
+                                non-UTF-8 path";
+                                "path" => ?entry.path(),
+                            );
+                            continue;
+                        };
+                        let fname = path.file_name().unwrap();
+                        if is_oxide_smf_log_file(fname)
+                            && fname.contains(svc_name)
+                        {
+                            debug!(
+                                log,
+                                "found archived log file";
+                                "zone_name" => zone_name,
+                                "service_name" => svc_name,
+                                "path" => ?path,
+                            );
+                            files.push(path);
+                        }
+                    }
+                    Err(e) => {
+                        error!(
+                            log,
+                            "failed to fetch zone debug directory entry";
+                            "directory" => ?dir,
+                            "reason" => ?e,
+                        );
+                    }
+                }
+            }
+        } else {
+            // The logic in `dump_setup` picks some U.2 in which to start
+            // archiving logs, and thereafter tries to keep placing new ones
+            // there, subject to space constraints. It's not really an error for
+            // there to be no entries for the named zone in any particular U.2
+            // debug dataset.
+            slog::trace!(
+                log,
+                "attempting to find archived log files in \
+                non-existent directory";
+                "directory" => ?dir,
+            );
+        }
+    }
+    files
+}
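`find_archived_log_files` works in `camino` UTF-8 paths, while `tokio::fs` hands back `std::path::PathBuf`s; the `let ... else` conversion is what lets a single odd filename be skipped instead of aborting the whole scan. A small sketch of that conversion (the path is made up for illustration):

```rust
use std::path::PathBuf;

use camino::Utf8PathBuf;

fn main() {
    // Directory entries arrive as std::path::PathBuf; the bundler converts
    // each to a Utf8PathBuf, logging and skipping any non-UTF-8 name.
    let entry: PathBuf =
        PathBuf::from("/pool/ext/example-pool/crypt/debug/oxz_example");
    let Ok(path) = Utf8PathBuf::try_from(entry) else {
        eprintln!("skipping non-UTF-8 path");
        return;
    };
    // file_name() is Some for any path that names a real directory entry.
    println!("file name: {:?}", path.file_name());
}
```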
+
+// Extract the zone bundle metadata from a file, if it exists.
+fn extract_zone_bundle_metadata_impl(
+    path: &std::path::PathBuf,
+) -> Result<ZoneBundleMetadata, BundleError> {
+    // Build a reader for the whole archive.
+    let reader = std::fs::File::open(path).map_err(BundleError::from)?;
+    let buf_reader = std::io::BufReader::new(reader);
+    let gz = GzDecoder::new(buf_reader);
+    let mut archive = Archive::new(gz);
+
+    // Find the metadata entry, if it exists.
+    let entries = archive.entries()?;
+    let Some(md_entry) = entries
+        // The `Archive::entries` iterator
+        // returns a result, so filter to those
+        // that are OK first.
+        .filter_map(Result::ok)
+        .find(|entry| {
+            entry
+                .path()
+                .map(|p| p.to_str() == Some(ZONE_BUNDLE_METADATA_FILENAME))
+                .unwrap_or(false)
+        })
+    else {
+        return Err(BundleError::from(
+            anyhow::anyhow!("Zone bundle is missing metadata file")
+        ));
+    };
+
+    // Extract its contents and parse as metadata.
+    let contents = std::io::read_to_string(md_entry)?;
+    toml::from_str(&contents).map_err(BundleError::from)
+}
+
+/// List the extant zone bundles for the provided zone, in the provided
+/// directory.
+pub async fn list_bundles_for_zone(
+    log: &Logger,
+    path: &Utf8Path,
+    zone_name: &str,
+) -> Result<Vec<(Utf8PathBuf, ZoneBundleMetadata)>, BundleError> {
+    let mut bundles = Vec::new();
+    let zone_bundle_dir = path.join(zone_name);
+    if zone_bundle_dir.is_dir() {
+        let mut dir = tokio::fs::read_dir(zone_bundle_dir)
+            .await
+            .map_err(BundleError::from)?;
+        while let Some(zone_bundle) =
+            dir.next_entry().await.map_err(BundleError::from)?
+        {
+            let bundle_path = zone_bundle.path();
+            debug!(
+                log,
+                "checking possible zone bundle";
+                "bundle_path" => %bundle_path.display(),
+            );
+
+            // Zone bundles _should_ be named like:
+            //
+            // .../bundle/zone/<ZONE_NAME>/<BUNDLE_ID>.tar.gz.
+            //
+            // However, really a zone bundle is any tarball with the
+            // right metadata file, which contains a TOML-serialized
+            // `ZoneBundleMetadata` document. Try to create an archive out
+            // of each file we find in this directory, and parse out a
+            // metadata file.
+            let tarball = bundle_path.to_owned();
+            let metadata = match extract_zone_bundle_metadata(tarball).await {
+                Ok(md) => md,
+                Err(e) => {
+                    error!(
+                        log,
+                        "failed to read zone bundle metadata";
+                        "error" => ?e,
+                    );
+                    return Err(e);
+                }
+            };
+            debug!(log, "found zone bundle"; "metadata" => ?metadata);
+            bundles.push((Utf8PathBuf::try_from(bundle_path)?, metadata));
+        }
+    }
+    Ok(bundles)
+}
+
+/// Extract zone bundle metadata from the provided file, if possible.
+pub async fn extract_zone_bundle_metadata(
+    path: PathBuf,
+) -> Result<ZoneBundleMetadata, BundleError> {
+    let task = tokio::task::spawn_blocking(move || {
+        extract_zone_bundle_metadata_impl(&path)
+    });
+    task.await?
+}
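Reading metadata back out is the mirror image of writing it: walk the gzipped archive's entries until one matches the metadata filename, then read its contents. A trimmed-down, synchronous sketch (the `metadata.toml` entry name is an assumption standing in for `ZONE_BUNDLE_METADATA_FILENAME`, and real code deserializes the TOML into `ZoneBundleMetadata` rather than returning the raw text):

```rust
use std::io::BufReader;

use flate2::read::GzDecoder;
use tar::Archive;

// Scan a bundle for its metadata entry and return the raw TOML text, if any.
fn read_metadata_toml(path: &str) -> std::io::Result<Option<String>> {
    const METADATA_FILENAME: &str = "metadata.toml"; // assumed entry name
    let file = std::fs::File::open(path)?;
    let mut archive = Archive::new(GzDecoder::new(BufReader::new(file)));
    for entry in archive.entries()?.filter_map(Result::ok) {
        let found = entry
            .path()
            .map(|p| p.to_str() == Some(METADATA_FILENAME))
            .unwrap_or(false);
        if found {
            // Entries implement Read, so the contents stream straight out.
            return std::io::read_to_string(entry).map(Some);
        }
    }
    Ok(None)
}

fn main() {
    match read_metadata_toml("demo.tar.gz") {
        Ok(Some(toml)) => println!("{toml}"),
        Ok(None) => println!("no metadata entry found"),
        Err(e) => eprintln!("failed to read bundle: {e}"),
    }
}
```

The `spawn_blocking` wrapper above exists because this scan is synchronous file I/O; running it on a blocking thread keeps it off the tokio executor's core threads.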
+
+/// Get the path to a zone bundle, if it exists.
+pub async fn get_zone_bundle_path(
+    log: &Logger,
+    directories: &[Utf8PathBuf],
+    zone_name: &str,
+    id: &Uuid,
+) -> Result<Option<Utf8PathBuf>, BundleError> {
+    for path in directories {
+        debug!(log, "searching zone bundle directory"; "directory" => ?path);
+        let zone_bundle_dir = path.join(zone_name);
+        if zone_bundle_dir.is_dir() {
+            let mut dir = tokio::fs::read_dir(zone_bundle_dir)
+                .await
+                .map_err(BundleError::from)?;
+            while let Some(zone_bundle) =
+                dir.next_entry().await.map_err(BundleError::from)?
+            {
+                let path = zone_bundle.path();
+                let metadata = match extract_zone_bundle_metadata(path).await {
+                    Ok(md) => md,
+                    Err(e) => {
+                        error!(
+                            log,
+                            "failed to read zone bundle metadata";
+                            "error" => ?e,
+                        );
+                        return Err(e);
+                    }
+                };
+                let bundle_id = &metadata.id;
+                if bundle_id.zone_name == zone_name
+                    && bundle_id.bundle_id == *id
+                {
+                    return Utf8PathBuf::try_from(zone_bundle.path())
+                        .map(Some)
+                        .map_err(BundleError::from);
+                }
+            }
+        }
+    }
+    Ok(None)
+}
diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs
index 5ff555cd84..aec99ae3f8 100644
--- a/sled-hardware/src/disk.rs
+++ b/sled-hardware/src/disk.rs
@@ -255,7 +255,7 @@ pub const INSTALL_DATASET: &'static str = "install";
 pub const CRASH_DATASET: &'static str = "crash";
 pub const CLUSTER_DATASET: &'static str = "cluster";
 pub const CONFIG_DATASET: &'static str = "config";
-pub const DEBUG_DATASET: &'static str = "debug";
+pub const M2_DEBUG_DATASET: &'static str = "debug";
 // TODO-correctness: This value of 100GiB is a pretty wild guess, and should be
 // tuned as needed.
 pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30);
@@ -267,6 +267,7 @@ pub const DUMP_DATASET_COMPRESSION: &'static str = "gzip-9";
 // U.2 datasets live under the encrypted dataset and inherit encryption
 pub const ZONE_DATASET: &'static str = "crypt/zone";
 pub const DUMP_DATASET: &'static str = "crypt/debug";
+pub const U2_DEBUG_DATASET: &'static str = "crypt/debug";
 // This is the root dataset for all U.2 drives. Encryption is inherited.
 pub const CRYPT_DATASET: &'static str = "crypt";
@@ -301,7 +302,7 @@ static M2_EXPECTED_DATASETS: [ExpectedDataset; M2_EXPECTED_DATASET_COUNT] = [
     // Should be duplicated to both M.2s.
     ExpectedDataset::new(CONFIG_DATASET),
     // Store debugging data, such as service bundles.
-    ExpectedDataset::new(DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA),
+    ExpectedDataset::new(M2_DEBUG_DATASET).quota(DEBUG_DATASET_QUOTA),
 ];
 
 impl Disk {
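With the rename, `debug` (the unencrypted M.2 dataset) and `crypt/debug` (the U.2 dataset, which inherits encryption from `crypt`) can coexist as distinct names. A sketch of how full ZFS dataset names might compose from these constants (the pool names are hypothetical; real ones come from the sled's storage manager):

```rust
const M2_DEBUG_DATASET: &str = "debug";
const U2_DEBUG_DATASET: &str = "crypt/debug";

fn main() {
    // ZFS dataset names are "<pool>/<dataset>"; nesting the U.2 dataset
    // under "crypt" is what lets it inherit that dataset's encryption.
    let m2 = format!("{}/{}", "m2_example_pool", M2_DEBUG_DATASET);
    let u2 = format!("{}/{}", "u2_example_pool", U2_DEBUG_DATASET);
    println!("{m2}");
    println!("{u2}");
}
```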
diff --git a/tools/create_virtual_hardware.sh b/tools/create_virtual_hardware.sh
index 908cd63dc1..9125013b1f 100755
--- a/tools/create_virtual_hardware.sh
+++ b/tools/create_virtual_hardware.sh
@@ -50,6 +50,11 @@ function ensure_simulated_links {
 
 function ensure_softnpu_zone {
     zoneadm list | grep -q sidecar_softnpu || {
+        if ! [[ -f "out/npuzone/npuzone" ]]; then
+            echo "npuzone binary is not installed"
+            echo "please re-run ./tools/install_prerequisites.sh"
+            exit 1
+        fi
         out/npuzone/npuzone create sidecar \
             --omicron-zone \
             --ports sc0_0,tfportrear0_0 \
@@ -59,7 +64,26 @@ function ensure_softnpu_zone {
     success "softnpu zone exists"
 }
 
+function warn_if_physical_link_and_no_proxy_arp {
+    local PHYSICAL_LINK="$1"
+    dladm show-phys "$PHYSICAL_LINK" || return
+    if ! [[ -v PXA_START ]] || ! [[ -v PXA_END ]]; then
+        warn "You are running with a real physical link, but have not\n\
+set up the proxy-ARP environment variables PXA_{START,END}.\n\
+This implies you're trying to slice out a portion of your\n\
+local network for Omicron. The PXA_* variables are necessary\n\
+to allow SoftNPU to respond to ARP requests for the portion\n\
+of the network you've dedicated to Omicron. Things will not\n\
+work until you add those.\n\
+\n\
+You must either destroy / recreate the Omicron environment,\n\
+or run \`scadm standalone add-proxy-arp\` in the SoftNPU zone\n\
+later to add those entries."
+    fi
+}
+
 ensure_run_as_root
 ensure_zpools
 ensure_simulated_links "$PHYSICAL_LINK"
+warn_if_physical_link_and_no_proxy_arp "$PHYSICAL_LINK"
 ensure_softnpu_zone
diff --git a/tools/scrimlet/softnpu-init.sh b/tools/scrimlet/softnpu-init.sh
index 2c3ed51161..57ef4566a3 100755
--- a/tools/scrimlet/softnpu-init.sh
+++ b/tools/scrimlet/softnpu-init.sh
@@ -27,7 +27,24 @@ fi
 # $ arp -a | grep 192.168.21.1
 # e1000g1 192.168.21.1 255.255.255.255 90:ec:77:2e:70:27
 
-GATEWAY_MAC=${GATEWAY_MAC:=$(arp -a | grep "$GATEWAY_IP" | awk -F ' ' '{print $NF}')}
+#
+# Add an extra space at the end of the search pattern passed to `grep`, so that
+# we can be sure we're matching the exact $GATEWAY_IP, and not something that
+# shares the same string prefix.
+GATEWAY_MAC=${GATEWAY_MAC:=$(arp -a | grep "$GATEWAY_IP " | awk -F ' ' '{print $NF}')}
+
+# Check that what we extracted appears to be exactly one MAC address.
+COUNT=$(grep -c -E '^([0-9a-fA-F]{2}:){5}[0-9a-fA-F]{2}$' <(echo "$GATEWAY_MAC"))
+if [[ $COUNT -ne 1 ]]; then
+    set +x
+    echo "GATEWAY_MAC does not appear to be a valid MAC address."
+    echo "It either could not be automatically determined from the"
+    echo "gateway IP, or the provided environment variable is malformed."
+    echo "GATEWAY_IP = $GATEWAY_IP"
+    echo "Extracted or set GATEWAY_MAC = $GATEWAY_MAC"
+    echo "Please set GATEWAY_MAC manually or use a different GATEWAY_IP"
+    exit 1
+fi
 
 echo "Using $GATEWAY_MAC as gateway mac"
 z_scadm () {