From 56ad22dd3b514bf3e01b8c7e2d5ca8c6fbccf685 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Thu, 15 Jun 2023 20:26:13 +0000 Subject: [PATCH 1/4] Adds service bundles for zones - Adds a dataset to the M.2s for storing debugging data. - Adds basic mechanism for setting a ZFS quota on datasets. - Adds HTTP endpoints for listing, creating, and fetching zone service bundles from the sled agent. - Adds methods to `ServiceManager` for implementing the above. Zone bundles run a set of commands to get the zone-wide output and some key process-specific data for relevant processes from an Oxide service zone. These are packed into a tarball along with a simple metadata file, describing the zone bundle. - Adds some helper methods in `RunningZone` and related for listing the expected SMF service names and processes associated with them based on the zone's manifest files. - Adds dev tool `zb` for talking to the sled agent to operate on zone bundles. --- Cargo.lock | 26 ++ Cargo.toml | 1 + illumos-utils/src/running_zone.rs | 147 +++++++- illumos-utils/src/zfs.rs | 19 +- nexus/examples/config.toml | 2 +- openapi/sled-agent.json | 179 ++++++++++ sled-agent/Cargo.toml | 3 + sled-agent/src/bin/zb.rs | 165 +++++++++ sled-agent/src/bootstrap/agent.rs | 4 +- sled-agent/src/http_entrypoints.rs | 102 +++++- sled-agent/src/params.rs | 55 +++ sled-agent/src/services.rs | 546 ++++++++++++++++++++++++++++- sled-agent/src/sled_agent.rs | 49 ++- sled-agent/src/storage_manager.rs | 2 + sled-hardware/src/disk.rs | 49 ++- 15 files changed, 1329 insertions(+), 20 deletions(-) create mode 100644 sled-agent/src/bin/zb.rs diff --git a/Cargo.lock b/Cargo.lock index 1c0ca80d65..3a89e3ee31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2950,6 +2950,12 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-range" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21dec9db110f5f872ed9699c3ecf50cf16f423502706ba5c72462e28d3157573" + [[package]] 
name = "httparse" version = "1.8.0" @@ -3089,6 +3095,25 @@ dependencies = [ "tokio-rustls", ] +[[package]] +name = "hyper-staticfile" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "318ca89e4827e7fe4ddd2824f52337239796ae8ecc761a663324407dc3d8d7e7" +dependencies = [ + "futures-util", + "http", + "http-range", + "httpdate", + "hyper", + "mime_guess", + "percent-encoding", + "rand 0.8.5", + "tokio", + "url", + "winapi", +] + [[package]] name = "hyper-tls" version = "0.5.0" @@ -4792,6 +4817,7 @@ dependencies = [ "flate2", "futures", "http", + "hyper-staticfile", "illumos-utils", "internal-dns 0.1.0", "ipnetwork", diff --git a/Cargo.toml b/Cargo.toml index f4a50b9734..18d48a76cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -183,6 +183,7 @@ http = "0.2.9" httptest = "0.15.4" hyper-rustls = "0.24.0" hyper = "0.14" +hyper-staticfile = "0.9.5" humantime = "2.1.0" illumos-utils = { path = "illumos-utils" } indexmap = "1.9.3" diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 1d2fd89830..fa85cd2883 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -13,6 +13,7 @@ use crate::zone::{AddressRequest, IPADM, ZONE_PREFIX}; use camino::{Utf8Path, Utf8PathBuf}; use ipnetwork::IpNetwork; use omicron_common::backoff; +use slog::error; use slog::info; use slog::o; use slog::warn; @@ -24,6 +25,16 @@ use crate::zone::MockZones as Zones; #[cfg(not(any(test, feature = "testing")))] use crate::zone::Zones; +/// Errors returned from methods for fetching SMF services and log files +#[derive(thiserror::Error, Debug)] +pub enum ServiceError { + #[error("I/O error")] + Io(#[from] std::io::Error), + + #[error("Failed to run a command")] + RunCommand(#[from] RunCommandError), +} + /// Errors returned from [`RunningZone::run_cmd`]. 
#[derive(thiserror::Error, Debug)] #[error("Error running command in zone '{zone}': {err}")] @@ -762,6 +773,125 @@ impl RunningZone { pub fn links(&self) -> &Vec { &self.inner.links } + + /// Return the running processes associated with all the SMF services this + /// zone is intended to run. + pub fn service_processes( + &self, + ) -> Result, ServiceError> { + let service_names = self.service_names()?; + let mut services = Vec::with_capacity(service_names.len()); + for service_name in service_names.into_iter() { + let output = self.run_cmd(["ptree", "-s", &service_name])?; + + // All Oxide SMF services currently run a single binary, though it + // may be run in a contract via `ctrun`. We don't care about that + // binary, but any others we _do_ want to collect data from. + for line in output.lines() { + if line.contains("ctrun") { + continue; + } + let line = line.trim(); + let mut parts = line.split_ascii_whitespace(); + + // The first two parts should be the PID and the process binary + // path, respectively. + let Some(pid_s) = parts.next() else { + error!( + self.inner.log, + "failed to get service PID from ptree output"; + "service" => &service_name, + ); + continue; + }; + let Ok(pid) = pid_s.parse() else { + error!( + self.inner.log, + "failed to parse service PID from ptree output"; + "service" => &service_name, + "pid" => pid_s, + ); + continue; + }; + let Some(path) = parts.next() else { + error!( + self.inner.log, + "failed to get service binary from ptree output"; + "service" => &service_name, + ); + continue; + }; + let binary = Utf8PathBuf::from(path); + + // Fetch any log files for this SMF service. + let Some((log_file, rotated_log_files)) = self.service_log_files(&service_name)? 
else { + error!( + self.inner.log, + "failed to find log files for existing service"; + "service_name" => &service_name, + ); + continue; + }; + + services.push(ServiceProcess { + service_name: service_name.clone(), + binary, + pid, + log_file, + rotated_log_files, + }); + } + } + Ok(services) + } + + /// Return the names of the Oxide SMF services this zone is intended to run. + pub fn service_names(&self) -> Result, ServiceError> { + const NEEDLES: [&str; 2] = ["/oxide", "/system/illumos"]; + let output = self.run_cmd(&["svcs", "-H", "-o", "fmri"])?; + Ok(output + .lines() + .filter(|line| NEEDLES.iter().any(|needle| line.contains(needle))) + .map(|line| line.trim().to_string()) + .collect()) + } + + /// Return any SMF log files associated with the named service. + /// + /// Given a named service, this returns a tuple of the latest or current log + /// file, and an array of any rotated log files. If the service does not + /// exist, or there are no log files, `None` is returned. + pub fn service_log_files( + &self, + name: &str, + ) -> Result)>, ServiceError> { + let output = self.run_cmd(&["svcs", "-L", name])?; + let mut lines = output.lines(); + let Some(current) = lines.next() else { + return Ok(None); + }; + // We need to prepend the zonepath root to get the path in the GZ. We + // can do this with `join()`, but that will _replace_ the path if the + // second one is absolute. So trim any prefixed `/` from each path. + let root = self.root(); + let current_log_file = + root.join(current.trim().trim_start_matches('/')); + + // The rotated log files should have the same prefix as the current, but + // with an index appended. We'll search the parent directory for + // matching names, skipping the current file. + let dir = current_log_file.parent().unwrap(); + let mut rotated_files = Vec::new(); + for entry in dir.read_dir_utf8()? 
{ + let entry = entry?; + let path = entry.path(); + if path != current_log_file && path.starts_with(&current_log_file) { + rotated_files + .push(root.join(path.strip_prefix("/").unwrap_or(path))); + } + } + Ok(Some((current_log_file, rotated_files))) + } } impl Drop for RunningZone { @@ -783,6 +913,21 @@ impl Drop for RunningZone { } } +/// A process running in the zone associated with an SMF service. +#[derive(Clone, Debug)] +pub struct ServiceProcess { + /// The name of the SMF service. + pub service_name: String, + /// The path of the binary in the process image. + pub binary: Utf8PathBuf, + /// The PID of the process. + pub pid: u32, + /// The path for the current log file. + pub log_file: Utf8PathBuf, + /// The paths for any rotated log files. + pub rotated_log_files: Vec, +} + /// Errors returned from [`InstalledZone::install`]. #[derive(thiserror::Error, Debug)] pub enum InstallZoneError { @@ -817,7 +962,7 @@ pub struct InstalledZone { // NIC used for control plane communication. control_vnic: Link, - // Nic used for bootstrap network communication + // NIC used for bootstrap network communication bootstrap_vnic: Option, // OPTE devices for the guest network interfaces diff --git a/illumos-utils/src/zfs.rs b/illumos-utils/src/zfs.rs index 5e7c4790fb..f3e97efc9e 100644 --- a/illumos-utils/src/zfs.rs +++ b/illumos-utils/src/zfs.rs @@ -171,12 +171,15 @@ impl Zfs { } /// Creates a new ZFS filesystem named `name`, unless one already exists. + /// + /// Applies an optional quota, provided _in bytes_. 
pub fn ensure_filesystem( name: &str, mountpoint: Mountpoint, zoned: bool, do_format: bool, encryption_details: Option, + quota: Option, ) -> Result<(), EnsureFilesystemError> { let (exists, mounted) = Self::dataset_exists(name, &mountpoint)?; if exists { @@ -225,9 +228,23 @@ impl Zfs { cmd.args(&["-o", &format!("mountpoint={}", mountpoint), name]); execute(cmd).map_err(|err| EnsureFilesystemError { name: name.to_string(), - mountpoint, + mountpoint: mountpoint.clone(), err: err.into(), })?; + + // Apply any quota. + if let Some(quota) = quota { + if let Err(err) = + Self::set_value(name, "quota", &format!("{quota}")) + { + return Err(EnsureFilesystemError { + name: name.to_string(), + mountpoint, + // Take the execution error from the SetValueError + err: err.err.into(), + }); + } + } Ok(()) } diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index a116b31a5e..5e514c4975 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -44,7 +44,7 @@ request_body_max_bytes = 1048576 # initialization. If you're using this config file, you're probably running a # simulated system. In that case, the initial certificate is provided to the # simulated sled agent (acting as RSS) via command-line arguments. 
-#tls = true +tls = true [deployment.dropshot_internal] # IP Address and TCP port on which to listen for the internal API diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 8f45de2e6c..c6765b111c 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -507,6 +507,149 @@ } } }, + "/zones": { + "get": { + "summary": "List the zones that are currently managed by the sled agent.", + "operationId": "zones_list", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_String", + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/zones/{zone_name}/bundles": { + "get": { + "summary": "List the zone bundles that are current available for a zone.", + "operationId": "zone_bundle_list", + "parameters": [ + { + "in": "path", + "name": "zone_name", + "description": "The name of the zone.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_ZoneBundleMetadata", + "type": "array", + "items": { + "$ref": "#/components/schemas/ZoneBundleMetadata" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "post": { + "summary": "Ask the sled agent to create a zone bundle.", + "operationId": "zone_bundle_create", + "parameters": [ + { + "in": "path", + "name": "zone_name", + "description": "The name of the zone.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "201": { + "description": "successful creation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ZoneBundleMetadata" + } + } + } + }, + "4XX": { + 
"$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/zones/{zone_name}/bundles/{bundle_id}": { + "get": { + "summary": "Fetch the binary content of a single zone bundle.", + "operationId": "zone_bundle_get", + "parameters": [ + { + "in": "path", + "name": "bundle_id", + "description": "The ID for this bundle itself.", + "required": true, + "schema": { + "type": "string", + "format": "uuid" + } + }, + { + "in": "path", + "name": "zone_name", + "description": "The name of the zone this bundle is derived from.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "", + "content": { + "*/*": { + "schema": {} + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/zpools": { "get": { "operationId": "zpools_get", @@ -2454,6 +2597,42 @@ "vni" ] }, + "ZoneBundleId": { + "description": "An identifier for a zone bundle.", + "type": "object", + "properties": { + "bundle_id": { + "description": "The ID for this bundle itself.", + "type": "string", + "format": "uuid" + }, + "zone_name": { + "description": "The name of the zone this bundle is derived from.", + "type": "string" + } + }, + "required": [ + "bundle_id", + "zone_name" + ] + }, + "ZoneBundleMetadata": { + "description": "Metadata about a zone bundle.", + "type": "object", + "properties": { + "id": { + "$ref": "#/components/schemas/ZoneBundleId" + }, + "time_created": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "id", + "time_created" + ] + }, "ZoneType": { "description": "The type of zone which may be requested from Sled Agent", "type": "string", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index a030eba5eb..60e3a15106 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -28,6 +28,7 @@ dropshot.workspace = true flate2.workspace = true futures.workspace = true 
http.workspace = true +hyper-staticfile.workspace = true illumos-utils.workspace = true internal-dns.workspace = true ipnetwork.workspace = true @@ -53,7 +54,9 @@ serde_json.workspace = true sled-agent-client.workspace = true sled-hardware.workspace = true slog.workspace = true +slog-async.workspace = true slog-dtrace.workspace = true +slog-term.workspace = true smf.workspace = true sp-sim.workspace = true tar.workspace = true diff --git a/sled-agent/src/bin/zb.rs b/sled-agent/src/bin/zb.rs new file mode 100644 index 0000000000..57df293ec4 --- /dev/null +++ b/sled-agent/src/bin/zb.rs @@ -0,0 +1,165 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Small CLI to view and inspect zone bundles from the sled agent. + +use anyhow::anyhow; +use anyhow::Context; +use camino::Utf8PathBuf; +use clap::Parser; +use clap::Subcommand; +use futures::stream::StreamExt; +use omicron_common::address::SLED_AGENT_PORT; +use sled_agent_client::Client; +use slog::Drain; +use slog::Level; +use slog::LevelFilter; +use slog::Logger; +use slog_term::FullFormat; +use slog_term::TermDecorator; +use std::net::Ipv6Addr; +use tokio::io::AsyncWriteExt; +use uuid::Uuid; + +fn parse_log_level(s: &str) -> anyhow::Result { + s.parse().map_err(|_| anyhow!("Invalid log level")) +} + +/// Operate on sled agent zone bundles. +/// +/// Zone bundles are the collected state of a service zone. This includes +/// information about the processes running in the zone, and the system on which +/// they're running. +#[derive(Clone, Debug, Parser)] +struct Cli { + /// The IPv6 address for the sled agent to operate on. + #[arg(long, default_value_t = Ipv6Addr::LOCALHOST)] + host: Ipv6Addr, + /// The port on which to connect to the sled agent. + #[arg(long, default_value_t = SLED_AGENT_PORT)] + port: u16, + /// The log level for the command. 
+ #[arg(long, value_parser = parse_log_level, default_value_t = Level::Warning)] + log_level: Level, + #[command(subcommand)] + cmd: Cmd, +} + +#[derive(Clone, Debug, Subcommand)] +enum Cmd { + /// List the zones available for collecting bundles from. + ListZones, + /// List existing bundles for a zone. + #[clap(visible_alias = "ls")] + List { + /// The name of the zone to list bundles for. + zone_name: String, + }, + /// Request the sled agent create a new zone bundle. + Create { + /// The name of the zone to list bundles for. + zone_name: String, + }, + /// Get a zone bundle from the sled agent. + Get { + /// The name of the zone to fetch the bundle for. + zone_name: String, + /// The ID of the bundle to fetch. + #[arg(long, group = "id")] + bundle_id: Option, + /// Create a new bundle, and then fetch it. + #[arg(long, group = "id")] + create: bool, + /// The output file. + /// + /// If not specified, the output file is named by the bundle ID itself. + #[arg(long)] + output: Option, + }, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let args = Cli::parse(); + let addr = format!("http://[{}]:{}", args.host, args.port); + let decorator = TermDecorator::new().build(); + let drain = FullFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain).build().fuse(); + let drain = LevelFilter::new(drain, args.log_level).fuse(); + let log = Logger::root(drain, slog::o!("unit" => "zb")); + let client = Client::new(&addr, log); + match args.cmd { + Cmd::ListZones => { + let zones = client + .zones_list() + .await + .context("failed to list zones")? + .into_inner(); + for zone in zones { + println!("{zone}"); + } + } + Cmd::List { zone_name } => { + let bundles = client + .zone_bundle_list(&zone_name) + .await + .context("failed to list zone bundles")? 
+ .into_inner(); + for bundle in bundles { + println!("{}/{}", bundle.id.zone_name, bundle.id.bundle_id); + } + } + Cmd::Create { zone_name } => { + let bundle = client + .zone_bundle_create(&zone_name) + .await + .context("failed to create zone bundle")? + .into_inner(); + println!( + "Created zone bundle: {}/{}", + bundle.id.zone_name, bundle.id.bundle_id + ); + } + Cmd::Get { zone_name, bundle_id, create, output } => { + let bundle_id = if create { + let bundle = client + .zone_bundle_create(&zone_name) + .await + .context("failed to create zone bundle")? + .into_inner(); + println!( + "Created zone bundle: {}/{}", + bundle.id.zone_name, bundle.id.bundle_id + ); + bundle.id.bundle_id + } else { + bundle_id.expect("clap should have ensured this was Some(_)") + }; + let output = output.unwrap_or_else(|| { + Utf8PathBuf::from(format!("{}.tar.gz", bundle_id)) + }); + let bundle = client + .zone_bundle_get(&zone_name, &bundle_id) + .await + .context("failed to get zone bundle")? + .into_inner(); + let mut f = tokio::fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&output) + .await + .context("failed to open output file")?; + let mut stream = bundle.into_inner(); + while let Some(maybe_bytes) = stream.next().await { + let bytes = + maybe_bytes.context("failed to fetch all bundle data")?; + f.write_all(&bytes) + .await + .context("failed to write bundle data")?; + } + } + } + Ok(()) +} diff --git a/sled-agent/src/bootstrap/agent.rs b/sled-agent/src/bootstrap/agent.rs index 4da8ae0d12..b14f839a6a 100644 --- a/sled-agent/src/bootstrap/agent.rs +++ b/sled-agent/src/bootstrap/agent.rs @@ -344,12 +344,13 @@ impl Agent { // Before we start creating zones, we need to ensure that the // necessary ZFS and Zone resources are ready. 
// - // TODO(https://github.com/oxidecomputer/omicron/issues/1934): + // TODO(https://github.com/oxidecomputer/omicron/issues/2888): // We should carefully consider which dataset this is using; it's // currently part of the ramdisk. let zoned = true; let do_format = true; let encryption_details = None; + let quota = None; Zfs::ensure_filesystem( ZONE_ZFS_RAMDISK_DATASET, Mountpoint::Path(Utf8PathBuf::from( @@ -358,6 +359,7 @@ impl Agent { zoned, do_format, encryption_details, + quota, )?; // Before we start monitoring for hardware, ensure we're running from a diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 6d81d1b2d4..fcaa327a01 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -8,11 +8,13 @@ use crate::params::{ DatasetEnsureBody, DiskEnsureBody, InstanceEnsureBody, InstancePutMigrationIdsBody, InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, ServiceEnsureBody, - SledRole, TimeSync, VpcFirewallRulesEnsureBody, Zpool, + SledRole, TimeSync, VpcFirewallRulesEnsureBody, ZoneBundleId, + ZoneBundleMetadata, Zpool, }; use dropshot::{ - endpoint, ApiDescription, HttpError, HttpResponseOk, - HttpResponseUpdatedNoContent, Path, RequestContext, TypedBody, + endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseCreated, + HttpResponseHeaders, HttpResponseOk, HttpResponseUpdatedNoContent, Path, + RequestContext, TypedBody, }; use illumos_utils::opte::params::SetVirtualNetworkInterfaceHost; use omicron_common::api::external::Error; @@ -38,6 +40,10 @@ pub fn api() -> SledApiDescription { api.register(instance_register)?; api.register(instance_unregister)?; api.register(services_put)?; + api.register(zones_list)?; + api.register(zone_bundle_list)?; + api.register(zone_bundle_create)?; + api.register(zone_bundle_get)?; api.register(sled_role_get)?; api.register(set_v2p)?; api.register(del_v2p)?; @@ -56,6 +62,96 @@ pub fn api() -> SledApiDescription { api } 
+#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +struct ZonePathParam { + /// The name of the zone. + zone_name: String, +} + +/// List the zone bundles that are current available for a zone. +#[endpoint { + method = GET, + path = "/zones/{zone_name}/bundles", +}] +async fn zone_bundle_list( + rqctx: RequestContext, + params: Path, +) -> Result>, HttpError> { + let params = params.into_inner(); + let zone_name = params.zone_name; + let sa = rqctx.context(); + sa.list_zone_bundles(&zone_name) + .await + .map(HttpResponseOk) + .map_err(HttpError::from) +} + +/// Ask the sled agent to create a zone bundle. +#[endpoint { + method = POST, + path = "/zones/{zone_name}/bundles", +}] +async fn zone_bundle_create( + rqctx: RequestContext, + params: Path, +) -> Result, HttpError> { + let params = params.into_inner(); + let zone_name = params.zone_name; + let sa = rqctx.context(); + sa.create_zone_bundle(&zone_name) + .await + .map(HttpResponseCreated) + .map_err(HttpError::from) +} + +/// Fetch the binary content of a single zone bundle. +#[endpoint { + method = GET, + path = "/zones/{zone_name}/bundles/{bundle_id}", +}] +async fn zone_bundle_get( + rqctx: RequestContext, + params: Path, +) -> Result>, HttpError> { + let params = params.into_inner(); + let zone_name = params.zone_name; + let bundle_id = params.bundle_id; + let sa = rqctx.context(); + let Some(path) = sa.get_zone_bundle_path(&zone_name, &bundle_id) + .await + .map_err(HttpError::from)? 
else { + return Err(HttpError::for_not_found( + None, + format!("No zone bundle for zone '{}' with ID '{}'", zone_name, bundle_id))); + }; + let f = tokio::fs::File::open(&path).await.map_err(|e| { + HttpError::for_internal_error(format!( + "failed to open zone bundle file at {}: {:?}", + path, e, + )) + })?; + let stream = hyper_staticfile::FileBytesStream::new(f); + let body = FreeformBody(stream.into_body()); + let mut response = HttpResponseHeaders::new_unnamed(HttpResponseOk(body)); + response.headers_mut().append( + http::header::CONTENT_TYPE, + "application/gzip".try_into().unwrap(), + ); + Ok(response) +} + +/// List the zones that are currently managed by the sled agent. +#[endpoint { + method = GET, + path = "/zones", +}] +async fn zones_list( + rqctx: RequestContext, +) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.zones_list().await.map(HttpResponseOk).map_err(HttpError::from) +} + #[endpoint { method = PUT, path = "/services", diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index cfffd1fe82..8bb2027067 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -2,6 +2,8 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +use chrono::DateTime; +use chrono::Utc; use omicron_common::api::internal::nexus::{ DiskRuntimeState, InstanceRuntimeState, }; @@ -656,3 +658,56 @@ pub enum SledRole { /// responsibilities. Scrimlet, } + +/// An identifier for a zone bundle. +#[derive( + Clone, + Debug, + Deserialize, + Eq, + Hash, + JsonSchema, + Ord, + PartialEq, + PartialOrd, + Serialize, +)] +pub struct ZoneBundleId { + /// The name of the zone this bundle is derived from. + pub zone_name: String, + /// The ID for this bundle itself. + pub bundle_id: Uuid, +} + +/// Metadata about a zone bundle. 
+#[derive( + Clone, + Debug, + Deserialize, + Eq, + Hash, + JsonSchema, + Ord, + PartialEq, + PartialOrd, + Serialize, +)] +pub struct ZoneBundleMetadata { + /// Identifier for this zone bundle + pub id: ZoneBundleId, + /// The time at which this zone bundle was created. + pub time_created: DateTime, +} + +impl ZoneBundleMetadata { + /// Create a new set of metadata for the provided zone. + pub(crate) fn new(zone_name: &str) -> Self { + Self { + id: ZoneBundleId { + zone_name: zone_name.to_string(), + bundle_id: Uuid::new_v4(), + }, + time_created: Utc::now(), + } + } +} diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 74fbb0acf8..046865f7c7 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -29,7 +29,7 @@ use crate::config::SidecarRevision; use crate::ledger::{Ledger, Ledgerable}; use crate::params::{ DendriteAsic, ServiceEnsureBody, ServiceType, ServiceZoneRequest, - ServiceZoneService, TimeSync, ZoneType, + ServiceZoneService, TimeSync, ZoneBundleMetadata, ZoneType, }; use crate::profile::*; use crate::smf_helper::Service; @@ -39,6 +39,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use ddm_admin_client::{Client as DdmAdminClient, DdmError}; use dpd_client::{types as DpdTypes, Client as DpdClient, Error as DpdError}; use dropshot::HandlerTaskMode; +use flate2::bufread::GzDecoder; use illumos_utils::addrobj::AddrObject; use illumos_utils::addrobj::IPV6_LINK_LOCAL_NAME; use illumos_utils::dladm::{Dladm, Etherstub, EtherstubVnic, PhysicalLink}; @@ -80,7 +81,9 @@ use sled_hardware::underlay::BOOTSTRAP_PREFIX; use sled_hardware::Baseboard; use sled_hardware::SledMode; use slog::Logger; +use std::collections::BTreeSet; use std::collections::HashSet; +use std::io::Cursor; use std::iter; use std::iter::FromIterator; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; @@ -88,6 +91,9 @@ use std::str::FromStr; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; +use tar::Archive; 
+use tar::Builder; +use tar::Header; use tokio::io::AsyncWriteExt; use tokio::sync::oneshot; use tokio::sync::Mutex; @@ -181,6 +187,9 @@ pub enum Error { #[error("Sidecar revision error")] SidecarRevision(#[from] anyhow::Error), + + #[error("Zone bundle error")] + Bundle(#[from] BundleError), } impl Error { @@ -200,6 +209,30 @@ impl From for omicron_common::api::external::Error { } } +#[derive(Debug, thiserror::Error)] +pub enum BundleError { + #[error("I/O error")] + Io(#[from] std::io::Error), + + #[error("TOML serialization failure")] + Serialization(#[from] toml::ser::Error), + + #[error("TOML deserialization failure")] + Deserialization(#[from] toml::de::Error), + + #[error("No zone named '{name}' is available for bundling")] + NoSuchZone { name: String }, + + #[error("No storage available for bundles")] + NoStorage, + + #[error("Failed to join zone bundling task")] + Task(#[from] tokio::task::JoinError), + + #[error("Failed to create bundle")] + BundleFailed(#[from] anyhow::Error), +} + /// Configuration parameters which modify the [`ServiceManager`]'s behavior. pub struct Config { /// Identifies the sled being configured @@ -219,6 +252,12 @@ impl Config { const SERVICES_LEDGER_FILENAME: &str = "services.toml"; const STORAGE_SERVICES_LEDGER_FILENAME: &str = "storage-services.toml"; +// The directory within the debug dataset in which bundles are created. +const BUNDLE_DIRECTORY: &str = "bundle"; + +// The directory for zone bundles. +const ZONE_BUNDLE_DIRECTORY: &str = "zone"; + // A wrapper around `ZoneRequest`, which allows it to be serialized // to a toml file. #[derive(Clone, serde::Serialize, serde::Deserialize)] @@ -416,6 +455,33 @@ impl ServiceManager { self.inner.switch_zone_bootstrap_address } + // Return the directories for storing debug information. 
+ async fn all_debug_directories(&self) -> Vec { + self.inner + .storage + .resources() + .all_m2_mountpoints(sled_hardware::disk::DEBUG_DATASET) + .await + } + + // Return the directories for storing all service bundles. + async fn all_service_bundle_directories(&self) -> Vec { + self.all_debug_directories() + .await + .into_iter() + .map(|p| p.join(BUNDLE_DIRECTORY)) + .collect() + } + + // Return the directories for storing zone service bundles. + async fn all_zone_bundle_directories(&self) -> Vec { + self.all_service_bundle_directories() + .await + .into_iter() + .map(|p| p.join(ZONE_BUNDLE_DIRECTORY)) + .collect() + } + async fn all_service_ledgers(&self) -> Vec { if let Some(dir) = self.inner.ledger_directory_override.get() { return vec![dir.join(SERVICES_LEDGER_FILENAME)]; @@ -1835,6 +1901,8 @@ impl ServiceManager { ); } + // TODO-correctness: It seems like we should continue with the other + // zones, rather than bail out of this method entirely. let running_zone = self .initialize_zone( req, @@ -1847,6 +1915,482 @@ impl ServiceManager { Ok(()) } + // Create a zone bundle for the named running zone. + async fn create_zone_bundle_impl( + &self, + zone: &RunningZone, + ) -> Result { + // Fetch the directory into which we'll store data, and ensure it + // exists. + let log = &self.inner.log; + let directories = self.all_zone_bundle_directories().await; + if directories.is_empty() { + warn!(log, "no directories available for zone bundles"); + return Err(BundleError::NoStorage); + } + info!( + log, + "creating zone bundle"; + "zone" => zone.name(), + "directories" => ?directories, + ); + let mut zone_bundle_dirs = Vec::with_capacity(directories.len()); + for dir in directories.iter() { + let bundle_dir = dir.join(zone.name()); + debug!(log, "creating bundle directory"; "dir" => %bundle_dir); + tokio::fs::create_dir_all(&bundle_dir).await?; + zone_bundle_dirs.push(bundle_dir); + } + + // Create metadata and the tarball writer. 
+ // + // We'll write the contents of the bundle into a gzipped tar archive, + // including metadata and a file for the output of each command we run + // in the zone. + let zone_metadata = ZoneBundleMetadata::new(zone.name()); + let filename = format!("{}.tar.gz", zone_metadata.id.bundle_id); + let full_path = zone_bundle_dirs[0].join(&filename); + let file = match tokio::fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&full_path) + .await + { + Ok(f) => f.into_std().await, + Err(e) => { + error!( + log, + "failed to create bundle file"; + "zone" => zone.name(), + "file" => %full_path, + "error" => ?e, + ); + return Err(BundleError::from(e)); + } + }; + debug!( + log, + "created bundle tarball file"; + "zone" => zone.name(), + "path" => %full_path + ); + let gz = flate2::GzBuilder::new() + .filename(filename.as_str()) + .write(file, flate2::Compression::best()); + let mut builder = Builder::new(gz); + + // Helper function to write an array of bytes into the tar archive, with + // the provided name. + fn insert_data( + builder: &mut Builder, + name: &str, + contents: &[u8], + ) -> Result<(), BundleError> { + let mtime = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| anyhow::anyhow!("failed to compute mtime: {e}"))? + .as_secs(); + + let mut hdr = Header::new_ustar(); + hdr.set_size(contents.len().try_into().unwrap()); + hdr.set_mode(0o444); + hdr.set_mtime(mtime); + hdr.set_entry_type(tar::EntryType::Regular); + // NOTE: This internally sets the path and checksum. + builder + .append_data(&mut hdr, name, Cursor::new(contents)) + .map_err(BundleError::from) + } + + // Write the metadata file itself, in TOML format. 
+ let contents = toml::to_string(&zone_metadata)?; + insert_data(&mut builder, "metadata", contents.as_bytes())?; + debug!( + log, + "wrote zone bundle metadata"; + "zone" => zone.name(), + ); + + // The set of zone-wide commands, which don't require any details about + // the processes we've launched in the zone. + const ZONE_WIDE_COMMANDS: [&[&str]; 6] = [ + &["ptree"], + &["uptime"], + &["last"], + &["who"], + &["svcs", "-p"], + &["netstat", "-an"], + ]; + for cmd in ZONE_WIDE_COMMANDS { + debug!( + log, + "running zone bundle command"; + "zone" => zone.name(), + "command" => ?cmd, + ); + let output = match zone.run_cmd(cmd) { + Ok(s) => s, + Err(e) => format!("{}", e), + }; + let contents = + format!("Command: {:?}\n{}", cmd, output).into_bytes(); + if let Err(e) = insert_data(&mut builder, cmd[0], &contents) { + error!( + log, + "failed to save zone bundle command output"; + "zone" => zone.name(), + "command" => ?cmd, + "error" => ?e, + ); + } + } + + // Debugging commands run on the specific processes this zone defines. + const ZONE_PROCESS_COMMANDS: [&str; 3] = [ + "pfiles", "pstack", + "pargs", + // TODO-completeness: We may want `gcore`, since that encompasses + // the above commands and much more. It seems like overkill now, + // however. 
+ ]; + let procs = match zone.service_processes() { + Ok(p) => { + debug!( + log, + "enumerated service processes"; + "zone" => zone.name(), + "procs" => ?p, + ); + p + } + Err(e) => { + error!( + log, + "failed to enumerate zone service processes"; + "zone" => zone.name(), + "error" => ?e, + ); + let err = anyhow::anyhow!( + "failed to enumerate zone service processes: {e}" + ); + return Err(BundleError::from(err)); + } + }; + for svc in procs.into_iter() { + let pid_s = svc.pid.to_string(); + for cmd in ZONE_PROCESS_COMMANDS { + let args = &[cmd, &pid_s]; + debug!( + log, + "running zone bundle command"; + "zone" => zone.name(), + "command" => ?args, + ); + let output = match zone.run_cmd(args) { + Ok(s) => s, + Err(e) => format!("{}", e), + }; + let contents = + format!("Command: {:?}\n{}", args, output).into_bytes(); + + // There may be multiple Oxide service processes for which we + // want to capture the command output. Name each output after + // the command and PID to disambiguate. + let filename = format!("{}.{}", cmd, svc.pid); + if let Err(e) = insert_data(&mut builder, &filename, &contents) + { + error!( + log, + "failed to save zone bundle command output"; + "zone" => zone.name(), + "command" => ?args, + "error" => ?e, + ); + } + } + + // Copy any log files, current and rotated, into the tarball as + // well. + // + // Saftey: This is a log file, so we're sure it's a single, normal + // file and thus has a name. 
+ debug!( + log, + "appending current log file to zone bundle"; + "zone" => zone.name(), + "log_file" => %svc.log_file, + ); + if let Err(e) = builder.append_path_with_name( + &svc.log_file, + svc.log_file.file_name().unwrap(), + ) { + error!( + log, + "failed to append current log file to zone bundle"; + "zone" => zone.name(), + "log_file" => %svc.log_file, + "error" => ?e, + ); + return Err(e.into()); + } + for f in svc.rotated_log_files.iter() { + debug!( + log, + "appending rotated log file to zone bundle"; + "zone" => zone.name(), + "log_file" => %f, + ); + if let Err(e) = + builder.append_path_with_name(f, f.file_name().unwrap()) + { + error!( + log, + "failed to append rotated log file to zone bundle"; + "zone" => zone.name(), + "log_file" => %f, + "error" => ?e, + ); + return Err(e.into()); + } + } + } + + // Finish writing out the tarball itself. + builder + .into_inner() + .map_err(|e| anyhow::anyhow!("Failed to build bundle: {e}"))?; + + // Copy the bundle to the other locations. We really want the bundles to + // be duplicates, not an additional, new bundle. + for other_dir in zone_bundle_dirs[1..].iter() { + let to = other_dir.join(&filename); + debug!(log, "copying bundle"; "from" => %full_path, "to" => %to); + tokio::fs::copy(&full_path, to).await?; + } + + info!(log, "finished zone bundle"; "metadata" => ?zone_metadata); + Ok(zone_metadata) + } + + /// Create a zone bundle for the provided zone. + pub async fn create_zone_bundle( + &self, + name: &str, + ) -> Result { + // Search for the named zone. + if let SledLocalZone::Running { zone, .. 
} = + &*self.inner.switch_zone.lock().await + { + if zone.name() == name { + return self + .create_zone_bundle_impl(zone) + .await + .map_err(Error::from); + } + } + if let Some(zone) = + self.inner.zones.lock().await.iter().find(|z| z.name() == name) + { + return self + .create_zone_bundle_impl(zone) + .await + .map_err(Error::from); + } + if let Some(zone) = self + .inner + .dataset_zones + .lock() + .await + .iter() + .find(|z| z.name() == name) + { + return self + .create_zone_bundle_impl(zone) + .await + .map_err(Error::from); + } + Err(Error::from(BundleError::NoSuchZone { name: name.to_string() })) + } + + fn extract_zone_bundle_metadata( + path: &std::path::PathBuf, + ) -> Result { + // Build a reader for the whole archive. + let reader = std::fs::File::open(path).map_err(BundleError::from)?; + let buf_reader = std::io::BufReader::new(reader); + let gz = GzDecoder::new(buf_reader); + let mut archive = Archive::new(gz); + + // Find the metadata entry, if it exists. + let entries = archive.entries()?; + let Some(md_entry) = entries + // The `Archive::entries` iterator + // returns a result, so filter to those + // that are OK first. + .filter_map(Result::ok) + .find(|entry| { + entry + .path() + .map(|p| p.to_str() == Some("metadata")) + .unwrap_or(false) + }) + else { + return Err(BundleError::from( + anyhow::anyhow!("Zone bundle is missing metadata file") + )); + }; + + // Extract its contents and parse as metadata. + let contents = std::io::read_to_string(md_entry)?; + toml::from_str(&contents).map_err(BundleError::from) + } + + /// List the bundles available for the zone of the provided name. + pub async fn list_zone_bundles( + &self, + name: &str, + ) -> Result, Error> { + let log = &self.inner.log; + + // The zone bundles are replicated in several places, so we'll use a set + // to collect them all, to avoid duplicating. 
+ let mut bundles = BTreeSet::new(); + + for path in self.all_zone_bundle_directories().await { + info!(log, "searching zone bundle directory"; "directory" => ?path); + let zone_bundle_dir = path.join(name); + if zone_bundle_dir.is_dir() { + let mut dir = tokio::fs::read_dir(zone_bundle_dir) + .await + .map_err(BundleError::from)?; + while let Some(zone_bundle) = + dir.next_entry().await.map_err(BundleError::from)? + { + let bundle_path = zone_bundle.path(); + info!( + log, + "checking possible zone bundle"; + "bundle_path" => %bundle_path.display(), + ); + + // Zone bundles _should_ be named like: + // + // .../bundle/zone//.tar.gz. + // + // However, really a zone bundle is any tarball with the + // right metadata file, which contains a TOML-serialized + // `ZoneBundleMetadata` file. Try to create an archive out + // of each file we find in this directory, and parse out a + // metadata file. + let tarball = bundle_path.to_owned(); + let task = tokio::task::spawn_blocking(move || { + Self::extract_zone_bundle_metadata(&tarball) + }); + let metadata = match task.await { + Ok(Ok(md)) => md, + Ok(Err(e)) => { + error!( + log, + "failed to read zone bundle metadata"; + "error" => ?e, + ); + return Err(Error::from(e)); + } + Err(e) => { + error!( + log, + "failed to join zone bundle metadata read task"; + "error" => ?e, + ); + return Err(Error::from(BundleError::from(e))); + } + }; + info!(log, "found zone bundle"; "metadata" => ?metadata); + bundles.insert(metadata); + } + } + } + Ok(bundles.into_iter().collect()) + } + + /// Get the path to a zone bundle, if it exists. 
+ pub async fn get_zone_bundle_path( + &self, + zone_name: &str, + id: &Uuid, + ) -> Result, Error> { + let log = &self.inner.log; + for path in self.all_zone_bundle_directories().await { + info!(log, "searching zone bundle directory"; "directory" => ?path); + let zone_bundle_dir = path.join(zone_name); + if zone_bundle_dir.is_dir() { + let mut dir = tokio::fs::read_dir(zone_bundle_dir) + .await + .map_err(BundleError::from)?; + while let Some(zone_bundle) = + dir.next_entry().await.map_err(BundleError::from)? + { + let path = zone_bundle.path(); + let task = tokio::task::spawn_blocking(move || { + Self::extract_zone_bundle_metadata(&path) + }); + let metadata = match task.await { + Ok(Ok(md)) => md, + Ok(Err(e)) => { + error!( + log, + "failed to read zone bundle metadata"; + "error" => ?e, + ); + return Err(Error::from(e)); + } + Err(e) => { + error!( + log, + "failed to join zone bundle metadata read task"; + "error" => ?e, + ); + return Err(Error::from(BundleError::from(e))); + } + }; + let bundle_id = &metadata.id; + if bundle_id.zone_name == zone_name + && bundle_id.bundle_id == *id + { + let path = Utf8PathBuf::try_from(zone_bundle.path()) + .map_err(|_| { + BundleError::from(anyhow::anyhow!( + "Non-UTF-8 path name: {}", + zone_bundle.path().display() + )) + })?; + return Ok(Some(path)); + } + } + } + } + Ok(None) + } + + /// List all zones that are currently managed. + pub async fn list_all_zones(&self) -> Result, Error> { + let mut zone_names = vec![]; + if let SledLocalZone::Running { zone, .. } = + &*self.inner.switch_zone.lock().await + { + zone_names.push(String::from(zone.name())) + } + for zone in self.inner.zones.lock().await.iter() { + zone_names.push(String::from(zone.name())); + } + for zone in self.inner.dataset_zones.lock().await.iter() { + zone_names.push(String::from(zone.name())); + } + zone_names.sort(); + Ok(zone_names) + } + /// Ensures that particular services should be initialized. 
/// /// These services will be instantiated by this function, and will be diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 0e63112966..97c3146059 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -12,7 +12,8 @@ use crate::params::{ DatasetKind, DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, ServiceEnsureBody, - ServiceZoneService, SledRole, TimeSync, VpcFirewallRule, Zpool, + ServiceZoneService, SledRole, TimeSync, VpcFirewallRule, + ZoneBundleMetadata, Zpool, }; use crate::services::{self, ServiceManager}; use crate::storage_manager::{self, StorageManager}; @@ -141,7 +142,17 @@ impl From for dropshot::HttpError { e => HttpError::for_internal_error(e.to_string()), } } - + crate::sled_agent::Error::Services( + crate::services::Error::Bundle(ref inner), + ) => match inner { + crate::services::BundleError::NoStorage => { + HttpError::for_unavail(None, inner.to_string()) + } + crate::services::BundleError::NoSuchZone { .. } => { + HttpError::for_bad_request(None, inner.to_string()) + } + _ => HttpError::for_internal_error(err.to_string()), + }, e => HttpError::for_internal_error(e.to_string()), } } @@ -520,6 +531,40 @@ impl SledAgent { }); } + /// List zone bundles for the provided zone. + pub async fn list_zone_bundles( + &self, + name: &str, + ) -> Result, Error> { + self.inner.services.list_zone_bundles(name).await.map_err(Error::from) + } + + /// Create a zone bundle for the provided zone. + pub async fn create_zone_bundle( + &self, + name: &str, + ) -> Result { + self.inner.services.create_zone_bundle(name).await.map_err(Error::from) + } + + /// Fetch the path to a zone bundle. 
+ pub async fn get_zone_bundle_path( + &self, + name: &str, + id: &Uuid, + ) -> Result, Error> { + self.inner + .services + .get_zone_bundle_path(name, id) + .await + .map_err(Error::from) + } + + /// List the zones that the sled agent is currently managing. + pub async fn zones_list(&self) -> Result, Error> { + self.inner.services.list_all_zones().await.map_err(Error::from) + } + /// Ensures that particular services should be initialized. /// /// These services will be instantiated by this function, will be recorded diff --git a/sled-agent/src/storage_manager.rs b/sled-agent/src/storage_manager.rs index c573dbe6a2..72548575ac 100644 --- a/sled-agent/src/storage_manager.rs +++ b/sled-agent/src/storage_manager.rs @@ -315,12 +315,14 @@ impl StorageWorker { let fs_name = &dataset_name.full(); let do_format = true; let encryption_details = None; + let quota = None; Zfs::ensure_filesystem( &dataset_name.full(), Mountpoint::Path(Utf8PathBuf::from("/data")), zoned, do_format, encryption_details, + quota, )?; // Ensure the dataset has a usable UUID. if let Ok(id_str) = Zfs::get_oxide_value(&fs_name, "uuid") { diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index 0e4bc5fbd2..f2f2f68040 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -204,10 +204,35 @@ pub struct Disk { zpool_name: ZpoolName, } +// Helper type for describing expected datasets and their optional quota. +#[derive(Clone, Copy, Debug)] +struct QuotaLimitedDataset { + // Name for the dataset + name: &'static str, + // Optional quota, in _bytes_ + quota: Option, +} + +impl QuotaLimitedDataset { + // Create a new dataset with quota. + const fn new(name: &'static str, quota: usize) -> Self { + QuotaLimitedDataset { name, quota: Some(quota) } + } + + // Create a new dataset with no quota. 
+ const fn no_quota(name: &'static str) -> Self { + Self { name, quota: None } + } +} + pub const INSTALL_DATASET: &'static str = "install"; pub const CRASH_DATASET: &'static str = "crash"; pub const CLUSTER_DATASET: &'static str = "cluster"; pub const CONFIG_DATASET: &'static str = "config"; +pub const DEBUG_DATASET: &'static str = "debug"; +// TODO-correctness: This value of 100GiB is a pretty wild guess, and should be +// tuned as needed. +pub const DEBUG_DATASET_QUOTA: usize = 100 * (1 << 30); // U.2 datasets live under the encrypted dataset and inherit encryption pub const ZONE_DATASET: &'static str = "crypt/zone"; @@ -216,30 +241,32 @@ pub const ZONE_DATASET: &'static str = "crypt/zone"; pub const CRYPT_DATASET: &'static str = "crypt"; const U2_EXPECTED_DATASET_COUNT: usize = 1; -static U2_EXPECTED_DATASETS: [&'static str; U2_EXPECTED_DATASET_COUNT] = [ +static U2_EXPECTED_DATASETS: [QuotaLimitedDataset; U2_EXPECTED_DATASET_COUNT] = [ // Stores filesystems for zones - ZONE_DATASET, + QuotaLimitedDataset::no_quota(ZONE_DATASET), ]; -const M2_EXPECTED_DATASET_COUNT: usize = 4; -static M2_EXPECTED_DATASETS: [&'static str; M2_EXPECTED_DATASET_COUNT] = [ +const M2_EXPECTED_DATASET_COUNT: usize = 5; +static M2_EXPECTED_DATASETS: [QuotaLimitedDataset; M2_EXPECTED_DATASET_COUNT] = [ // Stores software images. // // Should be duplicated to both M.2s. - INSTALL_DATASET, + QuotaLimitedDataset::no_quota(INSTALL_DATASET), // Stores crash dumps. - CRASH_DATASET, + QuotaLimitedDataset::no_quota(CRASH_DATASET), // Stores cluter configuration information. // // Should be duplicated to both M.2s. - CLUSTER_DATASET, + QuotaLimitedDataset::no_quota(CLUSTER_DATASET), // Stores configuration data, including: // - What services should be launched on this sled // - Information about how to initialize the Sled Agent // - (For scrimlets) RSS setup information // // Should be duplicated to both M.2s. 
- CONFIG_DATASET, + QuotaLimitedDataset::no_quota(CONFIG_DATASET), + // Store debugging data, such as service bundles. + QuotaLimitedDataset::new(DEBUG_DATASET, DEBUG_DATASET_QUOTA), ]; impl Disk { @@ -458,6 +485,7 @@ impl Disk { zoned, do_format, Some(encryption_details), + None, ); keyfile.zero_and_unlink().await.map_err(|error| { @@ -468,14 +496,15 @@ impl Disk { } for dataset in datasets.into_iter() { - let mountpoint = zpool_name.dataset_mountpoint(dataset); + let mountpoint = zpool_name.dataset_mountpoint(dataset.name); let encryption_details = None; Zfs::ensure_filesystem( - &format!("{}/{}", zpool_name, dataset), + &format!("{}/{}", zpool_name, dataset.name), Mountpoint::Path(mountpoint), zoned, do_format, encryption_details, + dataset.quota, )?; } Ok(()) From 23025f40c95f5c892c3f47382c21db8b718ee1aa Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Fri, 23 Jun 2023 05:06:47 +0000 Subject: [PATCH 2/4] Review feedback - mv zb.rs -> zone-bundle.rs - Add TOML extension to zone bundle metadata file - Return 404 on bad zone name - Typos, safety notes, and link to logadm(8) --- illumos-utils/src/running_zone.rs | 3 +++ sled-agent/src/bin/{zb.rs => zone-bundle.rs} | 9 +++++++-- sled-agent/src/services.rs | 16 ++++++++++++---- sled-agent/src/sled_agent.rs | 2 +- 4 files changed, 23 insertions(+), 7 deletions(-) rename sled-agent/src/bin/{zb.rs => zone-bundle.rs} (95%) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index fa85cd2883..ac46507663 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -880,6 +880,9 @@ impl RunningZone { // The rotated log files should have the same prefix as the current, but // with an index appended. We'll search the parent directory for // matching names, skipping the current file. + // + // See https://illumos.org/man/8/logadm for details on the naming + // conventions around these files. 
let dir = current_log_file.parent().unwrap(); let mut rotated_files = Vec::new(); for entry in dir.read_dir_utf8()? { diff --git a/sled-agent/src/bin/zb.rs b/sled-agent/src/bin/zone-bundle.rs similarity index 95% rename from sled-agent/src/bin/zb.rs rename to sled-agent/src/bin/zone-bundle.rs index 57df293ec4..d99bbf06b7 100644 --- a/sled-agent/src/bin/zb.rs +++ b/sled-agent/src/bin/zone-bundle.rs @@ -28,7 +28,7 @@ fn parse_log_level(s: &str) -> anyhow::Result { /// Operate on sled agent zone bundles. /// -/// Zoneb bundles are the collected state of a service zone. This includes +/// Zone bundles are the collected state of a service zone. This includes /// information about the processes running in the zone, and the system on which /// they're running. #[derive(Clone, Debug, Parser)] @@ -107,7 +107,12 @@ async fn main() -> anyhow::Result<()> { .context("failed to list zone bundles")? .into_inner(); for bundle in bundles { - println!("{}/{}", bundle.id.zone_name, bundle.id.bundle_id); + println!( + "{}/{} {}", + bundle.id.zone_name, + bundle.id.bundle_id, + bundle.time_created + ); } } Cmd::Create { zone_name } => { diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 046865f7c7..a41035a50d 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -258,6 +258,9 @@ const BUNDLE_DIRECTORY: &str = "bundle"; // The directory for zone bundles. const ZONE_BUNDLE_DIRECTORY: &str = "zone"; +// The name for zone bundle metadata files. +const ZONE_BUNDLE_METADATA_FILENAME: &str = "metadata.toml"; + // A wrapper around `ZoneRequest`, which allows it to be serialized // to a toml file. #[derive(Clone, serde::Serialize, serde::Deserialize)] @@ -2005,7 +2008,11 @@ impl ServiceManager { // Write the metadata file itself, in TOML format. 
let contents = toml::to_string(&zone_metadata)?; - insert_data(&mut builder, "metadata", contents.as_bytes())?; + insert_data( + &mut builder, + ZONE_BUNDLE_METADATA_FILENAME, + contents.as_bytes(), + )?; debug!( log, "wrote zone bundle metadata"; @@ -2113,8 +2120,9 @@ impl ServiceManager { // Copy any log files, current and rotated, into the tarball as // well. // - // Saftey: This is a log file, so we're sure it's a single, normal - // file and thus has a name. + // Safety: This pathbuf was retrieved by locating an existing file + // on the filesystem, so we're sure it has a name and the unwrap is + // safe. debug!( log, "appending current log file to zone bundle"; @@ -2232,7 +2240,7 @@ impl ServiceManager { .find(|entry| { entry .path() - .map(|p| p.to_str() == Some("metadata")) + .map(|p| p.to_str() == Some(ZONE_BUNDLE_METADATA_FILENAME)) .unwrap_or(false) }) else { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 97c3146059..59db79c14b 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -149,7 +149,7 @@ impl From for dropshot::HttpError { HttpError::for_unavail(None, inner.to_string()) } crate::services::BundleError::NoSuchZone { .. } => { - HttpError::for_bad_request(None, inner.to_string()) + HttpError::for_not_found(None, inner.to_string()) } _ => HttpError::for_internal_error(err.to_string()), }, From 41806ce3350f2fece34bbb71b150637592c2b899 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Mon, 26 Jun 2023 23:53:54 +0000 Subject: [PATCH 3/4] Do not configure Nexus for TLS by default --- nexus/examples/config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 5e514c4975..a116b31a5e 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -44,7 +44,7 @@ request_body_max_bytes = 1048576 # initialization. If you're using this config file, you're probably running a # simulated system. 
In that case, the initial certificate is provided to the # simulated sled agent (acting as RSS) via command-line arguments. -tls = true +#tls = true [deployment.dropshot_internal] # IP Address and TCP port on which to listen for the internal API From 17ac97342265d13b3f20f4010933688b028fc5e5 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 27 Jun 2023 05:07:32 +0000 Subject: [PATCH 4/4] Update sled-agent OpenAPI --- openapi/sled-agent.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index c6765b111c..8a3cb58a77 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -2621,9 +2621,15 @@ "type": "object", "properties": { "id": { - "$ref": "#/components/schemas/ZoneBundleId" + "description": "Identifier for this zone bundle", + "allOf": [ + { + "$ref": "#/components/schemas/ZoneBundleId" + } + ] }, "time_created": { + "description": "The time at which this zone bundle was created.", "type": "string", "format": "date-time" }