Skip to content

Commit

Permalink
Adds service bundles for zones
Browse files Browse the repository at this point in the history
- Adds a dataset to the M.2s for storing debugging data.
- Adds basic mechanism for setting a ZFS quota on datasets.
- Adds HTTP endpoints for listing, creating, and fetching zone service
  bundles from the sled agent.
- Adds methods to `ServiceManager` for implementing the above. Zone
  bundles run a set of commands to get the zone-wide output and some key
  process-specific data for relevant processes from an Oxide service
  zone. These are packed into a tarball along with a simple metadata
  file, describing the zone bundle.
- Adds some helper methods in `RunningZone` and related for listing the
  expected SMF service names and processes associated with them based on
  the zone's manifest files.
- Adds dev tool `zb` for talking to the sled agent to operate on zone
  bundles.
  • Loading branch information
bnaecker committed Jun 24, 2023
1 parent 6c07a0c commit 0efa21e
Show file tree
Hide file tree
Showing 15 changed files with 1,329 additions and 20 deletions.
26 changes: 26 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ http = "0.2.9"
httptest = "0.15.4"
hyper-rustls = "0.24.0"
hyper = "0.14"
hyper-staticfile = "0.9.5"
humantime = "2.1.0"
illumos-utils = { path = "illumos-utils" }
indexmap = "1.9.3"
Expand Down
147 changes: 146 additions & 1 deletion illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::zone::{AddressRequest, IPADM, ZONE_PREFIX};
use camino::{Utf8Path, Utf8PathBuf};
use ipnetwork::IpNetwork;
use omicron_common::backoff;
use slog::error;
use slog::info;
use slog::o;
use slog::warn;
Expand All @@ -24,6 +25,16 @@ use crate::zone::MockZones as Zones;
#[cfg(not(any(test, feature = "testing")))]
use crate::zone::Zones;

/// Errors returned from methods for fetching SMF services and log files
///
/// This is the error type of `RunningZone::service_processes`,
/// `RunningZone::service_names` and `RunningZone::service_log_files`.
#[derive(thiserror::Error, Debug)]
pub enum ServiceError {
/// An I/O operation failed, e.g. while reading the directory that holds a
/// service's log files. Converted from `std::io::Error` via `#[from]`, so
/// `?` can be used directly on I/O results.
#[error("I/O error")]
Io(#[from] std::io::Error),

/// A command could not be run. Converted from `RunCommandError` via
/// `#[from]`, so `?` can be used directly on `run_cmd` results.
#[error("Failed to run a command")]
RunCommand(#[from] RunCommandError),
}

/// Errors returned from [`RunningZone::run_cmd`].
#[derive(thiserror::Error, Debug)]
#[error("Error running command in zone '{zone}': {err}")]
Expand Down Expand Up @@ -762,6 +773,125 @@ impl RunningZone {
pub fn links(&self) -> &Vec<Link> {
// Hand out a borrow of the list stored on `self.inner`; nothing is copied.
&self.inner.links
}

/// Collect the processes backing every SMF service this zone is intended to
/// run.
///
/// Each service reported by `service_names` is queried with
/// `ptree -s <service>`, and every output line is read as
/// `<pid> <binary path> ...`. Lines mentioning `ctrun` are ignored. A line
/// that is missing a field, has an unparsable PID, or whose service has no
/// log files is logged as an error and skipped rather than failing the call.
///
/// # Errors
///
/// Returns an error if listing the services, running `ptree`, or looking up
/// a service's log files fails.
pub fn service_processes(
    &self,
) -> Result<Vec<ServiceProcess>, ServiceError> {
    let names = self.service_names()?;
    let mut found = Vec::with_capacity(names.len());
    for service_name in names {
        let ptree = self.run_cmd(["ptree", "-s", &service_name])?;

        // Every Oxide SMF service currently runs one binary, optionally
        // inside a contract set up by `ctrun`. The `ctrun` wrapper itself is
        // not interesting, so those lines are dropped before parsing.
        for entry in ptree.lines().filter(|l| !l.contains("ctrun")) {
            let mut fields = entry.trim().split_ascii_whitespace();

            // Field one is expected to be the PID.
            let pid_s = match fields.next() {
                Some(field) => field,
                None => {
                    error!(
                        self.inner.log,
                        "failed to get service PID from ptree output";
                        "service" => &service_name,
                    );
                    continue;
                }
            };
            let pid = match pid_s.parse::<u32>() {
                Ok(pid) => pid,
                Err(_) => {
                    error!(
                        self.inner.log,
                        "failed to parse service PID from ptree output";
                        "service" => &service_name,
                        "pid" => pid_s,
                    );
                    continue;
                }
            };

            // Field two is expected to be the path of the process binary.
            let binary = match fields.next() {
                Some(path) => Utf8PathBuf::from(path),
                None => {
                    error!(
                        self.inner.log,
                        "failed to get service binary from ptree output";
                        "service" => &service_name,
                    );
                    continue;
                }
            };

            // Look up the current and rotated log files for this service.
            let (log_file, rotated_log_files) =
                match self.service_log_files(&service_name)? {
                    Some(files) => files,
                    None => {
                        error!(
                            self.inner.log,
                            "failed to find log files for existing service";
                            "service_name" => &service_name,
                        );
                        continue;
                    }
                };

            found.push(ServiceProcess {
                service_name: service_name.clone(),
                binary,
                pid,
                log_file,
                rotated_log_files,
            });
        }
    }
    Ok(found)
}

/// List the FMRIs of the Oxide SMF services this zone is intended to run.
///
/// Runs `svcs -H -o fmri` in the zone and keeps every output line that
/// contains `/oxide` or `/system/illumos`, with surrounding whitespace
/// trimmed.
///
/// # Errors
///
/// Returns an error if the `svcs` command fails.
pub fn service_names(&self) -> Result<Vec<String>, ServiceError> {
    const NEEDLES: [&str; 2] = ["/oxide", "/system/illumos"];
    let listing = self.run_cmd(&["svcs", "-H", "-o", "fmri"])?;

    let mut names = Vec::new();
    for fmri in listing.lines() {
        let wanted = NEEDLES.iter().any(|needle| fmri.contains(needle));
        if wanted {
            names.push(fmri.trim().to_string());
        }
    }
    Ok(names)
}

/// Return any SMF log files associated with the named service.
///
/// Given a named service, this returns a tuple of the latest or current log
/// file, and an array of any rotated log files. `None` is returned when
/// `svcs -L` prints no log file path for the service.
///
/// All returned paths have the zone's root (`self.root()`) prepended exactly
/// once, so they can be opened from outside the zone.
///
/// # Errors
///
/// Returns an error if the `svcs -L` command fails, or if the directory
/// containing the current log file cannot be read.
pub fn service_log_files(
    &self,
    name: &str,
) -> Result<Option<(Utf8PathBuf, Vec<Utf8PathBuf>)>, ServiceError> {
    let output = self.run_cmd(&["svcs", "-L", name])?;
    let Some(current) = output.lines().next().map(str::trim) else {
        return Ok(None);
    };
    // A blank first line carries no path; treating it as a log file would
    // make the zone root itself look like the "current log".
    if current.is_empty() {
        return Ok(None);
    }

    // We need to prepend the zonepath root to get the path in the GZ. We
    // can do this with `join()`, but that will _replace_ the path if the
    // second one is absolute. So trim any prefixed `/` from the path.
    let root = self.root();
    let current_log_file = root.join(current.trim_start_matches('/'));

    // The rotated log files should have the same prefix as the current, but
    // with an index appended. We'll search the parent directory for
    // matching names, skipping the current file.
    let Some(dir) = current_log_file.parent() else {
        return Ok(None);
    };
    let mut rotated_files = Vec::new();
    for entry in dir.read_dir_utf8()? {
        let entry = entry?;
        let path = entry.path();

        // `Utf8Path::starts_with` only matches whole path components, so
        // `foo.log.0` never "starts with" `foo.log`. Compare the paths as
        // strings to get a true textual prefix match.
        if path != current_log_file
            && path.as_str().starts_with(current_log_file.as_str())
        {
            // `dir` is already below the zone root, so the entry's path
            // carries the root prefix; joining `root` again would double it.
            rotated_files.push(path.to_path_buf());
        }
    }
    Ok(Some((current_log_file, rotated_files)))
}
}

impl Drop for RunningZone {
Expand All @@ -783,6 +913,21 @@ impl Drop for RunningZone {
}
}

/// A process running in the zone associated with an SMF service.
///
/// Values are produced by `RunningZone::service_processes`, which builds one
/// per parsed line of `ptree -s <service>` output.
#[derive(Clone, Debug)]
pub struct ServiceProcess {
/// The name of the SMF service, as listed by `RunningZone::service_names`.
pub service_name: String,
/// The path of the binary in the process image, taken from the second
/// whitespace-separated field of the `ptree` line.
pub binary: Utf8PathBuf,
/// The PID of the process, parsed from the first field of the `ptree` line.
pub pid: u32,
/// The path for the current log file, with the zone's root prepended (see
/// `RunningZone::service_log_files`).
pub log_file: Utf8PathBuf,
/// The paths for any rotated log files.
pub rotated_log_files: Vec<Utf8PathBuf>,
}

/// Errors returned from [`InstalledZone::install`].
#[derive(thiserror::Error, Debug)]
pub enum InstallZoneError {
Expand Down Expand Up @@ -817,7 +962,7 @@ pub struct InstalledZone {
// NIC used for control plane communication.
control_vnic: Link,

// Nic used for bootstrap network communication
// NIC used for bootstrap network communication
bootstrap_vnic: Option<Link>,

// OPTE devices for the guest network interfaces
Expand Down
19 changes: 18 additions & 1 deletion illumos-utils/src/zfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,15 @@ impl Zfs {
}

/// Creates a new ZFS filesystem named `name`, unless one already exists.
///
/// Applies an optional quota, provided _in bytes_.
pub fn ensure_filesystem(
name: &str,
mountpoint: Mountpoint,
zoned: bool,
do_format: bool,
encryption_details: Option<EncryptionDetails>,
quota: Option<usize>,
) -> Result<(), EnsureFilesystemError> {
let (exists, mounted) = Self::dataset_exists(name, &mountpoint)?;
if exists {
Expand Down Expand Up @@ -225,9 +228,23 @@ impl Zfs {
cmd.args(&["-o", &format!("mountpoint={}", mountpoint), name]);
execute(cmd).map_err(|err| EnsureFilesystemError {
name: name.to_string(),
mountpoint,
mountpoint: mountpoint.clone(),
err: err.into(),
})?;

// Apply any quota.
if let Some(quota) = quota {
if let Err(err) =
Self::set_value(name, "quota", &format!("{quota}"))
{
return Err(EnsureFilesystemError {
name: name.to_string(),
mountpoint,
// Take the execution error from the SetValueError
err: err.err.into(),
});
}
}
Ok(())
}

Expand Down
2 changes: 1 addition & 1 deletion nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ request_body_max_bytes = 1048576
# initialization. If you're using this config file, you're probably running a
# simulated system. In that case, the initial certificate is provided to the
# simulated sled agent (acting as RSS) via command-line arguments.
#tls = true
tls = true

[deployment.dropshot_internal]
# IP Address and TCP port on which to listen for the internal API
Expand Down
Loading

0 comments on commit 0efa21e

Please sign in to comment.