diff --git a/Cargo.lock b/Cargo.lock index 85b7e5a186..a77a1e17d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3858,6 +3858,20 @@ dependencies = [ "tracing", ] +[[package]] +name = "libnvme" +version = "0.1.0" +source = "git+https://github.com/oxidecomputer/libnvme?rev=6fffcc81d2c423ed2d2e6c5c2827485554c4ecbe#6fffcc81d2c423ed2d2e6c5c2827485554c4ecbe" +dependencies = [ + "libnvme-sys", + "thiserror", +] + +[[package]] +name = "libnvme-sys" +version = "0.0.0" +source = "git+https://github.com/oxidecomputer/libnvme?rev=6fffcc81d2c423ed2d2e6c5c2827485554c4ecbe#6fffcc81d2c423ed2d2e6c5c2827485554c4ecbe" + [[package]] name = "libsw" version = "3.3.1" @@ -8187,6 +8201,7 @@ dependencies = [ "illumos-utils", "libc", "libefi-illumos", + "libnvme", "macaddr", "omicron-common", "omicron-test-utils", diff --git a/Cargo.toml b/Cargo.toml index db37547ea0..31768de24f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -245,6 +245,7 @@ itertools = "0.12.1" key-manager = { path = "key-manager" } kstat-rs = "0.2.3" libc = "0.2.153" +libnvme = { git = "https://github.com/oxidecomputer/libnvme", rev = "6fffcc81d2c423ed2d2e6c5c2827485554c4ecbe" } linear-map = "1.2.0" macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" diff --git a/sled-hardware/Cargo.toml b/sled-hardware/Cargo.toml index 3d1259f46f..24a49ae714 100644 --- a/sled-hardware/Cargo.toml +++ b/sled-hardware/Cargo.toml @@ -28,6 +28,7 @@ omicron-workspace-hack.workspace = true [target.'cfg(target_os = "illumos")'.dependencies] illumos-devinfo = { git = "https://github.com/oxidecomputer/illumos-devinfo", branch = "main" } libefi-illumos = { git = "https://github.com/oxidecomputer/libefi-illumos", branch = "master" } +libnvme.workspace = true [dev-dependencies] illumos-utils = { workspace = true, features = ["testing"] } diff --git a/sled-hardware/src/disk.rs b/sled-hardware/src/disk.rs index 44658658be..6a4c968f14 100644 --- a/sled-hardware/src/disk.rs +++ b/sled-hardware/src/disk.rs @@ -38,6 +38,8 @@ pub enum 
PooledDiskError { CannotFormatMissingDevPath { path: Utf8PathBuf }, #[error("Formatting M.2 devices is not yet implemented")] CannotFormatM2NotImplemented, + #[error(transparent)] + NvmeFormatAndResize(#[from] NvmeFormattingError), } /// A partition (or 'slice') of a disk. @@ -196,9 +198,11 @@ impl PooledDisk { ) -> Result { let paths = &unparsed_disk.paths; let variant = unparsed_disk.variant; + let identity = unparsed_disk.identity(); // Ensure the GPT has the right format. This does not necessarily // mean that the partitions are populated with the data we need. - let partitions = ensure_partition_layout(&log, &paths, variant)?; + let partitions = + ensure_partition_layout(&log, &paths, variant, identity)?; // Find the path to the zpool which exists on this disk. // diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs index ebc9a9c2b0..f2db424bda 100644 --- a/sled-hardware/src/illumos/mod.rs +++ b/sled-hardware/src/illumos/mod.rs @@ -26,7 +26,7 @@ mod gpt; mod partitions; mod sysconf; -pub use partitions::ensure_partition_layout; +pub use partitions::{ensure_partition_layout, NvmeFormattingError}; #[derive(thiserror::Error, Debug)] enum Error { diff --git a/sled-hardware/src/illumos/partitions.rs b/sled-hardware/src/illumos/partitions.rs index 29b2466ad9..3b8e0af2ee 100644 --- a/sled-hardware/src/illumos/partitions.rs +++ b/sled-hardware/src/illumos/partitions.rs @@ -4,10 +4,14 @@ //! illumos-specific mechanisms for parsing disk info. 
/// Metadata size required of the LBA format we select.
///
/// NVMe devices use a metadata size of 0 as we don't support writing
/// additional metadata.
const NVME_LBA_META_SIZE: u32 = 0;

/// LBA data size used when a model has no explicit override.
///
/// NVMe devices default to using 4k logical block addressing unless
/// overridden.
const DEFAULT_NVME_LBA_DATA_SIZE: u64 = 4096;

/// NVMe device settings for a particular NVMe model.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct NvmeDeviceSettings {
    /// The desired disk size for dealing with overprovisioning.
    ///
    /// NOTE(review): the unit is whatever the vendor resize command expects
    /// (presumably gigabytes) -- confirm against the libnvme documentation.
    size: u32,
    /// An override for the default 4k LBA formatting; `None` means
    /// [`DEFAULT_NVME_LBA_DATA_SIZE`] is used.
    lba_data_size_override: Option<u64>,
}
+static PREFERRED_NVME_DEVICE_SETTINGS: OnceLock< + HashMap<&'static str, NvmeDeviceSettings>, +> = OnceLock::new(); + +fn preferred_nvme_device_settings( +) -> &'static HashMap<&'static str, NvmeDeviceSettings> { + PREFERRED_NVME_DEVICE_SETTINGS.get_or_init(|| { + HashMap::from([ + ( + "WUS4C6432DSP3X3", + NvmeDeviceSettings { size: 3200, lba_data_size_override: None }, + ), + ( + "WUS5EA138ESP7E1", + NvmeDeviceSettings { size: 3200, lba_data_size_override: None }, + ), + ( + "WUS5EA138ESP7E3", + NvmeDeviceSettings { size: 3200, lba_data_size_override: None }, + ), + ( + "WUS5EA176ESP7E1", + NvmeDeviceSettings { size: 6400, lba_data_size_override: None }, + ), + ( + "WUS5EA176ESP7E3", + NvmeDeviceSettings { size: 6400, lba_data_size_override: None }, + ), + ]) + }) +} + +#[derive(Debug, thiserror::Error)] +pub enum NvmeFormattingError { + #[error(transparent)] + NvmeInit(#[from] libnvme::NvmeInitError), + #[error(transparent)] + Nvme(#[from] libnvme::NvmeError), + #[error("Device is missing expected LBA format")] + LbaFormatMissing, + #[error("Device has {0} active namespaces but we expected 1")] + UnexpectedNamespaces(usize), + #[error(transparent)] + InfoError(#[from] libnvme::controller_info::NvmeInfoError), + #[error("Could not find NVMe controller for disk with serial {0}")] + NoController(String), +} + // The expected layout of an M.2 device within the Oxide rack. // // Partitions beyond this "expected partition" array are ignored. 
@@ -79,8 +147,11 @@ pub fn ensure_partition_layout( log: &Logger, paths: &DiskPaths, variant: DiskVariant, + identity: &DiskIdentity, ) -> Result, PooledDiskError> { - internal_ensure_partition_layout::(log, paths, variant) + internal_ensure_partition_layout::( + log, paths, variant, identity, + ) } // Same as the [ensure_partition_layout], but with generic parameters @@ -89,6 +160,7 @@ fn internal_ensure_partition_layout( log: &Logger, paths: &DiskPaths, variant: DiskVariant, + identity: &DiskIdentity, ) -> Result, PooledDiskError> { // Open the "Whole Disk" as a raw device to be parsed by the // libefi-illumos library. This lets us peek at the GPT before @@ -119,6 +191,11 @@ fn internal_ensure_partition_layout( }; match variant { DiskVariant::U2 => { + // First we need to check that this disk is of the proper + // size and correct logical block address formatting. + ensure_size_and_formatting(log, identity)?; + + // If we were successful we can create a zpool on this disk. info!(log, "Formatting zpool on disk {}", paths.devfs_path); // If a zpool does not already exist, create one. let zpool_name = ZpoolName::new_external(Uuid::new_v4()); @@ -154,13 +231,124 @@ fn internal_ensure_partition_layout( } } +fn ensure_size_and_formatting( + log: &Logger, + identity: &DiskIdentity, +) -> Result<(), NvmeFormattingError> { + use libnvme::namespace::NamespaceDiscoveryLevel; + use libnvme::Nvme; + + let mut controller_found = false; + + if let Some(nvme_settings) = + preferred_nvme_device_settings().get(identity.model.as_str()) + { + let nvme = Nvme::new()?; + for controller in nvme.controller_discovery()? { + let controller = controller?.write_lock().map_err(|(_, e)| e)?; + let controller_info = controller.get_info()?; + // Make sure we are operating on the correct NVMe device. 
+ if controller_info.serial() != identity.serial { + continue; + }; + controller_found = true; + let nsdisc = controller + .namespace_discovery(NamespaceDiscoveryLevel::Active)?; + let namespaces = + nsdisc.into_iter().collect::, _>>()?; + if namespaces.len() != 1 { + return Err(NvmeFormattingError::UnexpectedNamespaces( + namespaces.len(), + )); + } + // Safe because verified there is exactly one namespace. + let namespace = namespaces.into_iter().next().unwrap(); + + // NB: Only some vendors such as WDC support adjusting the size + // of the disk to deal with overprovisioning. This will need to be + // abstracted away if/when we ever start using another vendor with + // this capability. + let size = controller.wdc_resize_get()?; + + // First we need to detach blkdev from the namespace. + namespace.blkdev_detach()?; + + // Resize the device if needed to ensure we get the expected + // durability level in terms of drive writes per day. + if size != nvme_settings.size { + controller.wdc_resize_set(nvme_settings.size)?; + info!( + log, + "Resized {} from {size} to {}", + identity.serial, + nvme_settings.size + ) + } + + // Find the LBA format we want to use for the device. + let wanted_data_size = nvme_settings + .lba_data_size_override + .unwrap_or(DEFAULT_NVME_LBA_DATA_SIZE); + let desired_lba = controller_info + .lba_formats() + .collect::, _>>()? + .into_iter() + .find(|lba| { + lba.meta_size() == NVME_LBA_META_SIZE + && lba.data_size() == wanted_data_size + }) + .ok_or_else(|| NvmeFormattingError::LbaFormatMissing)?; + + // If the controller isn't formatted to our desired LBA we need to + // issue a format request. + let ns_info = namespace.get_info()?; + let current_lba = ns_info.current_format()?; + if current_lba.id() != desired_lba.id() { + controller + .format_request()? + .set_lbaf(desired_lba.id())? + // TODO map this to libnvme::BROADCAST_NAMESPACE once added + .set_nsid(u32::MAX)? + // No secure erase + .set_ses(0)? 
+ .execute()?; + + info!( + log, + "Formatted disk with serial {} to an LBA with data size \ + {wanted_data_size}", + identity.serial, + ); + } + + // Attach blkdev to the namespace again + namespace.blkdev_attach()?; + } + } else { + info!( + log, + "There are no preferred NVMe settings for disk model {}; nothing to\ + do for disk with serial {}", + identity.model, + identity.serial + ); + return Ok(()); + } + + if !controller_found { + return Err(NvmeFormattingError::NoController(identity.serial.clone())); + } + + Ok(()) +} + #[cfg(test)] mod test { use super::*; use crate::DiskPaths; use camino::Utf8PathBuf; use illumos_utils::zpool::MockZpool; - use omicron_test_utils::dev::test_setup_log; + use omicron_test_utils::dev::{mock_disk_identity, test_setup_log}; use std::path::Path; struct FakePartition { @@ -196,6 +384,7 @@ mod test { &log, &DiskPaths { devfs_path, dev_path: None }, DiskVariant::U2, + &mock_disk_identity(), ); match result { Err(PooledDiskError::CannotFormatMissingDevPath { .. }) => {} @@ -229,6 +418,7 @@ mod test { dev_path: Some(Utf8PathBuf::from(DEV_PATH)), }, DiskVariant::U2, + &mock_disk_identity(), ) .expect("Should have succeeded partitioning disk"); @@ -253,6 +443,7 @@ mod test { dev_path: Some(Utf8PathBuf::from(DEV_PATH)) }, DiskVariant::M2, + &mock_disk_identity(), ) .is_err()); @@ -290,6 +481,7 @@ mod test { dev_path: Some(Utf8PathBuf::from(DEV_PATH)), }, DiskVariant::U2, + &mock_disk_identity(), ) .expect("Should be able to parse disk"); @@ -332,6 +524,7 @@ mod test { dev_path: Some(Utf8PathBuf::from(DEV_PATH)), }, DiskVariant::M2, + &mock_disk_identity(), ) .expect("Should be able to parse disk"); @@ -371,6 +564,7 @@ mod test { dev_path: Some(Utf8PathBuf::from(DEV_PATH)), }, DiskVariant::M2, + &mock_disk_identity(), ) .expect_err("Should have failed parsing empty GPT"), PooledDiskError::BadPartitionLayout { .. 
} @@ -396,6 +590,7 @@ mod test { dev_path: Some(Utf8PathBuf::from(DEV_PATH)), }, DiskVariant::U2, + &mock_disk_identity(), ) .expect_err("Should have failed parsing empty GPT"), PooledDiskError::BadPartitionLayout { .. } diff --git a/sled-hardware/src/non_illumos/mod.rs b/sled-hardware/src/non_illumos/mod.rs index d8372dd8aa..8518aae495 100644 --- a/sled-hardware/src/non_illumos/mod.rs +++ b/sled-hardware/src/non_illumos/mod.rs @@ -6,10 +6,17 @@ use crate::disk::{ DiskPaths, DiskVariant, Partition, PooledDiskError, UnparsedDisk, }; use crate::{Baseboard, SledMode}; +use omicron_common::disk::DiskIdentity; use slog::Logger; use std::collections::HashSet; use tokio::sync::broadcast; +#[derive(Debug, thiserror::Error)] +pub enum NvmeFormattingError { + #[error("NVMe formatting is unsupported on this platform")] + UnsupportedPlatform, +} + /// An unimplemented, stub representation of the underlying hardware. /// /// This is intended for non-illumos systems to have roughly the same interface @@ -59,6 +66,7 @@ pub fn ensure_partition_layout( _log: &Logger, _paths: &DiskPaths, _variant: DiskVariant, + _identity: &DiskIdentity, ) -> Result, PooledDiskError> { unimplemented!("Accessing hardware unsupported on non-illumos"); } diff --git a/test-utils/src/dev/mod.rs b/test-utils/src/dev/mod.rs index e29da9c51e..705d6ff08c 100644 --- a/test-utils/src/dev/mod.rs +++ b/test-utils/src/dev/mod.rs @@ -20,6 +20,7 @@ pub use dropshot::test_util::LogContext; use dropshot::ConfigLogging; use dropshot::ConfigLoggingIfExists; use dropshot::ConfigLoggingLevel; +use omicron_common::disk::DiskIdentity; use slog::Logger; use std::io::BufReader; @@ -145,3 +146,13 @@ pub fn process_running(pid: u32) -> bool { // only checks whether the process is running. 0 == (unsafe { libc::kill(pid as libc::pid_t, 0) }) } + +/// Returns a DiskIdentity that can be passed to ensure_partition_layout when +/// not operating on a real disk. 
+pub fn mock_disk_identity() -> DiskIdentity { + DiskIdentity { + vendor: "MockVendor".to_string(), + serial: "MOCKSERIAL".to_string(), + model: "MOCKMODEL".to_string(), + } +}