sled-agent needs to deal with SSD overprovisioning #5158

Merged (8 commits, Mar 11, 2024)
15 changes: 15 additions & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions Cargo.toml
@@ -245,6 +245,7 @@ itertools = "0.12.1"
key-manager = { path = "key-manager" }
kstat-rs = "0.2.3"
libc = "0.2.153"
libnvme = { git = "https://github.com/oxidecomputer/libnvme", rev = "6fffcc81d2c423ed2d2e6c5c2827485554c4ecbe" }
linear-map = "1.2.0"
macaddr = { version = "1.0.1", features = ["serde_std"] }
maplit = "1.0.2"
1 change: 1 addition & 0 deletions sled-hardware/Cargo.toml
@@ -28,6 +28,7 @@ omicron-workspace-hack.workspace = true
[target.'cfg(target_os = "illumos")'.dependencies]
illumos-devinfo = { git = "https://github.com/oxidecomputer/illumos-devinfo", branch = "main" }
libefi-illumos = { git = "https://github.com/oxidecomputer/libefi-illumos", branch = "master" }
libnvme.workspace = true

[dev-dependencies]
illumos-utils = { workspace = true, features = ["testing"] }
6 changes: 5 additions & 1 deletion sled-hardware/src/disk.rs
@@ -38,6 +38,8 @@ pub enum PooledDiskError {
CannotFormatMissingDevPath { path: Utf8PathBuf },
#[error("Formatting M.2 devices is not yet implemented")]
CannotFormatM2NotImplemented,
#[error(transparent)]
NvmeFormatAndResize(#[from] NvmeFormattingError),
}

/// A partition (or 'slice') of a disk.
@@ -196,9 +198,11 @@ impl PooledDisk {
) -> Result<Self, PooledDiskError> {
let paths = &unparsed_disk.paths;
let variant = unparsed_disk.variant;
let identity = unparsed_disk.identity();
// Ensure the GPT has the right format. This does not necessarily
// mean that the partitions are populated with the data we need.
let partitions = ensure_partition_layout(&log, &paths, variant)?;
let partitions =
ensure_partition_layout(&log, &paths, variant, identity)?;

// Find the path to the zpool which exists on this disk.
//
2 changes: 1 addition & 1 deletion sled-hardware/src/illumos/mod.rs
@@ -26,7 +26,7 @@ mod gpt;
mod partitions;
mod sysconf;

pub use partitions::ensure_partition_layout;
pub use partitions::{ensure_partition_layout, NvmeFormattingError};

#[derive(thiserror::Error, Debug)]
enum Error {
199 changes: 197 additions & 2 deletions sled-hardware/src/illumos/partitions.rs
@@ -4,10 +4,14 @@

//! illumos-specific mechanisms for parsing disk info.

use std::collections::HashMap;
use std::sync::OnceLock;

use crate::illumos::gpt;
use crate::{DiskPaths, DiskVariant, Partition, PooledDiskError};
use camino::Utf8Path;
use illumos_utils::zpool::ZpoolName;
use omicron_common::disk::DiskIdentity;
use slog::info;
use slog::Logger;
use uuid::Uuid;
@@ -17,6 +21,70 @@ use illumos_utils::zpool::MockZpool as Zpool;
#[cfg(not(test))]
use illumos_utils::zpool::Zpool;

/// NVMe devices use a meta size of 0 as we don't support writing additional
/// metadata.
static NVME_LBA_META_SIZE: u32 = 0;
/// NVMe devices default to using 4k logical block addressing unless overridden.
static DEFAULT_NVME_LBA_DATA_SIZE: u64 = 4096;

/// NVMe device settings for a particular NVMe model.
struct NvmeDeviceSettings {
/// The desired disk size for dealing with overprovisioning.
size: u32,
/// An override for the default 4k LBA formatting.
lba_data_size_override: Option<u64>,
Contributor Author (papertigers): Nothing uses this today, so I am open to removing it from this PR in favor of not having dead code.

}

/// A mapping from model to desired settings.
/// A device not found in this lookup table will not be modified by sled-agent.
static PREFERRED_NVME_DEVICE_SETTINGS: OnceLock<
HashMap<&'static str, NvmeDeviceSettings>,
> = OnceLock::new();

fn preferred_nvme_device_settings(
) -> &'static HashMap<&'static str, NvmeDeviceSettings> {
PREFERRED_NVME_DEVICE_SETTINGS.get_or_init(|| {
Contributor:
Tiny nit: if having to do this dance is annoying, you could use https://docs.rs/once_cell/latest/once_cell/sync/struct.Lazy.html instead, so you could use PREFERRED_NVME_DEVICE_SETTINGS directly instead of having to go through this helper. We use it elsewhere, and eventually it will land in the std lib.

Contributor Author (papertigers):
I don't feel strongly about this one. I am okay leaving it as is if that works for you.

Contributor:
Yep, totally fine.
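
For illustration, a minimal sketch of the suggested Lazy approach (assuming a once_cell dependency and the NvmeDeviceSettings type above; most entries elided):

use once_cell::sync::Lazy;
use std::collections::HashMap;

// With Lazy, the map initializes itself on first access, so the static can
// be used directly and the helper function above becomes unnecessary.
static PREFERRED_NVME_DEVICE_SETTINGS: Lazy<
    HashMap<&'static str, NvmeDeviceSettings>,
> = Lazy::new(|| {
    HashMap::from([(
        "WUS4C6432DSP3X3",
        NvmeDeviceSettings { size: 3200, lba_data_size_override: None },
    )])
    // ...remaining entries as in the table below...
});

// Call sites then index the map directly, e.g.:
//   PREFERRED_NVME_DEVICE_SETTINGS.get(identity.model.as_str())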

HashMap::from([
(
"WUS4C6432DSP3X3",
NvmeDeviceSettings { size: 3200, lba_data_size_override: None },
Contributor (jgallagher):
Two thoughts, neither of which are necessarily relevant to landing this PR (and are kinda half baked; maybe @smklein has more thoughts here?):

  • This list being built into sled-agent means adding support for more disks requires an OS update. Is that okay?
  • If being baked into sled-agent is okay, do we want to define this in code, or would it make sense to put it in the sled-agent config.toml (which would require some work to pass it down into sled-hardware, admittedly).

rmustacc:

Those are good questions, @jgallagher.

Just to clarify, I think this means that if we want to support adopting a disk that needs non-default changes (i.e. something other than the 4K sector size) then we will need to do something. If we're going to move it at all, then we should probably make nexus own this list and tell sled-agent (not unreasonable per se). I think that'll fit better with the general firmware update and related flows that we've been discussing as well for disks. What I'd suggest is that for the moment we keep this here and then work to move it into nexus as part of the disk adoption path and make this something we can send down.

What I'd expect in terms of user interface flow over time is that we'd see something like:

  • sled-agent detects a new disk and tells nexus.
  • nexus will prompt the operator and if this is not a known / expected disk we will make the prompt to adopt very clear.
  • At adoption time, nexus will send down instructions on how to transform the disk; things like resizing are something we may want to do in the future if we, for example, feel good about undoing the overprovisioning.
  • This may mean that a nexus update is required for us not to show the scarier unsupported disk warning and it may still require an OS update to fully update sled-agent on the transformations required.

I'm not sure it's worth an intermediate step to put it in the toml file.

Contributor:
FWIW, I much prefer the nexus path laid out here to using the toml file. Thanks for the details @rmustacc.

Contributor Author (papertigers):
Thanks for the helpful writeup @rmustacc. What you outlined here makes sense to me and likely ties into some of the same work that's going to happen with nvme firmware upgrades. I can keep this in the back of my mind as I start to figure out what that's going to look like.

Collaborator (smklein):
After #5172 merges, I'm hoping this pathway will get much easier, but I totally agree with Robert here. Expect that Nexus will end up sending a struct to Sled Agent, basically saying "Please use (and potentially format) these disks" -- and Sled Agent won't try writing to storage until that happens.

I'm okay with this being an explicit lookup table in Sled Agent until we get to that point. I don't think this PR needs to block on 5172
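
As a rough illustration of that direction, a hypothetical sketch of such a struct (none of these names exist today; fields are invented for illustration):

use omicron_common::disk::DiskIdentity;

/// Hypothetical shape of a "please use (and potentially format) these
/// disks" instruction from Nexus. Not an existing API.
pub struct DiskAdoptionInstruction {
    /// Which disk (vendor/model/serial) the instruction applies to.
    pub identity: DiskIdentity,
    /// Optional resize target for overprovisioning (plays the role of
    /// NvmeDeviceSettings::size in the lookup table above).
    pub resize_to: Option<u32>,
    /// Optional LBA data size in bytes, if a non-default format is wanted.
    pub lba_data_size: Option<u64>,
}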

),
(
"WUS5EA138ESP7E1",
NvmeDeviceSettings { size: 3200, lba_data_size_override: None },
),
(
"WUS5EA138ESP7E3",
NvmeDeviceSettings { size: 3200, lba_data_size_override: None },
),
(
"WUS5EA176ESP7E1",
NvmeDeviceSettings { size: 6400, lba_data_size_override: None },
),
(
"WUS5EA176ESP7E3",
NvmeDeviceSettings { size: 6400, lba_data_size_override: None },
),
])
})
}

#[derive(Debug, thiserror::Error)]
pub enum NvmeFormattingError {
#[error(transparent)]
NvmeInit(#[from] libnvme::NvmeInitError),
#[error(transparent)]
Nvme(#[from] libnvme::NvmeError),
#[error("Device is missing expected LBA format")]
LbaFormatMissing,
#[error("Device has {0} active namespaces but we expected 1")]
UnexpectedNamespaces(usize),
#[error(transparent)]
InfoError(#[from] libnvme::controller_info::NvmeInfoError),
#[error("Could not find NVMe controller for disk with serial {0}")]
NoController(String),
}

// The expected layout of an M.2 device within the Oxide rack.
//
// Partitions beyond this "expected partition" array are ignored.
@@ -79,8 +147,11 @@ pub fn ensure_partition_layout(
log: &Logger,
paths: &DiskPaths,
variant: DiskVariant,
identity: &DiskIdentity,
) -> Result<Vec<Partition>, PooledDiskError> {
internal_ensure_partition_layout::<libefi_illumos::Gpt>(log, paths, variant)
internal_ensure_partition_layout::<libefi_illumos::Gpt>(
log, paths, variant, identity,
)
}

// Same as the [ensure_partition_layout], but with generic parameters
@@ -89,6 +160,7 @@ fn internal_ensure_partition_layout<GPT: gpt::LibEfiGpt>(
log: &Logger,
paths: &DiskPaths,
variant: DiskVariant,
identity: &DiskIdentity,
) -> Result<Vec<Partition>, PooledDiskError> {
// Open the "Whole Disk" as a raw device to be parsed by the
// libefi-illumos library. This lets us peek at the GPT before
@@ -119,6 +191,11 @@
};
match variant {
DiskVariant::U2 => {
// First we need to check that this disk is of the proper
// size and correct logical block address formatting.
ensure_size_and_formatting(log, identity)?;
Contributor:
Mostly for my own education: what does this do with disks that already have data on them?

Contributor Author (papertigers):
We don't touch a disk with a label/zpool, as changing the LBA format erases all data on the disk. This should only be performed when StorageManager finds a new disk that has not been set up yet, i.e. first boot, adding a disk, etc.

rmustacc:

And I expect that when we transform to an explicit adoption model then we'll want this to not occur until that adoption process is initiated.

Contributor:
This only gets run if there is no GPT table; it's hidden by GitHub above the diff.
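
For readers without the hidden context, a simplified sketch of that guard (not the verbatim source; the real logic lives in internal_ensure_partition_layout):

// Sketch only: ensure_size_and_formatting() is reached exclusively on the
// "no GPT found" path, so disks that already carry a label/zpool are
// never resized or reformatted.
match GPT::read(&path) {
    Ok(_gpt) => {
        // An existing GPT was found: parse and validate the partitions
        // as before; the formatting path is never taken.
    }
    Err(_) => {
        // Brand-new disk (first boot, newly added, etc.): safe to adjust
        // size and LBA formatting before the zpool is created.
        ensure_size_and_formatting(log, identity)?;
        // ...then create the zpool on the disk.
    }
}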


// If we were successful we can create a zpool on this disk.
info!(log, "Formatting zpool on disk {}", paths.devfs_path);
// If a zpool does not already exist, create one.
let zpool_name = ZpoolName::new_external(Uuid::new_v4());
@@ -154,13 +231,124 @@ fn internal_ensure_partition_layout<GPT: gpt::LibEfiGpt>(
}
}

fn ensure_size_and_formatting(
Contributor:
Is there any problem with this running again after a failure, on a reboot or restart of sled-agent?

Contributor Author (papertigers):
So while testing I found that if a disk is left in the unattached state then sled-agent will ignore it. That's due to these lines. So, if we were to detach blkdev from the disk and then take a sled-agent reboot, we wouldn't be notified of that disk again, I believe.

Now if we were in the process of formatting a disk and the box panicked or rebooted for some reason, I don't know what would happen. Perhaps @rmustacc could shed some light on that failure scenario.

Regardless, I don't know what the best course of action is for the first failure mode; maybe we need an enhancement in how we detect disks from devinfo?

rmustacc:

> Now if we were in the process of formatting a disk and the box panicked or rebooted for some reason, I don't know what would happen. Perhaps @rmustacc could shed some light on that failure scenario.

There is nothing persistent about a blkdev attach/detach, so the kernel will attempt to reattach if it can. Because we don't support namespace management and aren't doing anything here, this should likely be okay, though I can't comment on what happens if a device fails in the middle of, say, a format nvm command.

> Regardless, I don't know what the best course of action is for the first failure mode; maybe we need an enhancement in how we detect disks from devinfo?

I would suggest we file a follow up bug that moves this to looking for the NVMe node and not the blkdev node. This is for a few reasons:

  • An NVMe device may not have any blkdev instances attached, as mentioned at the start of this comment.
  • An NVMe device may have the wrong number of namespaces created if it came from another system. This would show up as two distinct block devices or possibly no block devices, which would likely be confusing.

Contributor Author (papertigers), Mar 11, 2024:

Filed this as #5241

log: &Logger,
identity: &DiskIdentity,
) -> Result<(), NvmeFormattingError> {
use libnvme::namespace::NamespaceDiscoveryLevel;
use libnvme::Nvme;

let mut controller_found = false;

if let Some(nvme_settings) =
preferred_nvme_device_settings().get(identity.model.as_str())
{
let nvme = Nvme::new()?;
for controller in nvme.controller_discovery()? {
let controller = controller?.write_lock().map_err(|(_, e)| e)?;
let controller_info = controller.get_info()?;
// Make sure we are operating on the correct NVMe device.
if controller_info.serial() != identity.serial {
continue;
};
controller_found = true;
let nsdisc = controller
.namespace_discovery(NamespaceDiscoveryLevel::Active)?;
let namespaces =
nsdisc.into_iter().collect::<Result<Vec<_>, _>>()?;
if namespaces.len() != 1 {
return Err(NvmeFormattingError::UnexpectedNamespaces(
namespaces.len(),
));
}
// Safe because verified there is exactly one namespace.
let namespace = namespaces.into_iter().next().unwrap();

// NB: Only some vendors such as WDC support adjusting the size
// of the disk to deal with overprovisioning. This will need to be
// abstracted away if/when we ever start using another vendor with
// this capability.
let size = controller.wdc_resize_get()?;

// First we need to detach blkdev from the namespace.
namespace.blkdev_detach()?;

// Resize the device if needed to ensure we get the expected
// durability level in terms of drive writes per day.
if size != nvme_settings.size {
controller.wdc_resize_set(nvme_settings.size)?;
info!(
log,
"Resized {} from {size} to {}",
identity.serial,
nvme_settings.size
)
}

// Find the LBA format we want to use for the device.
let wanted_data_size = nvme_settings
.lba_data_size_override
.unwrap_or(DEFAULT_NVME_LBA_DATA_SIZE);
let desired_lba = controller_info
.lba_formats()
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.find(|lba| {
lba.meta_size() == NVME_LBA_META_SIZE
&& lba.data_size() == wanted_data_size
})
.ok_or_else(|| NvmeFormattingError::LbaFormatMissing)?;

// If the controller isn't formatted to our desired LBA we need to
// issue a format request.
let ns_info = namespace.get_info()?;
let current_lba = ns_info.current_format()?;
if current_lba.id() != desired_lba.id() {
controller
.format_request()?
.set_lbaf(desired_lba.id())?
// TODO map this to libnvme::BROADCAST_NAMESPACE once added
.set_nsid(u32::MAX)?
// No secure erase
.set_ses(0)?
.execute()?;

info!(
log,
"Formatted disk with serial {} to an LBA with data size \
{wanted_data_size}",
identity.serial,
);
}

// Attach blkdev to the namespace again
namespace.blkdev_attach()?;
}
} else {
info!(
log,
"There are no preferred NVMe settings for disk model {}; nothing to \
do for disk with serial {}",
identity.model,
identity.serial
);
Collaborator (smklein), commenting on lines +327 to +334:
Thanks for handling this case -- this means that all the file-backed vdevs in testing will still work.

return Ok(());
}

if !controller_found {
return Err(NvmeFormattingError::NoController(identity.serial.clone()));
}

Ok(())
}

#[cfg(test)]
mod test {
use super::*;
use crate::DiskPaths;
use camino::Utf8PathBuf;
use illumos_utils::zpool::MockZpool;
use omicron_test_utils::dev::test_setup_log;
use omicron_test_utils::dev::{mock_disk_identity, test_setup_log};
use std::path::Path;

struct FakePartition {
@@ -196,6 +384,7 @@ mod test {
&log,
&DiskPaths { devfs_path, dev_path: None },
DiskVariant::U2,
&mock_disk_identity(),
);
match result {
Err(PooledDiskError::CannotFormatMissingDevPath { .. }) => {}
@@ -229,6 +418,7 @@
dev_path: Some(Utf8PathBuf::from(DEV_PATH)),
},
DiskVariant::U2,
&mock_disk_identity(),
)
.expect("Should have succeeded partitioning disk");

@@ -253,6 +443,7 @@
dev_path: Some(Utf8PathBuf::from(DEV_PATH))
},
DiskVariant::M2,
&mock_disk_identity(),
)
.is_err());

@@ -290,6 +481,7 @@
dev_path: Some(Utf8PathBuf::from(DEV_PATH)),
},
DiskVariant::U2,
&mock_disk_identity(),
)
.expect("Should be able to parse disk");

@@ -332,6 +524,7 @@
dev_path: Some(Utf8PathBuf::from(DEV_PATH)),
},
DiskVariant::M2,
&mock_disk_identity(),
)
.expect("Should be able to parse disk");

@@ -371,6 +564,7 @@
dev_path: Some(Utf8PathBuf::from(DEV_PATH)),
},
DiskVariant::M2,
&mock_disk_identity(),
)
.expect_err("Should have failed parsing empty GPT"),
PooledDiskError::BadPartitionLayout { .. }
@@ -396,6 +590,7 @@
dev_path: Some(Utf8PathBuf::from(DEV_PATH)),
},
DiskVariant::U2,
&mock_disk_identity(),
)
.expect_err("Should have failed parsing empty GPT"),
PooledDiskError::BadPartitionLayout { .. }