-
Notifications
You must be signed in to change notification settings - Fork 40
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
sled-agent needs to deal with SSD overprovisioning #5158
Changes from 4 commits
1b56562
1d9cfee
e802cac
e708327
da05bdb
22745be
3010370
ea09e03
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,10 +4,14 @@ | |
|
||
//! illumos-specific mechanisms for parsing disk info. | ||
|
||
use std::collections::HashMap; | ||
use std::sync::OnceLock; | ||
|
||
use crate::illumos::gpt; | ||
use crate::{DiskPaths, DiskVariant, Partition, PooledDiskError}; | ||
use camino::Utf8Path; | ||
use illumos_utils::zpool::ZpoolName; | ||
use omicron_common::disk::DiskIdentity; | ||
use slog::info; | ||
use slog::Logger; | ||
use uuid::Uuid; | ||
|
@@ -17,6 +21,82 @@ use illumos_utils::zpool::MockZpool as Zpool; | |
#[cfg(not(test))] | ||
use illumos_utils::zpool::Zpool; | ||
|
||
/// NVMe devices use a meta size of 0 as we don't support writing addditional | ||
/// metadata | ||
static NVME_LBA_META_SIZE: u32 = 0; | ||
/// NVMe devices default to using 4k logical block addressing unless overriden. | ||
static DEFAULT_NVME_LBA_DATA_SIZE: u64 = 4096; | ||
|
||
/// NVMe device settings for a particular NVMe model. | ||
struct NvmeDeviceSettings { | ||
/// The desired disk size for dealing with overprovisioning. | ||
size: Option<u32>, | ||
/// An override for the default 4k LBA formatting. | ||
lba_data_size_override: Option<u64>, | ||
} | ||
|
||
static PREFERRED_NVME_DEVICE_SETTINGS: OnceLock< | ||
HashMap<&'static str, NvmeDeviceSettings>, | ||
> = OnceLock::new(); | ||
|
||
fn preferred_nvme_device_settings( | ||
) -> &'static HashMap<&'static str, NvmeDeviceSettings> { | ||
PREFERRED_NVME_DEVICE_SETTINGS.get_or_init(|| { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tiny nit: if having to do this dance is annoying, you could use https://docs.rs/once_cell/latest/once_cell/sync/struct.Lazy.html instead, so you could use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't feel strongly about this one. I am okay leaving it as is if that works for you. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, totally fine. |
||
HashMap::from([ | ||
// This disk ships with a size of 3200 from the factory, but we | ||
// still wish to apply 4K LBA formatting. | ||
( | ||
"WUS4C6432DSP3X3", | ||
NvmeDeviceSettings { size: None, lba_data_size_override: None }, | ||
), | ||
( | ||
"WUS5EA138ESP7E1", | ||
NvmeDeviceSettings { | ||
size: Some(3200), | ||
lba_data_size_override: None, | ||
}, | ||
), | ||
( | ||
"WUS5EA138ESP7E3", | ||
NvmeDeviceSettings { | ||
size: Some(3200), | ||
lba_data_size_override: None, | ||
}, | ||
), | ||
( | ||
"WUS5EA176ESP7E1", | ||
NvmeDeviceSettings { | ||
size: Some(6400), | ||
lba_data_size_override: None, | ||
}, | ||
), | ||
( | ||
"WUS5EA176ESP7E3", | ||
NvmeDeviceSettings { | ||
size: Some(6400), | ||
lba_data_size_override: None, | ||
}, | ||
), | ||
]) | ||
}) | ||
} | ||
|
||
#[derive(Debug, thiserror::Error)] | ||
pub enum NvmeFormattingError { | ||
#[error(transparent)] | ||
NvmeInit(#[from] libnvme::NvmeInitError), | ||
#[error(transparent)] | ||
Nvme(#[from] libnvme::NvmeError), | ||
#[error("Device is missing expected LBA format")] | ||
LbaFormatMissing, | ||
#[error("Device has {0} active namespaces but we expected 1")] | ||
UnexpectedNamespaces(usize), | ||
#[error(transparent)] | ||
InfoError(#[from] libnvme::controller_info::NvmeInfoError), | ||
#[error("Could not find NVMe controller for disk with serial {0}")] | ||
NoController(String), | ||
} | ||
|
||
// The expected layout of an M.2 device within the Oxide rack. | ||
// | ||
// Partitions beyond this "expected partition" array are ignored. | ||
|
@@ -79,8 +159,11 @@ pub fn ensure_partition_layout( | |
log: &Logger, | ||
paths: &DiskPaths, | ||
variant: DiskVariant, | ||
identity: &DiskIdentity, | ||
) -> Result<Vec<Partition>, PooledDiskError> { | ||
internal_ensure_partition_layout::<libefi_illumos::Gpt>(log, paths, variant) | ||
internal_ensure_partition_layout::<libefi_illumos::Gpt>( | ||
log, paths, variant, identity, | ||
) | ||
} | ||
|
||
// Same as the [ensure_partition_layout], but with generic parameters | ||
|
@@ -89,6 +172,7 @@ fn internal_ensure_partition_layout<GPT: gpt::LibEfiGpt>( | |
log: &Logger, | ||
paths: &DiskPaths, | ||
variant: DiskVariant, | ||
identity: &DiskIdentity, | ||
) -> Result<Vec<Partition>, PooledDiskError> { | ||
// Open the "Whole Disk" as a raw device to be parsed by the | ||
// libefi-illumos library. This lets us peek at the GPT before | ||
|
@@ -119,6 +203,11 @@ fn internal_ensure_partition_layout<GPT: gpt::LibEfiGpt>( | |
}; | ||
match variant { | ||
DiskVariant::U2 => { | ||
// First we need to check that this disk is of the proper | ||
// size and correct logical block address formatting. | ||
ensure_size_and_formatting(log, identity)?; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mostly for my own education: what does this do with disks that already have data on them? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't touch a disk with a label/zpool as changing the LBA format erases all data on the disk. This should only be preformed when StorageManger finds a new disk that has not been setup yet i.e first boot, add disk etc There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And I expect that when we transform to an explicit adoption model then we'll want this to not occur until that adoption process is initiated. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This only gets run if there is no GPT table. It's hidden by github above the diff. |
||
|
||
// If we were successful we can create a zpool on this disk. | ||
info!(log, "Formatting zpool on disk {}", paths.devfs_path); | ||
// If a zpool does not already exist, create one. | ||
let zpool_name = ZpoolName::new_external(Uuid::new_v4()); | ||
|
@@ -154,13 +243,124 @@ fn internal_ensure_partition_layout<GPT: gpt::LibEfiGpt>( | |
} | ||
} | ||
|
||
fn ensure_size_and_formatting( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there any problem running this over after failure on a reboot or restart of sled-agent? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So while testing I found that if a disk is left in the unattached state then sled-agent will ignore it. That's due to these lines. So, if we were to detach blkdev from the disk and then take a sled-agent reboot we wouldn't be notified of that disk again I believe. Now if we were in the process of formatting a disk and the box panicked or rebooted for some reason I don't know what would happen. Perhaps @rmustacc could shed some light on that failure scenario. Regardless I don't know what the best course of action is for the first failure mode, maybe we need an enhancement in how we detect disks from devinfo? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There is nothing persistent about a blkdev attach/detach, so the kernel will attempt to if it can. Because we don't support namespace management and aren't doing anything here, this should likely be okay, though I can't comment on what happens if a device fails in the middle of say a format nvm command.
I would suggest we file a follow up bug that moves this to looking for the NVMe node and not the blkdev node. This is for a few reasons:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed this as #5241 |
||
log: &Logger, | ||
identity: &DiskIdentity, | ||
) -> Result<(), NvmeFormattingError> { | ||
use libnvme::namespace::NamespaceDiscoveryLevel; | ||
use libnvme::Nvme; | ||
|
||
let mut controller_found = false; | ||
|
||
if let Some(nvme_settings) = | ||
preferred_nvme_device_settings().get(identity.model.as_str()) | ||
{ | ||
let nvme = Nvme::new()?; | ||
for controller in nvme.controller_discovery()?.into_iter() { | ||
let controller = controller?.write_lock().map_err(|(_, e)| e)?; | ||
let controller_info = controller.get_info()?; | ||
// Make sure we are operating on the correct NVMe device. | ||
if controller_info.serial() != identity.serial { | ||
continue; | ||
}; | ||
controller_found = true; | ||
let nsdisc = controller | ||
.namespace_discovery(NamespaceDiscoveryLevel::Active)?; | ||
let namespaces = | ||
nsdisc.into_iter().collect::<Result<Vec<_>, _>>()?; | ||
if namespaces.len() != 1 { | ||
return Err(NvmeFormattingError::UnexpectedNamespaces( | ||
namespaces.len(), | ||
)); | ||
} | ||
// Safe because verified there is exactly one namespace. | ||
let namespace = namespaces.into_iter().next().unwrap(); | ||
|
||
// TODO we need to abstract this detail awway if/when future devices | ||
// are not from WDC. | ||
let size = controller.wdc_resize_get()?; | ||
|
||
// First we need to detach blkdev from the namespace. | ||
namespace.blkdev_detach()?; | ||
|
||
// Resize the device if needed to ensure we get the expected | ||
// durability level in terms of drive writes per day. | ||
if let Some(wanted_size) = nvme_settings.size { | ||
if size != wanted_size { | ||
// TODO this also needs to be abstracted away. | ||
controller.wdc_resize_set(wanted_size)?; | ||
info!( | ||
log, | ||
"Resized {} from {size} to {wanted_size}", | ||
identity.serial, | ||
) | ||
} | ||
} | ||
|
||
// Find the LBA format we want to use for the device. | ||
let wanted_data_size = nvme_settings | ||
.lba_data_size_override | ||
.unwrap_or(DEFAULT_NVME_LBA_DATA_SIZE); | ||
let desired_lba = controller_info | ||
.lba_formats() | ||
.into_iter() | ||
.collect::<Result<Vec<_>, _>>()? | ||
.into_iter() | ||
.find(|lba| { | ||
lba.meta_size() == NVME_LBA_META_SIZE | ||
&& lba.data_size() == wanted_data_size | ||
}) | ||
.ok_or_else(|| NvmeFormattingError::LbaFormatMissing)?; | ||
|
||
// If the controller isn't formatted to our desired LBA we need to | ||
// issue a format request. | ||
let ns_info = namespace.get_info()?; | ||
let current_lba = ns_info.current_format()?; | ||
if current_lba.id() != desired_lba.id() { | ||
controller | ||
.format_request()? | ||
.set_lbaf(desired_lba.id())? | ||
// TODO map this to libnvme::BROADCAST_NAMESPACE once added | ||
.set_nsid(u32::MAX)? | ||
// No secure erase | ||
.set_ses(0)? | ||
.execute()?; | ||
|
||
info!( | ||
log, | ||
"Formatted {} using LBA with data size of {wanted_data_size}", | ||
identity.serial, | ||
); | ||
} | ||
|
||
// Attach blkdev to the namespace again | ||
namespace.blkdev_attach()?; | ||
} | ||
} else { | ||
info!( | ||
log, | ||
"There are no preferred NVMe settings for disk model {}; nothing to\ | ||
do for disk with serial {}", | ||
identity.model, | ||
identity.serial | ||
); | ||
return Ok(()); | ||
} | ||
|
||
if !controller_found { | ||
return Err(NvmeFormattingError::NoController(identity.serial.clone())); | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
||
#[cfg(test)] | ||
mod test { | ||
use super::*; | ||
use crate::DiskPaths; | ||
use camino::Utf8PathBuf; | ||
use illumos_utils::zpool::MockZpool; | ||
use omicron_test_utils::dev::test_setup_log; | ||
use omicron_test_utils::dev::{mock_device_identity, test_setup_log}; | ||
use std::path::Path; | ||
|
||
struct FakePartition { | ||
|
@@ -196,6 +396,7 @@ mod test { | |
&log, | ||
&DiskPaths { devfs_path, dev_path: None }, | ||
DiskVariant::U2, | ||
&mock_device_identity(), | ||
); | ||
match result { | ||
Err(PooledDiskError::CannotFormatMissingDevPath { .. }) => {} | ||
|
@@ -229,6 +430,7 @@ mod test { | |
dev_path: Some(Utf8PathBuf::from(DEV_PATH)), | ||
}, | ||
DiskVariant::U2, | ||
&mock_device_identity(), | ||
) | ||
.expect("Should have succeeded partitioning disk"); | ||
|
||
|
@@ -253,6 +455,7 @@ mod test { | |
dev_path: Some(Utf8PathBuf::from(DEV_PATH)) | ||
}, | ||
DiskVariant::M2, | ||
&mock_device_identity(), | ||
) | ||
.is_err()); | ||
|
||
|
@@ -290,6 +493,7 @@ mod test { | |
dev_path: Some(Utf8PathBuf::from(DEV_PATH)), | ||
}, | ||
DiskVariant::U2, | ||
&mock_device_identity(), | ||
) | ||
.expect("Should be able to parse disk"); | ||
|
||
|
@@ -332,6 +536,7 @@ mod test { | |
dev_path: Some(Utf8PathBuf::from(DEV_PATH)), | ||
}, | ||
DiskVariant::M2, | ||
&mock_device_identity(), | ||
) | ||
.expect("Should be able to parse disk"); | ||
|
||
|
@@ -371,6 +576,7 @@ mod test { | |
dev_path: Some(Utf8PathBuf::from(DEV_PATH)), | ||
}, | ||
DiskVariant::M2, | ||
&mock_device_identity(), | ||
) | ||
.expect_err("Should have failed parsing empty GPT"), | ||
PooledDiskError::BadPartitionLayout { .. } | ||
|
@@ -396,6 +602,7 @@ mod test { | |
dev_path: Some(Utf8PathBuf::from(DEV_PATH)), | ||
}, | ||
DiskVariant::U2, | ||
&mock_device_identity(), | ||
) | ||
.expect_err("Should have failed parsing empty GPT"), | ||
PooledDiskError::BadPartitionLayout { .. } | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nothing uses this today, so I am open to removing it from this PR in favor of not having dead code.