diff --git a/Cargo.lock b/Cargo.lock index e15afdfbab..fe003edbe4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5239,6 +5239,7 @@ dependencies = [ "dropshot", "expectorate", "flate2", + "flume", "futures", "gateway-client", "glob", diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index a70b3b4638..a16fe7630b 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -28,6 +28,7 @@ dpd-client.workspace = true display-error-chain.workspace = true dropshot.workspace = true flate2.workspace = true +flume.workspace = true futures.workspace = true glob.workspace = true hex.workspace = true diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 80c62be234..6d2f4c1372 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -13,16 +13,16 @@ use crate::params::{ InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, InstanceStateRequested, InstanceUnregisterResponse, }; +use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; +use omicron_common::api::external::ByteCount; use anyhow::anyhow; use illumos_utils::dladm::Etherstub; use illumos_utils::link::VnicAllocator; use illumos_utils::opte::PortManager; use illumos_utils::running_zone::ZoneBuilderFactory; -use illumos_utils::vmm_reservoir; -use omicron_common::api::external::ByteCount; use omicron_common::api::internal::nexus::InstanceRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::api::internal::nexus::VmmRuntimeState; @@ -30,7 +30,7 @@ use sled_storage::manager::StorageHandle; use slog::Logger; use std::collections::BTreeMap; use std::net::SocketAddr; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use tokio::sync::{mpsc, oneshot}; use uuid::Uuid; @@ -48,12 +48,6 @@ pub enum Error { #[error("OPTE port management error: {0}")] Opte(#[from] illumos_utils::opte::Error), - #[error("Failed to create reservoir: {0}")] - Reservoir(#[from] vmm_reservoir::Error), - - #[error("Invalid reservoir configuration: {0}")] - ReservoirConfig(String), - #[error("Cannot find data link: {0}")] Underlay(#[from] sled_hardware::underlay::Error), @@ -72,12 +66,6 @@ pub enum Error { RequestDropped(#[from] oneshot::error::RecvError), } -pub enum ReservoirMode { - None, - Size(u32), - Percentage(u8), -} - pub(crate) struct InstanceManagerServices { pub nexus_client: NexusClientWithResolver, pub vnic_allocator: VnicAllocator, @@ -90,14 +78,8 @@ pub(crate) struct InstanceManagerServices { // Describes the internals of the "InstanceManager", though most of the // instance manager's state exists within the "InstanceManagerRunner" structure. struct InstanceManagerInternal { - log: Logger, tx: mpsc::Sender, - // NOTE: Arguably, this field could be "owned" by the InstanceManagerRunner. - // It was not moved there, and the reservoir functions were not converted to - // use the message-passing interface (see: "InstanceManagerRequest") because - // callers of "get/set reservoir size" are not async, and (in the case of - // getting the size) they also do not expect a "Result" type. - reservoir_size: Mutex, + vmm_reservoir_manager: VmmReservoirManagerHandle, #[allow(dead_code)] runner_handle: tokio::task::JoinHandle<()>, @@ -110,6 +92,7 @@ pub struct InstanceManager { impl InstanceManager { /// Initializes a new [`InstanceManager`] object. + #[allow(clippy::too_many_arguments)] pub fn new( log: Logger, nexus_client: NexusClientWithResolver, @@ -118,6 +101,7 @@ impl InstanceManager { storage: StorageHandle, zone_bundler: ZoneBundler, zone_builder_factory: ZoneBuilderFactory, + vmm_reservoir_manager: VmmReservoirManagerHandle, ) -> Result { let (tx, rx) = mpsc::channel(QUEUE_SIZE); let (terminate_tx, terminate_rx) = mpsc::unbounded_channel(); @@ -142,91 +126,13 @@ impl InstanceManager { Ok(Self { inner: Arc::new(InstanceManagerInternal { - log, tx, - // no reservoir size set on startup - reservoir_size: Mutex::new(ByteCount::from_kibibytes_u32(0)), + vmm_reservoir_manager, runner_handle, }), }) } - /// Sets the VMM reservoir to the requested percentage of usable physical - /// RAM or to a size in MiB. Either mode will round down to the nearest - /// aligned size required by the control plane. - pub fn set_reservoir_size( - &self, - hardware: &sled_hardware::HardwareManager, - mode: ReservoirMode, - ) -> Result<(), Error> { - let hardware_physical_ram_bytes = hardware.usable_physical_ram_bytes(); - let req_bytes = match mode { - ReservoirMode::None => return Ok(()), - ReservoirMode::Size(mb) => { - let bytes = ByteCount::from_mebibytes_u32(mb).to_bytes(); - if bytes > hardware_physical_ram_bytes { - return Err(Error::ReservoirConfig(format!( - "cannot specify a reservoir of {bytes} bytes when \ - physical memory is {hardware_physical_ram_bytes} bytes", - ))); - } - bytes - } - ReservoirMode::Percentage(percent) => { - if !matches!(percent, 1..=99) { - return Err(Error::ReservoirConfig(format!( - "VMM reservoir percentage of {} must be between 0 and \ - 100", - percent - ))); - }; - (hardware_physical_ram_bytes as f64 * (percent as f64 / 100.0)) - .floor() as u64 - } - }; - - let req_bytes_aligned = vmm_reservoir::align_reservoir_size(req_bytes); - - if req_bytes_aligned == 0 { - warn!( - self.inner.log, - "Requested reservoir size of {} bytes < minimum aligned size \ - of {} bytes", - req_bytes, - vmm_reservoir::RESERVOIR_SZ_ALIGN - ); - return Ok(()); - } - - // The max ByteCount value is i64::MAX, which is ~8 million TiB. - // As this value is either a percentage of DRAM or a size in MiB - // represented as a u32, constructing this should always work. - let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap(); - if let ReservoirMode::Percentage(percent) = mode { - info!( - self.inner.log, - "{}% of {} physical ram = {} bytes)", - percent, - hardware_physical_ram_bytes, - req_bytes, - ); - } - info!( - self.inner.log, - "Setting reservoir size to {reservoir_size} bytes" - ); - vmm_reservoir::ReservoirControl::set(reservoir_size)?; - - *self.inner.reservoir_size.lock().unwrap() = reservoir_size; - - Ok(()) - } - - /// Returns the last-set size of the reservoir - pub fn reservoir_size(&self) -> ByteCount { - *self.inner.reservoir_size.lock().unwrap() - } - pub async fn ensure_registered( &self, instance_id: Uuid, @@ -379,6 +285,10 @@ impl InstanceManager { .map_err(|_| Error::FailedSendInstanceManagerClosed)?; rx.await? } + + pub fn reservoir_size(&self) -> ByteCount { + self.inner.vmm_reservoir_manager.reservoir_size() + } } // Most requests that can be sent to the "InstanceManagerRunner" task. diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 527b483ee8..bfc23b248d 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -40,6 +40,7 @@ mod smf_helper; mod storage_monitor; mod swap_device; mod updates; +mod vmm_reservoir; mod zone_bundle; #[cfg(test)] diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 1a634a6346..fc0115bc13 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -11,7 +11,7 @@ use crate::bootstrap::early_networking::{ }; use crate::bootstrap::params::{BaseboardId, StartSledAgentRequest}; use crate::config::Config; -use crate::instance_manager::{InstanceManager, ReservoirMode}; +use crate::instance_manager::InstanceManager; use crate::long_running_tasks::LongRunningTaskHandles; use crate::metrics::MetricsManager; use crate::nexus::{ConvertInto, NexusClientWithResolver, NexusRequestQueue}; @@ -25,6 +25,7 @@ use crate::params::{ use crate::services::{self, ServiceManager}; use crate::storage_monitor::UnderlayAccess; use crate::updates::{ConfigUpdates, UpdateManager}; +use crate::vmm_reservoir::{ReservoirMode, VmmReservoirManager}; use crate::zone_bundle; use crate::zone_bundle::BundleError; use bootstore::schemes::v0 as bootstore; @@ -413,6 +414,19 @@ impl SledAgent { .map_err(|_| ()) .expect("Failed to send to StorageMonitor"); + // Configure the VMM reservoir as either a percentage of DRAM or as an + // exact size in MiB. + let reservoir_mode = ReservoirMode::from_config( + config.vmm_reservoir_percentage, + config.vmm_reservoir_size_mb, + ); + + let vmm_reservoir_manager = VmmReservoirManager::spawn( + &log, + long_running_task_handles.hardware_manager.clone(), + reservoir_mode, + ); + let instances = InstanceManager::new( parent_log.clone(), nexus_client.clone(), @@ -421,41 +435,9 @@ impl SledAgent { storage_manager.clone(), long_running_task_handles.zone_bundler.clone(), ZoneBuilderFactory::default(), + vmm_reservoir_manager, )?; - // Configure the VMM reservoir as either a percentage of DRAM or as an - // exact size in MiB. - let reservoir_mode = match ( - config.vmm_reservoir_percentage, - config.vmm_reservoir_size_mb, - ) { - (None, None) => ReservoirMode::None, - (Some(p), None) => ReservoirMode::Percentage(p), - (None, Some(mb)) => ReservoirMode::Size(mb), - (Some(_), Some(_)) => panic!( - "only one of vmm_reservoir_percentage and \ - vmm_reservoir_size_mb is allowed" - ), - }; - - match reservoir_mode { - ReservoirMode::None => warn!(log, "Not using VMM reservoir"), - ReservoirMode::Size(0) | ReservoirMode::Percentage(0) => { - warn!(log, "Not using VMM reservoir (size 0 bytes requested)") - } - _ => { - instances - .set_reservoir_size( - &long_running_task_handles.hardware_manager, - reservoir_mode, - ) - .map_err(|e| { - error!(log, "Failed to setup VMM reservoir: {e}"); - e - })?; - } - } - let update_config = ConfigUpdates { zone_artifact_path: Utf8PathBuf::from("/opt/oxide"), }; diff --git a/sled-agent/src/vmm_reservoir.rs b/sled-agent/src/vmm_reservoir.rs new file mode 100644 index 0000000000..b44cb7efc4 --- /dev/null +++ b/sled-agent/src/vmm_reservoir.rs @@ -0,0 +1,256 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! A background thread for managing the VMM reservoir + +use illumos_utils::vmm_reservoir; +use omicron_common::api::external::ByteCount; +use slog::Logger; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::thread; +use tokio::sync::oneshot; + +use sled_hardware::HardwareManager; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Failed to create reservoir: {0}")] + Reservoir(#[from] vmm_reservoir::Error), + + #[error("Invalid reservoir configuration: {0}")] + ReservoirConfig(String), + + #[error("VmmReservoirManager is currently busy")] + Busy(ReservoirMode), + + #[error("VmmReservoirManager has shutdown")] + Shutdown(ReservoirMode), + + #[error( + "Communication error with VmmReservoirManager: ReplySenderDropped" + )] + ReplySenderDropped, +} + +#[derive(Debug)] +pub enum ReservoirMode { + None, + Size(u32), + Percentage(u8), +} + +/// A message sent from [`VmmReservoirManagerHandle`] to [`VmmReservoirManager`] +pub enum ReservoirManagerMsg { + SetReservoirSize { + mode: ReservoirMode, + reply_tx: oneshot::Sender>, + }, +} + +impl ReservoirMode { + /// Return a configuration of the VMM reservoir as either a percentage of + /// DRAM or as an exact size in MiB. + /// + /// Panic upon invalid configuration + pub fn from_config( + percentage: Option, + size_mb: Option, + ) -> ReservoirMode { + match (percentage, size_mb) { + (None, None) => ReservoirMode::None, + (Some(p), None) => ReservoirMode::Percentage(p), + (None, Some(mb)) => ReservoirMode::Size(mb), + (Some(_), Some(_)) => panic!( + "only one of vmm_reservoir_percentage and \ + vmm_reservoir_size_mb is allowed" + ), + } + } +} + +/// A mechanism to interact with the [`VmmReservoirManager`] +pub struct VmmReservoirManagerHandle { + reservoir_size: Arc, + tx: flume::Sender, + _manager_handle: thread::JoinHandle<()>, +} + +impl VmmReservoirManagerHandle { + /// Returns the last-set size of the reservoir + pub fn reservoir_size(&self) -> ByteCount { + self.reservoir_size.load(Ordering::SeqCst).try_into().unwrap() + } + + /// Tell the [`VmmReservoirManager`] to set the reservoir size and wait for + /// a response. + /// + /// Returns an error if the allocation fails or if the manager is currently + /// busy handling another request. + // + // It's anticipated this will be used to change the reservation + #[allow(unused)] + pub async fn set_reservoir_size( + &self, + mode: ReservoirMode, + ) -> Result<(), Error> { + let (tx, rx) = oneshot::channel(); + let msg = ReservoirManagerMsg::SetReservoirSize { mode, reply_tx: tx }; + if let Err(e) = self.tx.try_send(msg) { + let (e, is_shutdown) = match e { + flume::TrySendError::Full(e) => (e, false), + flume::TrySendError::Disconnected(e) => (e, true), + }; + let ReservoirManagerMsg::SetReservoirSize { mode, .. } = e; + if is_shutdown { + return Err(Error::Shutdown(mode)); + } else { + return Err(Error::Busy(mode)); + } + } + rx.await.map_err(|_| Error::ReplySenderDropped)? + } +} + +/// Manage the VMM reservoir in a background thread +pub struct VmmReservoirManager { + reservoir_size: Arc, + rx: flume::Receiver, + log: Logger, +} + +impl VmmReservoirManager { + pub fn run( + self, + hardware_manager: HardwareManager, + reservoir_mode: ReservoirMode, + ) { + match reservoir_mode { + ReservoirMode::None => warn!(self.log, "Not using VMM reservoir"), + ReservoirMode::Size(0) | ReservoirMode::Percentage(0) => { + warn!( + self.log, + "Not using VMM reservoir (size 0 bytes requested)" + ) + } + _ => { + if let Err(e) = + self.set_reservoir_size(&hardware_manager, reservoir_mode) + { + error!(self.log, "Failed to setup VMM reservoir: {e}"); + } + } + } + + while let Ok(msg) = self.rx.recv() { + let ReservoirManagerMsg::SetReservoirSize { mode, reply_tx } = msg; + match self.set_reservoir_size(&hardware_manager, mode) { + Ok(()) => { + let _ = reply_tx.send(Ok(())); + } + Err(e) => { + error!(self.log, "Failed to setup VMM reservoir: {e}"); + let _ = reply_tx.send(Err(e)); + } + } + } + } + + /// Sets the VMM reservoir to the requested percentage of usable physical + /// RAM or to a size in MiB. Either mode will round down to the nearest + /// aligned size required by the control plane. + pub fn set_reservoir_size( + &self, + hardware: &sled_hardware::HardwareManager, + mode: ReservoirMode, + ) -> Result<(), Error> { + let hardware_physical_ram_bytes = hardware.usable_physical_ram_bytes(); + let req_bytes = match mode { + ReservoirMode::None => return Ok(()), + ReservoirMode::Size(mb) => { + let bytes = ByteCount::from_mebibytes_u32(mb).to_bytes(); + if bytes > hardware_physical_ram_bytes { + return Err(Error::ReservoirConfig(format!( + "cannot specify a reservoir of {bytes} bytes when \ + physical memory is {hardware_physical_ram_bytes} bytes", + ))); + } + bytes + } + ReservoirMode::Percentage(percent) => { + if !matches!(percent, 1..=99) { + return Err(Error::ReservoirConfig(format!( + "VMM reservoir percentage of {} must be between 0 and \ + 100", + percent + ))); + }; + (hardware_physical_ram_bytes as f64 * (percent as f64 / 100.0)) + .floor() as u64 + } + }; + + let req_bytes_aligned = vmm_reservoir::align_reservoir_size(req_bytes); + + if req_bytes_aligned == 0 { + warn!( + self.log, + "Requested reservoir size of {} bytes < minimum aligned size \ + of {} bytes", + req_bytes, + vmm_reservoir::RESERVOIR_SZ_ALIGN + ); + return Ok(()); + } + + // The max ByteCount value is i64::MAX, which is ~8 million TiB. + // As this value is either a percentage of DRAM or a size in MiB + // represented as a u32, constructing this should always work. + let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap(); + if let ReservoirMode::Percentage(percent) = mode { + info!( + self.log, + "{}% of {} physical ram = {} bytes)", + percent, + hardware_physical_ram_bytes, + req_bytes, + ); + } + info!(self.log, "Setting reservoir size to {reservoir_size} bytes"); + vmm_reservoir::ReservoirControl::set(reservoir_size)?; + + self.reservoir_size.store(reservoir_size.to_bytes(), Ordering::SeqCst); + info!( + self.log, + "Finished setting reservoir size to {reservoir_size} bytes" + ); + + Ok(()) + } +} + +impl VmmReservoirManager { + pub fn spawn( + log: &Logger, + hardware_manager: HardwareManager, + reservoir_mode: ReservoirMode, + ) -> VmmReservoirManagerHandle { + let log = log.new(o!("component" => "VmmReservoirManager")); + // We use a rendevous channel to only allow one request at a time. + // Resizing a reservoir may block the thread for up to two minutes, so + // we want to ensure, it is complete before allowing another call. + let (tx, rx) = flume::bounded(0); + let reservoir_size = Arc::new(AtomicU64::new(0)); + let manager = VmmReservoirManager { + reservoir_size: reservoir_size.clone(), + rx, + log, + }; + + let _manager_handle = thread::spawn(move || { + manager.run(hardware_manager, reservoir_mode) + }); + VmmReservoirManagerHandle { reservoir_size, tx, _manager_handle } + } +}