diff --git a/Cargo.toml b/Cargo.toml
index 697b8dc..448a93c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,3 +1,3 @@
 [workspace]
-members = ["src/api", "src/vmm", "src/cli", "src/fs-gen", "src/agent"]
+members = ["src/agent", "src/api", "src/cli", "src/fs-gen", "src/vmm"]
 resolver = "2"
diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml
index ef68bd7..b549196 100644
--- a/src/vmm/Cargo.toml
+++ b/src/vmm/Cargo.toml
@@ -13,6 +13,7 @@ rust-version = "1.76.0"
 clap = { version = "4.5.1", features = ["derive", "env"] }
 clap-verbosity-flag = "2.2.0"
 epoll = "4.3.3"
+event-manager = { version = "0.4.0", features = ["remote_endpoint"] }
 kvm-bindings = { version = "0.7.0", features = ["fam-wrappers"] }
 kvm-ioctls = "0.16.0"
 libc = "0.2.153"
@@ -21,15 +22,18 @@ log = "0.4.20"
 nix = { version = "0.28.0", features = ["term"] }
 openpty = "0.2.0"
 prost = "0.11"
+tokio = { version = "1.37.0", features = ["full"] }
 tonic = "0.9"
 tracing = "0.1.40"
 tracing-subscriber = "0.3.18"
 virtio-bindings = "0.2.2"
+virtio-device = { git = "https://github.com/rust-vmm/vm-virtio.git" }
+virtio-queue = { git = "https://github.com/rust-vmm/vm-virtio.git" }
+vm-allocator = "0.1.0"
 vm-device = "0.1.0"
-vm-memory = { version = "0.14.0", features = ["backend-mmap"] }
+vm-memory = { version = "0.14.1", features = ["backend-mmap"] }
 vm-superio = "0.7.0"
 vmm-sys-util = "0.12.1"
-tokio = { version= "1.37.0", features= ["full"]}
 
 [build-dependencies]
-tonic-build = "0.9"
\ No newline at end of file
+tonic-build = "0.9"
diff --git a/src/vmm/src/core/cpu/mod.rs b/src/vmm/src/core/cpu/mod.rs
index 25cd303..d87baf0 100644
--- a/src/vmm/src/core/cpu/mod.rs
+++ b/src/vmm/src/core/cpu/mod.rs
@@ -13,6 +13,8 @@ use std::sync::{Arc, Mutex};
 use std::{io, process};
 use std::{result, u64};
 use tracing::{error, info, warn};
+use vm_device::bus::MmioAddress;
+use vm_device::device_manager::{IoManager, MmioManager};
 use vm_memory::{Address, Bytes, GuestAddress, GuestMemoryError, GuestMemoryMmap};
 use vmm_sys_util::terminal::Terminal;
 
@@ -73,6 +75,7 @@ pub(crate) struct Vcpu {
 
     /// KVM file descriptor for a vCPU.
     pub vcpu_fd: VcpuFd,
+    device_mgr: Arc>,
     serial: Arc>>,
     slip_pty: Arc>,
 }
@@ -82,12 +85,14 @@ impl Vcpu {
     pub fn new(
         vm_fd: &VmFd,
         index: u64,
+        device_mgr: Arc>,
         serial: Arc>>,
         slip_pty: Arc>,
     ) -> Result {
         Ok(Vcpu {
             index,
             vcpu_fd: vm_fd.create_vcpu(index).map_err(Error::KvmIoctl)?,
+            device_mgr,
             serial,
             slip_pty,
         })
@@ -308,6 +313,30 @@ impl Vcpu {
                             warn!(address = addr, "Unsupported device read at {:x?}", addr);
                         }
                     },
+                    VcpuExit::MmioRead(addr, data) => {
+                        // Block until the device manager is free: `try_lock().unwrap()` would
+                        // panic with `WouldBlock` whenever another thread holds the lock.
+                        if self
+                            .device_mgr
+                            .lock()
+                            .expect("device manager mutex poisoned")
+                            .mmio_read(MmioAddress(addr), data)
+                            .is_err()
+                        {
+                            error!("Failed to read from mmio addr={} data={:#?}", addr, data);
+                        }
+                    }
+                    VcpuExit::MmioWrite(addr, data) => {
+                        if self
+                            .device_mgr
+                            .lock()
+                            .expect("device manager mutex poisoned")
+                            .mmio_write(MmioAddress(addr), data)
+                            .is_err()
+                        {
+                            error!("Failed to write to mmio");
+                        }
+                    }
                     _ => {
                         error!(?exit_reason, "Unhandled VM-Exit");
                     }
diff --git a/src/vmm/src/core/devices/mod.rs b/src/vmm/src/core/devices/mod.rs
index ee70983..ddae99e 100644
--- a/src/vmm/src/core/devices/mod.rs
+++ b/src/vmm/src/core/devices/mod.rs
@@ -3,6 +3,7 @@
 use std::io;
 
 pub(crate) mod serial;
+pub(crate) mod virtio;
 
 #[derive(Debug)]
 /// Devices errors.
diff --git a/src/vmm/src/core/devices/virtio/mod.rs b/src/vmm/src/core/devices/virtio/mod.rs new file mode 100644 index 0000000..5882bee --- /dev/null +++ b/src/vmm/src/core/devices/virtio/mod.rs @@ -0,0 +1,189 @@ +pub mod net; +mod register; + +use event_manager::{ + Error as EvmgrError, MutEventSubscriber, RemoteEndpoint, Result as EvmgrResult, SubscriberId, +}; +use kvm_ioctls::{IoEventAddress, VmFd}; +use libc::EFD_NONBLOCK; +use std::{ + io, + sync::{ + atomic::{AtomicU8, Ordering}, + Arc, Mutex, + }, +}; +use virtio_device::VirtioConfig; +use virtio_queue::{Queue, QueueT}; +use vm_device::bus::{self, MmioRange}; +use vmm_sys_util::{errno, eventfd::EventFd}; + +// Device-independent virtio features. +mod features { + pub const VIRTIO_F_RING_EVENT_IDX: u64 = 29; + pub const VIRTIO_F_VERSION_1: u64 = 32; +} + +// This bit is set on the device interrupt status when notifying the driver about used +// queue events. +// TODO: There seem to be similar semantics when the PCI transport is used with MSI-X cap +// disabled. Let's figure out at some point if having MMIO as part of the name is necessary. +const VIRTIO_MMIO_INT_VRING: u8 = 0x01; + +// The driver will write to the register at this offset in the MMIO region to notify the device +// about available queue events. +const VIRTIO_MMIO_QUEUE_NOTIFY_OFFSET: u64 = 0x50; + +// TODO: Make configurable for each device maybe? +const QUEUE_MAX_SIZE: u16 = 256; + +#[derive(Debug)] +#[allow(dead_code)] +pub enum Error { + AlreadyActivated, + BadFeatures(u64), + Bus(bus::Error), + Cmdline(linux_loader::cmdline::Error), + Endpoint(EvmgrError), + EventFd(io::Error), + Overflow, + IoEvent, + QueuesNotValid, + RegisterIoevent(errno::Error), + RegisterIrqfd(errno::Error), + RegisterMmioDevice(bus::Error), + Conversion, + Mutex, + Net, +} + +pub type Result = std::result::Result; +pub type Subscriber = Arc>; + +#[derive(Copy, Clone)] +pub struct MmioConfig { + pub range: MmioRange, + // The interrupt assigned to the device. 
+    pub gsi: u32,
+}
+
+pub struct Config {
+    virtio: VirtioConfig,
+    pub mmio: MmioConfig,
+    endpoint: RemoteEndpoint,
+    vm_fd: Arc,
+    pub irqfd: Arc,
+}
+
+impl Config {
+    pub fn new(
+        virtio: VirtioConfig,
+        mmio: MmioConfig,
+        endpoint: RemoteEndpoint,
+        vm_fd: Arc,
+    ) -> Result {
+        let irqfd = Arc::new(EventFd::new(EFD_NONBLOCK).map_err(Error::EventFd)?);
+
+        // vm_fd
+        //     .register_irqfd(&irqfd, mmio.gsi)
+        //     .map_err(Error::RegisterIrqfd)?;
+
+        Ok(Self {
+            virtio,
+            mmio,
+            endpoint,
+            vm_fd,
+            irqfd,
+        })
+    }
+
+    // Perform common initial steps for device activation based on the configuration, and return
+    // a `Vec` that contains `EventFd`s registered as ioeventfds, which are used to convey queue
+    // notifications coming from the driver. Fails with `QueuesNotValid` if any queue is not ready.
+    pub fn prepare_activate(&self) -> Result> {
+        if self.virtio.queues.iter().any(|queue| !queue.ready()) {
+            return Err(Error::QueuesNotValid);
+        }
+
+        if self.virtio.device_activated {
+            return Err(Error::AlreadyActivated);
+        }
+
+        // We do not support legacy drivers.
+        if self.virtio.driver_features & (1 << features::VIRTIO_F_VERSION_1) == 0 {
+            return Err(Error::BadFeatures(self.virtio.driver_features));
+        }
+
+        let mut ioevents = Vec::new();
+
+        // Right now, we operate under the assumption all queues are marked ready by the device
+        // (which is true until we start supporting devices that can optionally make use of
+        // additional queues on top of the defaults).
+        for i in 0..self.virtio.queues.len() {
+            let fd = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFd)?;
+
+            // Register the queue event fd.
+            self.vm_fd
+                .register_ioevent(
+                    &fd,
+                    &IoEventAddress::Mmio(
+                        self.mmio.range.base().0 + VIRTIO_MMIO_QUEUE_NOTIFY_OFFSET,
+                    ),
+                    // The maximum number of queues should fit within an `u16` according to the
+                    // standard, so the conversion below is always expected to succeed.
+                    u32::try_from(i).map_err(|_| Error::Conversion)?,
+                )
+                .map_err(Error::RegisterIoevent)?;
+
+            ioevents.push(fd);
+        }
+
+        Ok(ioevents)
+    }
+
+    // Perform the final steps of device activation based on the inner configuration and the
+    // provided subscriber that's going to handle the device queues. We'll extend this when
+    // we start supporting devices that make use of multiple handlers (i.e. for multiple queues).
+    pub fn finalize_activate(&mut self, handler: Subscriber) -> Result<()> {
+        // Register the queue handler with the `EventManager`. We could record the `sub_id`
+        // (and/or keep a handler clone) for further interaction (i.e. to remove the subscriber at
+        // a later time, retrieve state, etc).
+        let _sub_id = self
+            .endpoint
+            .call_blocking(move |mgr| -> EvmgrResult {
+                Ok(mgr.add_subscriber(handler))
+            })
+            .map_err(Error::Endpoint)?;
+
+        self.virtio.device_activated = true;
+
+        Ok(())
+    }
+}
+
+/// Simple trait to model the operation of signalling the driver about used events
+/// for the specified queue.
+// TODO: Does this need renaming to be relevant for packed queues as well?
+pub trait SignalUsedQueue {
+    // TODO: Should this return an error? This failing is not really recoverable at the interface
+    // level so the expectation is the implementation handles that transparently somehow.
+    fn signal_used_queue(&self, index: u16);
+}
+
+/// Uses a single irqfd as the basis of signalling any queue (useful for the MMIO transport,
+/// where a single interrupt is shared for everything).
+pub struct SingleFdSignalQueue { + pub irqfd: Arc, + pub interrupt_status: Arc, +} + +impl SignalUsedQueue for SingleFdSignalQueue { + fn signal_used_queue(&self, _index: u16) { + self.interrupt_status + .fetch_or(VIRTIO_MMIO_INT_VRING, Ordering::SeqCst); + + self.irqfd + .write(1) + .expect("Failed write to eventfd when signalling queue"); + } +} diff --git a/src/vmm/src/core/devices/virtio/net/device.rs b/src/vmm/src/core/devices/virtio/net/device.rs new file mode 100644 index 0000000..d388564 --- /dev/null +++ b/src/vmm/src/core/devices/virtio/net/device.rs @@ -0,0 +1,158 @@ +use super::queue_handler::QueueHandler; +use super::{ + simple_handler::SimpleHandler, tuntap::tap::Tap, Error, Result, NET_DEVICE_ID, + VIRTIO_NET_HDR_SIZE, +}; +use crate::core::devices::virtio::features::VIRTIO_F_RING_EVENT_IDX; +use crate::core::devices::virtio::net::tuntap::open_tap::open_tap; +use crate::core::devices::virtio::register::register_mmio_device; +use crate::core::devices::virtio::{ + self, Config, MmioConfig, SingleFdSignalQueue, Subscriber, QUEUE_MAX_SIZE, +}; +use event_manager::RemoteEndpoint; +use kvm_ioctls::VmFd; +use std::net::Ipv4Addr; +use std::{ + borrow::{Borrow, BorrowMut}, + sync::{Arc, Mutex}, +}; +use virtio_bindings::{ + virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}, + virtio_net::{ + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, + VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_HOST_TSO4, + VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, + }, +}; +use virtio_device::{VirtioConfig, VirtioDeviceActions, VirtioDeviceType, VirtioMmioDevice}; +use virtio_queue::{Queue, QueueT}; +use vm_device::device_manager::IoManager; +use vm_device::{bus::MmioAddress, MutDeviceMmio}; +use vm_memory::GuestMemoryMmap; + +pub struct Net { + mem: Arc, + pub config: Config, + tap: Arc>, +} + +impl Net { + #[allow(clippy::too_many_arguments)] + pub fn new( + mem: Arc, + device_mgr: Arc>, + mmio_cfg: MmioConfig, + ip_addr: Ipv4Addr, + 
mask: Ipv4Addr, + irq: u32, + endpoint: RemoteEndpoint, + vm_fd: Arc, + cmdline_extra_parameters: &mut Vec, + ) -> Result>> { + let device_features = (1 << VIRTIO_F_VERSION_1) + | (1 << VIRTIO_F_RING_EVENT_IDX) + | (1 << VIRTIO_F_IN_ORDER) + | (1 << VIRTIO_NET_F_CSUM) + | (1 << VIRTIO_NET_F_GUEST_CSUM) + | (1 << VIRTIO_NET_F_GUEST_TSO4) + | (1 << VIRTIO_NET_F_GUEST_TSO6) + | (1 << VIRTIO_NET_F_GUEST_UFO) + | (1 << VIRTIO_NET_F_HOST_TSO4) + | (1 << VIRTIO_NET_F_HOST_TSO6) + | (1 << VIRTIO_NET_F_HOST_UFO); + + let config_space = Vec::new(); + let queues = vec![ + Queue::new(QUEUE_MAX_SIZE).map_err(|_| Error::Virtio(virtio::Error::QueuesNotValid))?, + Queue::new(QUEUE_MAX_SIZE).map_err(|_| Error::Virtio(virtio::Error::QueuesNotValid))?, + ]; + + let virtio_cfg = VirtioConfig::new(device_features, queues, config_space); + + let cfg = Config::new(virtio_cfg, mmio_cfg, endpoint, vm_fd).map_err(Error::Virtio)?; + + // Set offload flags to match the relevant virtio features of the device (for now, + // statically set in the constructor. + let tap = open_tap(None, Some(ip_addr), Some(mask), &mut None, None, None) + .map_err(Error::TunTap)?; + + // The layout of the header is specified in the standard and is 12 bytes in size. We + // should define this somewhere. 
+ tap.set_vnet_hdr_size(VIRTIO_NET_HDR_SIZE as i32) + .map_err(Error::Tap)?; + + let net = Arc::new(Mutex::new(Net { + mem, + config: cfg, + tap: Arc::new(Mutex::new(tap)), + })); + + let param = register_mmio_device(mmio_cfg, device_mgr, irq, None, net.clone()) + .map_err(Error::Virtio)?; + cmdline_extra_parameters.push(param); + + Ok(net) + } +} + +impl VirtioDeviceType for Net { + fn device_type(&self) -> u32 { + NET_DEVICE_ID + } +} + +impl Borrow> for Net { + fn borrow(&self) -> &VirtioConfig { + &self.config.virtio + } +} + +impl BorrowMut> for Net { + fn borrow_mut(&mut self) -> &mut VirtioConfig { + &mut self.config.virtio + } +} + +impl VirtioDeviceActions for Net { + type E = Error; + + fn activate(&mut self) -> Result<()> { + let driver_notify = SingleFdSignalQueue { + irqfd: self.config.irqfd.clone(), + interrupt_status: self.config.virtio.interrupt_status.clone(), + }; + + let mut ioevents = self.config.prepare_activate().map_err(Error::Virtio)?; + + let rxq = self.config.virtio.queues.remove(0); + let txq = self.config.virtio.queues.remove(0); + let inner = SimpleHandler::new(driver_notify, rxq, txq, self.tap.clone(), self.mem.clone()); + + let handler = Arc::new(Mutex::new(QueueHandler { + inner, + rx_ioevent: ioevents.remove(0), + tx_ioevent: ioevents.remove(0), + })); + + self.config + .finalize_activate(handler) + .map_err(Error::Virtio) + } + + fn reset(&mut self) -> std::result::Result<(), Error> { + // Not implemented for now. 
+ Ok(()) + } +} + +impl VirtioMmioDevice for Net {} + +impl MutDeviceMmio for Net { + fn mmio_read(&mut self, _base: MmioAddress, offset: u64, data: &mut [u8]) { + self.read(offset, data); + } + + fn mmio_write(&mut self, _base: MmioAddress, offset: u64, data: &[u8]) { + self.write(offset, data); + } +} diff --git a/src/vmm/src/core/devices/virtio/net/mod.rs b/src/vmm/src/core/devices/virtio/net/mod.rs new file mode 100644 index 0000000..765986e --- /dev/null +++ b/src/vmm/src/core/devices/virtio/net/mod.rs @@ -0,0 +1,22 @@ +pub mod device; +mod queue_handler; +mod simple_handler; +pub mod tuntap; + +use crate::core::devices::virtio; + +use self::tuntap::{open_tap, tap}; + +const NET_DEVICE_ID: u32 = 1; +const VIRTIO_NET_HDR_SIZE: usize = 12; +const RXQ_INDEX: u16 = 0; +const TXQ_INDEX: u16 = 1; + +#[derive(Debug)] +pub enum Error { + Virtio(virtio::Error), + TunTap(open_tap::Error), + Tap(tap::Error), +} + +pub type Result = std::result::Result; diff --git a/src/vmm/src/core/devices/virtio/net/queue_handler.rs b/src/vmm/src/core/devices/virtio/net/queue_handler.rs new file mode 100644 index 0000000..34d7ed4 --- /dev/null +++ b/src/vmm/src/core/devices/virtio/net/queue_handler.rs @@ -0,0 +1,114 @@ +// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+use super::simple_handler::SimpleHandler;
+use crate::core::devices::virtio::SignalUsedQueue;
+use event_manager::{EventOps, Events, MutEventSubscriber};
+use log::error;
+use std::os::fd::AsRawFd;
+use vmm_sys_util::epoll::EventSet;
+use vmm_sys_util::eventfd::EventFd;
+
+const TAPFD_DATA: u32 = 0;
+const RX_IOEVENT_DATA: u32 = 1;
+const TX_IOEVENT_DATA: u32 = 2;
+
+pub struct QueueHandler
+where
+    S: SignalUsedQueue,
+{
+    pub inner: SimpleHandler,
+    pub rx_ioevent: EventFd,
+    pub tx_ioevent: EventFd,
+}
+
+impl QueueHandler
+where
+    S: SignalUsedQueue,
+{
+    // Helper method that receives an error message to be logged and the `ops` handle
+    // which is used to unregister all events.
+    fn handle_error>(&self, s: A, ops: &mut EventOps) {
+        error!("{}", s.as_ref());
+        ops.remove(Events::empty(&self.rx_ioevent))
+            .expect("Failed to remove rx ioevent");
+        ops.remove(Events::empty(&self.tx_ioevent))
+            .expect("Failed to remove tx ioevent");
+        ops.remove(Events::empty(
+            &self
+                .inner
+                .tap
+                .lock()
+                .expect("Failed to lock tap resource")
+                .as_raw_fd(),
+        ))
+        .expect("Failed to remove tap event");
+    }
+}
+
+impl MutEventSubscriber for QueueHandler
+where
+    S: SignalUsedQueue,
+{
+    fn process(&mut self, events: Events, ops: &mut EventOps) {
+        // TODO: We can also consider panicking on the errors that cannot be generated
+        // or influenced.
+
+        if events.event_set() != EventSet::IN {
+            self.handle_error("Unexpected event_set", ops);
+            return;
+        }
+
+        match events.data() {
+            TAPFD_DATA => {
+                if let Err(e) = self.inner.process_tap() {
+                    self.handle_error(format!("Process tap error {:?}", e), ops);
+                }
+            }
+            RX_IOEVENT_DATA => {
+                if self.rx_ioevent.read().is_err() {
+                    self.handle_error("Rx ioevent read", ops);
+                } else if let Err(e) = self.inner.process_rxq() {
+                    self.handle_error(format!("Process rx error {:?}", e), ops);
+                }
+            }
+            TX_IOEVENT_DATA => {
+                // Stop on a failed read: `handle_error` already unregistered every event.
+                if self.tx_ioevent.read().is_err() {
+                    self.handle_error("Tx ioevent read", ops);
+                } else if let Err(e) = self.inner.process_txq() {
+                    self.handle_error(format!("Process tx error {:?}", e), ops);
+                }
+            }
+            _ => self.handle_error("Unexpected data", ops),
+        }
+    }
+
+    fn init(&mut self, ops: &mut EventOps) {
+        ops.add(Events::with_data(
+            &self
+                .inner
+                .tap
+                .lock()
+                .expect("Failed to lock tap resource")
+                .as_raw_fd(),
+            TAPFD_DATA,
+            EventSet::IN | EventSet::EDGE_TRIGGERED,
+        ))
+        .expect("Unable to add tapfd");
+
+        ops.add(Events::with_data(
+            &self.rx_ioevent,
+            RX_IOEVENT_DATA,
+            EventSet::IN,
+        ))
+        .expect("Unable to add rxfd");
+
+        ops.add(Events::with_data(
+            &self.tx_ioevent,
+            TX_IOEVENT_DATA,
+            EventSet::IN,
+        ))
+        .expect("Unable to add txfd");
+    }
+}
diff --git a/src/vmm/src/core/devices/virtio/net/simple_handler.rs b/src/vmm/src/core/devices/virtio/net/simple_handler.rs
new file mode 100644
index 0000000..cfeb1ea
--- /dev/null
+++ b/src/vmm/src/core/devices/virtio/net/simple_handler.rs
@@ -0,0 +1,217 @@
+// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +use std::cmp; +use std::io::{self, Read, Write}; +use std::result; +use std::sync::{Arc, Mutex}; + +use log::warn; +use virtio_queue::{DescriptorChain, Queue, QueueOwnedT, QueueT}; +use vm_memory::{Bytes, GuestAddressSpace, GuestMemoryMmap}; + +use super::tuntap::tap::Tap; +use super::{RXQ_INDEX, TXQ_INDEX}; +use crate::core::devices::virtio::SignalUsedQueue; + +// use crate::virtio::net::tap::Tap; +// use crate::virtio::net::{RXQ_INDEX, TXQ_INDEX}; +// use crate::virtio::SignalUsedQueue; + +// According to the standard: "If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or +// VIRTIO_NET_F_GUEST_UFO features are used, the maximum incoming packet will be to 65550 +// bytes long (the maximum size of a TCP or UDP packet, plus the 14 byte ethernet header), +// otherwise 1514 bytes. The 12-byte struct virtio_net_hdr is prepended to this, making for +// 65562 or 1526 bytes." For transmission, the standard states "The header and packet are added +// as one output descriptor to the transmitq, and the device is notified of the new entry". +// We assume the TX frame will not exceed this size either. +const MAX_BUFFER_SIZE: usize = 65562; + +#[derive(Debug)] +pub enum Error { + GuestMemory(vm_memory::GuestMemoryError), + Queue(virtio_queue::Error), + Tap(io::Error), + Mutex, +} + +impl From for Error { + fn from(e: virtio_queue::Error) -> Self { + Error::Queue(e) + } +} + +// A simple handler implementation for a RX/TX queue pair, which does not make assumptions about +// the way queue notification is implemented. The backend is not yet generic (we always assume a +// `Tap` object), but we're looking at improving that going forward. +// TODO: Find a better name. 
+pub struct SimpleHandler +where + S: SignalUsedQueue, +{ + pub driver_notify: S, + pub rxq: Queue, + pub rxbuf_current: usize, + pub rxbuf: [u8; MAX_BUFFER_SIZE], + pub txq: Queue, + pub txbuf: [u8; MAX_BUFFER_SIZE], + pub tap: Arc>, + pub mem: Arc, +} + +impl SimpleHandler +where + S: SignalUsedQueue, +{ + pub fn new( + driver_notify: S, + rxq: Queue, + txq: Queue, + tap: Arc>, + mem: Arc, + ) -> Self { + SimpleHandler { + driver_notify, + rxq, + rxbuf_current: 0, + rxbuf: [0u8; MAX_BUFFER_SIZE], + txq, + txbuf: [0u8; MAX_BUFFER_SIZE], + tap, + mem, + } + } + + // Have to see how to approach error handling for the `Queue` implementation in particular, + // because many situations are not really recoverable. We should consider reporting them based + // on the metrics/events solution when they appear, and not propagate them further unless + // it's really useful/necessary. + fn write_frame_to_guest(&mut self) -> result::Result { + let num_bytes = self.rxbuf_current; + + let mut chain = match self.rxq.iter(self.mem.as_ref())?.next() { + Some(c) => c, + _ => return Ok(false), + }; + + let mut count = 0; + let buf = &mut self.rxbuf[..num_bytes]; + + while let Some(desc) = chain.next() { + let left = buf.len() - count; + + if left == 0 { + break; + } + + let len = cmp::min(left, desc.len() as usize); + chain + .memory() + .write_slice(&buf[count..count + len], desc.addr()) + .map_err(Error::GuestMemory)?; + + count += len; + } + + if count != buf.len() { + // The frame was too large for the chain. + warn!("rx frame too large"); + } + + self.rxq + .add_used(self.mem.as_ref(), chain.head_index(), count as u32)?; + + self.rxbuf_current = 0; + + Ok(true) + } + + pub fn process_tap(&mut self) -> result::Result<(), Error> { + loop { + if self.rxbuf_current == 0 { + match self + .tap + .lock() + .map_err(|_| Error::Mutex)? + .read(&mut self.rxbuf) + { + Ok(n) => self.rxbuf_current = n, + Err(_) => { + // TODO: Do something (logs, metrics, etc.) 
in response to an error when + // reading from tap. EAGAIN means there's nothing available to read anymore + // (because we open the TAP as non-blocking). + break; + } + } + } + + if !self.write_frame_to_guest()? && !self.rxq.enable_notification(self.mem.as_ref())? { + break; + } + } + + if self.rxq.needs_notification(self.mem.as_ref())? { + self.driver_notify.signal_used_queue(RXQ_INDEX); + } + + Ok(()) + } + + fn send_frame_from_chain( + &mut self, + mut chain: DescriptorChain>, + ) -> result::Result { + let mut count = 0; + + while let Some(desc) = chain.by_ref().next() { + let left = self.txbuf.len() - count; + let len = desc.len() as usize; + + if len > left { + warn!("tx frame too large"); + break; + } + + chain + .memory() + .read_slice(&mut self.txbuf[count..count + len], desc.addr()) + .map_err(Error::GuestMemory)?; + + count += len; + } + + self.tap + .lock() + .map_err(|_| Error::Mutex)? + .write_all(&self.txbuf[..count]) + .map_err(Error::Tap)?; + + Ok(count as u32) + } + + pub fn process_txq(&mut self) -> result::Result<(), Error> { + loop { + self.txq.disable_notification(self.mem.as_ref())?; + + while let Some(chain) = self.txq.iter(self.mem.memory())?.next() { + self.send_frame_from_chain(chain.clone())?; + + self.txq + .add_used(self.mem.as_ref(), chain.head_index(), 0)?; + + if self.txq.needs_notification(self.mem.as_ref())? { + self.driver_notify.signal_used_queue(TXQ_INDEX); + } + } + + if !self.txq.enable_notification(self.mem.as_ref())? 
{ + return Ok(()); + } + } + } + + pub fn process_rxq(&mut self) -> result::Result<(), Error> { + self.rxq.disable_notification(self.mem.as_ref())?; + self.process_tap() + } +} diff --git a/src/vmm/src/core/network/mac.rs b/src/vmm/src/core/devices/virtio/net/tuntap/mac.rs similarity index 100% rename from src/vmm/src/core/network/mac.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/mac.rs diff --git a/src/vmm/src/core/network/mod.rs b/src/vmm/src/core/devices/virtio/net/tuntap/mod.rs similarity index 100% rename from src/vmm/src/core/network/mod.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/mod.rs diff --git a/src/vmm/src/core/network/net_gen/if_tun.rs b/src/vmm/src/core/devices/virtio/net/tuntap/net_gen/if_tun.rs similarity index 100% rename from src/vmm/src/core/network/net_gen/if_tun.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/net_gen/if_tun.rs diff --git a/src/vmm/src/core/network/net_gen/iff.rs b/src/vmm/src/core/devices/virtio/net/tuntap/net_gen/iff.rs similarity index 100% rename from src/vmm/src/core/network/net_gen/iff.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/net_gen/iff.rs diff --git a/src/vmm/src/core/network/net_gen/inn.rs b/src/vmm/src/core/devices/virtio/net/tuntap/net_gen/inn.rs similarity index 100% rename from src/vmm/src/core/network/net_gen/inn.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/net_gen/inn.rs diff --git a/src/vmm/src/core/network/net_gen/mod.rs b/src/vmm/src/core/devices/virtio/net/tuntap/net_gen/mod.rs similarity index 100% rename from src/vmm/src/core/network/net_gen/mod.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/net_gen/mod.rs diff --git a/src/vmm/src/core/network/net_gen/sockios.rs b/src/vmm/src/core/devices/virtio/net/tuntap/net_gen/sockios.rs similarity index 100% rename from src/vmm/src/core/network/net_gen/sockios.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/net_gen/sockios.rs diff --git a/src/vmm/src/core/network/open_tap.rs 
b/src/vmm/src/core/devices/virtio/net/tuntap/open_tap.rs similarity index 96% rename from src/vmm/src/core/network/open_tap.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/open_tap.rs index 56ff32b..1461e69 100644 --- a/src/vmm/src/core/network/open_tap.rs +++ b/src/vmm/src/core/devices/virtio/net/tuntap/open_tap.rs @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use super::mac::MacAddr; +use super::tap::Tap; use super::{tap, vnet_hdr_len}; -use crate::core::network::mac::MacAddr; -use crate::core::network::tap::Tap; use std::io; use std::net::Ipv4Addr; use std::path::Path; diff --git a/src/vmm/src/core/network/tap.rs b/src/vmm/src/core/devices/virtio/net/tuntap/tap.rs similarity index 98% rename from src/vmm/src/core/network/tap.rs rename to src/vmm/src/core/devices/virtio/net/tuntap/tap.rs index 9988d53..0eb1ec3 100644 --- a/src/vmm/src/core/network/tap.rs +++ b/src/vmm/src/core/devices/virtio/net/tuntap/tap.rs @@ -5,10 +5,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
+use super::mac::MacAddr; +use super::mac::MAC_ADDR_LEN; +use super::net_gen; use super::{create_inet_socket, create_sockaddr, create_unix_socket, Error as NetUtilError}; -use crate::core::network::mac::MacAddr; -use crate::core::network::mac::MAC_ADDR_LEN; -use crate::core::network::net_gen; use std::fs::File; use std::io::{Error as IoError, Read, Result as IoResult, Write}; use std::net; diff --git a/src/vmm/src/core/devices/virtio/register.rs b/src/vmm/src/core/devices/virtio/register.rs new file mode 100644 index 0000000..82a9b70 --- /dev/null +++ b/src/vmm/src/core/devices/virtio/register.rs @@ -0,0 +1,58 @@ +use std::sync::{Arc, Mutex}; + +use super::{Error, MmioConfig, Result}; +use linux_loader::cmdline; +use vm_device::{ + device_manager::{IoManager, MmioManager}, + DeviceMmio, +}; +use vm_memory::{Address, GuestAddress, GuestUsize}; + +pub fn register_mmio_device( + mmio_cfg: MmioConfig, + device_mgr: Arc>, + irq: u32, + id: Option, + device: Arc, +) -> Result { + device_mgr + .lock() + .map_err(|_| Error::Mutex)? 
+        .register_mmio(mmio_cfg.range, device)
+        .map_err(Error::RegisterMmioDevice)?;
+
+    let size = mmio_cfg.range.size();
+
+    // Pass to kernel command line
+    if size == 0 {
+        return Err(Error::Cmdline(cmdline::Error::MmioSize));
+    }
+
+    let mut device_str = format!(
+        "virtio_mmio.device={}@0x{:x?}:{}",
+        guestusize_to_str(size),
+        GuestAddress(mmio_cfg.range.base().0).raw_value(),
+        irq
+    );
+    if let Some(id) = id {
+        device_str.push_str(format!(":{}", id).as_str());
+    }
+    Ok(device_str)
+}
+
+fn guestusize_to_str(size: GuestUsize) -> String {
+    const KB_MULT: u64 = 1 << 10;
+    const MB_MULT: u64 = KB_MULT << 10;
+    const GB_MULT: u64 = MB_MULT << 10;
+
+    if size % GB_MULT == 0 {
+        return format!("{}G", size / GB_MULT);
+    }
+    if size % MB_MULT == 0 {
+        return format!("{}M", size / MB_MULT);
+    }
+    if size % KB_MULT == 0 {
+        return format!("{}K", size / KB_MULT);
+    }
+    size.to_string()
+}
diff --git a/src/vmm/src/core/irq_allocator.rs b/src/vmm/src/core/irq_allocator.rs
new file mode 100644
index 0000000..aff5613
--- /dev/null
+++ b/src/vmm/src/core/irq_allocator.rs
@@ -0,0 +1,72 @@
+// Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+use std::fmt;
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Error {
+    InvalidValue,
+    MaxIrq,
+    IRQOverflowed,
+}
+
+pub type Result = std::result::Result;
+
+/// An irq allocator which gives next available irq.
+/// It is mainly used for non-legacy devices.
+// There are a few reserved irq's on x86_64. We just skip all the initial
+// reserved irq to make the implementation simple. This could be later extended
+// to cater more complex scenario.
+#[derive(Debug)]
+pub struct IrqAllocator {
+    // Tracks the last allocated irq
+    last_used_irq: u32,
+    // Highest irq this allocator may hand out (inclusive)
+    last_irq: u32,
+}
+
+impl IrqAllocator {
+    /// Creates an allocator that hands out irqs in `last_used_irq + 1 ..= last_irq`.
+    ///
+    /// Returns `Error::InvalidValue` unless `last_used_irq < last_irq`.
+    pub fn new(last_used_irq: u32, last_irq: u32) -> Result {
+        if last_used_irq >= last_irq {
+            return Err(Error::InvalidValue);
+        }
+        Ok(IrqAllocator {
+            last_used_irq,
+            last_irq,
+        })
+    }
+
+    /// Returns the next free irq and records it as used.
+    ///
+    /// Fails with `Error::MaxIrq` once `last_irq` has been handed out, or with
+    /// `Error::IRQOverflowed` if incrementing would overflow `u32`.
+    pub fn next_irq(&mut self) -> Result {
+        self.last_used_irq
+            .checked_add(1)
+            .ok_or(Error::IRQOverflowed)
+            .and_then(|irq| {
+                if irq > self.last_irq {
+                    Err(Error::MaxIrq)
+                } else {
+                    self.last_used_irq = irq;
+                    Ok(irq)
+                }
+            })
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let err = match self {
+            Error::MaxIrq => "last_irq IRQ limit reached",
+            Error::IRQOverflowed => "IRQ overflowed",
+            Error::InvalidValue => {
+                "Check the value of last_used and last_irq. last_used should be less than last_irq"
+            }
+        };
+        write!(f, "{}", err) // user-facing output
+    }
+}
diff --git a/src/vmm/src/core/kernel.rs b/src/vmm/src/core/kernel.rs
index c9b82ea..e300438 100644
--- a/src/vmm/src/core/kernel.rs
+++ b/src/vmm/src/core/kernel.rs
@@ -42,7 +42,7 @@ const HIMEM_START: u64 = 0x0010_0000; // 1 MB
 /// Address where the kernel command line is written.
 const CMDLINE_START: u64 = 0x0002_0000;
 // Default command line
-const CMDLINE: &str = "console=ttyS0 i8042.nokbd reboot=k panic=1 pci=off";
+const DEFAULT_CMDLINE: &str = "console=ttyS0 i8042.nokbd reboot=k panic=1 pci=off ip=172.29.0.2::172.29.0.1:255.255.0.0::eth0:off";
 
 fn add_e820_entry(
     params: &mut boot_params,
@@ -127,6 +127,7 @@ pub fn kernel_setup(
     guest_memory: &GuestMemoryMmap,
     kernel_path: PathBuf,
     initramfs_path: Option,
+    cmdline_extra_parameters: &mut Vec,
 ) -> Result {
     let mut kernel_image = File::open(kernel_path).map_err(Error::IO)?;
     let zero_page_addr = GuestAddress(ZEROPG_START);
@@ -143,13 +144,24 @@ pub fn kernel_setup(
     // Generate boot parameters.
let mut bootparams = build_bootparams(guest_memory, GuestAddress(HIMEM_START))?; + let combined_cmdline: String = { + let mut combined = DEFAULT_CMDLINE.to_string(); + for param in cmdline_extra_parameters { + combined.push(' '); + combined.push_str(param); + } + combined + }; + // Add the kernel command line to the boot parameters. bootparams.hdr.cmd_line_ptr = CMDLINE_START as u32; - bootparams.hdr.cmdline_size = CMDLINE.len() as u32 + 1; + bootparams.hdr.cmdline_size = combined_cmdline.len() as u32 + 1; // Load the kernel command line into guest memory. - let mut cmdline = Cmdline::new(CMDLINE.len() + 1).map_err(Error::Cmdline)?; - cmdline.insert_str(CMDLINE).map_err(Error::Cmdline)?; + let mut cmdline = Cmdline::new(combined_cmdline.len() + 1).map_err(Error::Cmdline)?; + cmdline + .insert_str(combined_cmdline) + .map_err(Error::Cmdline)?; load_cmdline( guest_memory, GuestAddress(CMDLINE_START), diff --git a/src/vmm/src/core/mod.rs b/src/vmm/src/core/mod.rs index 9fdbcbf..717f82f 100644 --- a/src/vmm/src/core/mod.rs +++ b/src/vmm/src/core/mod.rs @@ -8,13 +8,13 @@ extern crate vm_superio; use linux_loader::loader; use std::io; -use self::network::open_tap; +use self::devices::virtio::{self, net::tuntap::open_tap}; mod cpu; mod devices; mod epoll_context; +mod irq_allocator; mod kernel; -mod network; mod slip_pty; pub mod vmm; @@ -65,6 +65,14 @@ pub enum Error { OpenTap(open_tap::Error), // PTY write error PtyRx(devices::Error), + // Address allocator + Allocate(vm_allocator::Error), + // Address allocator + IrqAllocator(irq_allocator::Error), + // MmioRange + MmioRange, + // Virtio net + Virtio(virtio::Error), } /// Dedicated [`Result`](https://doc.rust-lang.org/std/result/) type. 
diff --git a/src/vmm/src/core/vmm.rs b/src/vmm/src/core/vmm.rs index d541148..1b9ea2d 100644 --- a/src/vmm/src/core/vmm.rs +++ b/src/vmm/src/core/vmm.rs @@ -5,6 +5,7 @@ use crate::core::devices::serial::LumperSerial; use crate::core::epoll_context::{EpollContext, EPOLL_EVENTS_LEN}; use crate::core::kernel; use crate::core::{Error, Result}; +use event_manager::{EventManager, MutEventSubscriber}; use kvm_bindings::{kvm_userspace_memory_region, KVM_MAX_CPUID_ENTRIES}; use kvm_ioctls::{Kvm, VmFd}; use linux_loader::loader::KernelLoaderResult; @@ -16,20 +17,50 @@ use std::path::{Path, PathBuf}; use std::sync::{Arc, Mutex}; use std::thread; use tracing::info; +use vm_allocator::{AddressAllocator, AllocPolicy}; +use vm_device::bus::{MmioAddress, MmioRange}; +use vm_device::device_manager::IoManager; use vm_memory::{Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use vmm_sys_util::terminal::Terminal; -use super::network::open_tap::open_tap; -use super::network::tap::Tap; +use super::devices::virtio::net::device::Net; +use super::devices::virtio::{self, MmioConfig}; +use super::irq_allocator::IrqAllocator; use super::slip_pty::SlipPty; +#[cfg(target_arch = "x86_64")] +pub(crate) const MMIO_GAP_END: u64 = 1 << 32; +/// Size of the MMIO gap. +#[cfg(target_arch = "x86_64")] +pub(crate) const MMIO_GAP_SIZE: u64 = 768 << 20; +/// The start of the MMIO gap (memory area reserved for MMIO devices). +#[cfg(target_arch = "x86_64")] +pub(crate) const MMIO_GAP_START: u64 = MMIO_GAP_END - MMIO_GAP_SIZE; +/// Default address allocator alignment. It needs to be a power of 2. +pub const DEFAULT_ADDRESS_ALIGNEMNT: u64 = 4; +/// Default allocation policy for address allocator. +pub const DEFAULT_ALLOC_POLICY: AllocPolicy = AllocPolicy::FirstMatch; +/// IRQ line 4 is typically used for serial port 1. 
+// See more IRQ assignments & info: https://tldp.org/HOWTO/Serial-HOWTO-8.html +const SERIAL_IRQ: u32 = 4; +/// Last usable IRQ ID for virtio device interrupts on x86_64. +const IRQ_MAX: u8 = 23; + +type EventMgr = Arc>>>>; + pub struct VMM { - vm_fd: VmFd, + vm_fd: Arc, kvm: Kvm, guest_memory: GuestMemoryMmap, + address_allocator: Option, + irq_allocator: IrqAllocator, + device_mgr: Arc>, + event_mgr: EventMgr, vcpus: Vec, - _tap: Tap, + tap_ip_addr: Ipv4Addr, + tap_netmask: Ipv4Addr, + net_devices: Vec>>, serial: Arc>>, slip_pty: Arc>, epoll: EpollContext, @@ -43,17 +74,7 @@ impl VMM { // Create a KVM VM object. // KVM returns a file descriptor to the VM object. - let vm_fd = kvm.create_vm().map_err(Error::KvmIoctl)?; - - let tap = open_tap( - None, - Some(tap_ip_addr), - Some(tap_netmask), - &mut None, - None, - None, - ) - .map_err(Error::OpenTap)?; + let vm_fd = Arc::new(kvm.create_vm().map_err(Error::KvmIoctl)?); let slip_pty = SlipPty::new()?; @@ -72,17 +93,26 @@ impl VMM { ) .map_err(Error::EpollError)?; + let irq_allocator = IrqAllocator::new(SERIAL_IRQ, IRQ_MAX.into()).unwrap(); + let device_mgr = Arc::new(Mutex::new(IoManager::new())); + let vmm = VMM { vm_fd, kvm, guest_memory: GuestMemoryMmap::default(), + address_allocator: None, + device_mgr, + irq_allocator, + event_mgr: Arc::new(Mutex::new(EventManager::new().unwrap())), vcpus: vec![], - _tap: tap, serial: Arc::new(Mutex::new( LumperSerial::new(stdout()).map_err(Error::SerialCreation)?, )), slip_pty: Arc::new(Mutex::new(slip_pty)), epoll, + tap_ip_addr, + tap_netmask, + net_devices: Vec::new(), }; Ok(vmm) @@ -121,6 +151,19 @@ impl VMM { Ok(()) } + fn configure_allocators(&mut self, mem_size_mb: u32) -> Result<()> { + // Convert memory size from MBytes to bytes. + let mem_size = (mem_size_mb as u64) << 20; + + // Setup address allocator. 
+ let start_addr = MMIO_GAP_START; + let address_allocator = AddressAllocator::new(start_addr, mem_size).unwrap(); + + self.address_allocator = Some(address_allocator); + + Ok(()) + } + fn configure_io(&mut self) -> Result<()> { // First, create the irqchip. // On `x86_64`, this _must_ be created _before_ the vCPUs. @@ -154,6 +197,14 @@ impl VMM { ) .map_err(Error::KvmIoctl)?; + for net in self.net_devices.iter() { + let net_cfg = &net.lock().unwrap().config; + + self.vm_fd + .register_irqfd(&net_cfg.irqfd, net_cfg.mmio.gsi) + .map_err(Error::KvmIoctl)?; + } + Ok(()) } @@ -170,6 +221,7 @@ impl VMM { let vcpu = Vcpu::new( &self.vm_fd, index.into(), + self.device_mgr.clone(), Arc::clone(&self.serial), Arc::clone(&self.slip_pty), ) @@ -221,6 +273,14 @@ impl VMM { let mut events = [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; let epoll_fd = self.epoll.as_raw_fd(); + let event_mgr = self.event_mgr.clone(); + let _ = thread::Builder::new().spawn(move || loop { + match event_mgr.lock().unwrap().run() { + Ok(_) => (), + Err(e) => eprintln!("Failed to handle events: {:?}", e), + } + }); + // Let's start the STDIN polling thread. 
loop { let num_events = @@ -277,15 +337,61 @@ impl VMM { kernel_path: &Path, initramfs_path: &Option, ) -> Result<()> { + let cmdline_extra_parameters = &mut Vec::new(); + self.configure_memory(mem_size_mb)?; + self.configure_allocators(mem_size_mb)?; + self.configure_net_device(cmdline_extra_parameters)?; + let kernel_load = kernel::kernel_setup( &self.guest_memory, kernel_path.to_path_buf(), initramfs_path.clone(), + cmdline_extra_parameters, )?; self.configure_io()?; self.configure_vcpus(num_vcpus, kernel_load)?; Ok(()) } + + pub fn configure_net_device( + &mut self, + cmdline_extra_parameters: &mut Vec, + ) -> Result<()> { + let mem = Arc::new(self.guest_memory.clone()); + let range = if let Some(allocator) = &self.address_allocator { + allocator + .to_owned() + .allocate(0x1000, DEFAULT_ADDRESS_ALIGNEMNT, DEFAULT_ALLOC_POLICY) + .map_err(Error::Allocate)? + } else { + // Handle the case where self.address_allocator is None + panic!("Address allocator is not initialized"); + }; + let mmio_range = MmioRange::new(MmioAddress(range.start()), range.len()) + .map_err(|_| Error::MmioRange)?; + let irq = self.irq_allocator.next_irq().map_err(Error::IrqAllocator)?; + let mmio_cfg = MmioConfig { + range: mmio_range, + gsi: irq, + }; + + let net = Net::new( + mem, + self.device_mgr.clone(), + mmio_cfg, + self.tap_ip_addr, + self.tap_netmask, + irq, + self.event_mgr.lock().unwrap().remote_endpoint(), + self.vm_fd.clone(), + cmdline_extra_parameters, + ) + .map_err(|_| Error::Virtio(virtio::Error::Net))?; + + self.net_devices.push(net); + + Ok(()) + } }