From 51d102c7080da4c28dc2d50cfee6b220ddf88318 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Wed, 4 Mar 2020 10:12:22 +0800 Subject: [PATCH] vm-virtio: Add virtio-mem device The basic idea of virtio-mem is to provide a flexible, cross-architecture memory hot plug and hot unplug solution that avoids many limitations imposed by existing technologies, architectures, and interfaces. More details can be found in https://lkml.org/lkml/2019/12/12/681. This commit add virtio-mem device. Signed-off-by: Hui Zhu --- vm-virtio/src/lib.rs | 6 + vm-virtio/src/mem.rs | 959 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 965 insertions(+) create mode 100644 vm-virtio/src/mem.rs diff --git a/vm-virtio/src/lib.rs b/vm-virtio/src/lib.rs index b96ced73a..5c0c38010 100755 --- a/vm-virtio/src/lib.rs +++ b/vm-virtio/src/lib.rs @@ -29,6 +29,7 @@ mod device; pub mod block; mod console; mod iommu; +pub mod mem; pub mod net; pub mod net_util; mod pmem; @@ -43,6 +44,7 @@ pub use self::block::*; pub use self::console::*; pub use self::device::*; pub use self::iommu::*; +pub use self::mem::*; pub use self::net::*; pub use self::net_util::*; pub use self::pmem::*; @@ -77,6 +79,7 @@ enum VirtioDeviceType { TYPE_INPUT = 18, TYPE_VSOCK = 19, TYPE_IOMMU = 23, + TYPE_MEM = 24, TYPE_FS = 26, TYPE_PMEM = 27, TYPE_UNKNOWN = 0xFF, @@ -95,6 +98,7 @@ impl From for VirtioDeviceType { 18 => VirtioDeviceType::TYPE_INPUT, 19 => VirtioDeviceType::TYPE_VSOCK, 23 => VirtioDeviceType::TYPE_IOMMU, + 24 => VirtioDeviceType::TYPE_MEM, 26 => VirtioDeviceType::TYPE_FS, 27 => VirtioDeviceType::TYPE_PMEM, _ => VirtioDeviceType::TYPE_UNKNOWN, @@ -118,6 +122,7 @@ impl fmt::Display for VirtioDeviceType { VirtioDeviceType::TYPE_INPUT => "input", VirtioDeviceType::TYPE_VSOCK => "vsock", VirtioDeviceType::TYPE_IOMMU => "iommu", + VirtioDeviceType::TYPE_MEM => "mem", VirtioDeviceType::TYPE_FS => "fs", VirtioDeviceType::TYPE_PMEM => "pmem", VirtioDeviceType::TYPE_UNKNOWN => "UNKNOWN", @@ -176,4 +181,5 @@ pub enum 
Error { EpollWait(io::Error), FailedSignalingDriver(io::Error), VhostUserUpdateMemory(vhost_user::Error), + EventfdError(io::Error), } diff --git a/vm-virtio/src/mem.rs b/vm-virtio/src/mem.rs new file mode 100644 index 000000000..cb79ba3fb --- /dev/null +++ b/vm-virtio/src/mem.rs @@ -0,0 +1,959 @@ +// Copyright (c) 2020 Ant Financial +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::Error as DeviceError; +use super::{ + ActivateError, ActivateResult, DescriptorChain, DeviceEventT, Queue, VirtioDevice, + VirtioDeviceType, VIRTIO_F_VERSION_1, +}; +use crate::{VirtioInterrupt, VirtioInterruptType}; +use epoll; +use libc; +use libc::EFD_NONBLOCK; +use std::cmp; +use std::io::{self, Write}; +use std::mem::size_of; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::result; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::mpsc; +use std::sync::{Arc, Mutex}; +use std::thread; +use vm_device::{Migratable, MigratableError, Pausable, Snapshotable}; +use vm_memory::{ + Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, + GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, +}; +use vmm_sys_util::eventfd::EventFd; + +const QUEUE_SIZE: u16 = 128; +const NUM_QUEUES: usize = 1; +const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE]; + +// Use 2 MiB alignment so transparent hugepages can be used by KVM. 
+pub const VIRTIO_MEM_DEFAULT_BLOCK_SIZE: u64 = (512 * 4096); +const VIRTIO_MEM_USABLE_EXTENT: u64 = 256 * 1024 * 1024; + +// Request processed successfully, applicable for +// - VIRTIO_MEM_REQ_PLUG +// - VIRTIO_MEM_REQ_UNPLUG +// - VIRTIO_MEM_REQ_UNPLUG_ALL +// - VIRTIO_MEM_REQ_STATE +const VIRTIO_MEM_RESP_ACK: u16 = 0; + +// Request denied - e.g. trying to plug more than requested, applicable for +// - VIRTIO_MEM_REQ_PLUG +const VIRTIO_MEM_RESP_NACK: u16 = 1; + +// Request cannot be processed right now, try again later, applicable for +// - VIRTIO_MEM_REQ_PLUG +// - VIRTIO_MEM_REQ_UNPLUG +// - VIRTIO_MEM_REQ_UNPLUG_ALL +// VIRTIO_MEM_RESP_BUSY: u16 = 2; + +// Error in request (e.g. addresses/alignment), applicable for +// - VIRTIO_MEM_REQ_PLUG +// - VIRTIO_MEM_REQ_UNPLUG +// - VIRTIO_MEM_REQ_STATE +const VIRTIO_MEM_RESP_ERROR: u16 = 3; + +// State of memory blocks is "plugged" +const VIRTIO_MEM_STATE_PLUGGED: u16 = 0; +// State of memory blocks is "unplugged" +const VIRTIO_MEM_STATE_UNPLUGGED: u16 = 1; +// State of memory blocks is "mixed" +const VIRTIO_MEM_STATE_MIXED: u16 = 2; + +// request to plug memory blocks +const VIRTIO_MEM_REQ_PLUG: u16 = 0; +// request to unplug memory blocks +const VIRTIO_MEM_REQ_UNPLUG: u16 = 1; +// request to unplug all blocks and shrink the usable size +const VIRTIO_MEM_REQ_UNPLUG_ALL: u16 = 2; +// request information about the plugged state of memory blocks +const VIRTIO_MEM_REQ_STATE: u16 = 3; + +// Get resize event. +const RESIZE_EVENT: DeviceEventT = 0; +// New descriptors are pending on the virtio queue. +const QUEUE_AVAIL_EVENT: DeviceEventT = 1; +// The device has been dropped. +const KILL_EVENT: DeviceEventT = 2; +// The device should be paused. +const PAUSE_EVENT: DeviceEventT = 3; + +#[derive(Debug)] +pub enum Error { + // Guest gave us bad memory addresses. + GuestMemory(GuestMemoryError), + // Guest gave us a write only descriptor that protocol says to read from. 
+ UnexpectedWriteOnlyDescriptor, + // Guest gave us a read only descriptor that protocol says to write to. + UnexpectedReadOnlyDescriptor, + // Guest gave us too few descriptors in a descriptor chain. + DescriptorChainTooShort, + // Guest gave us a buffer that was too short to use. + BufferLengthTooSmall, + // Guest sent us invalid request. + InvalidRequest, + // Failed to EventFd write. + EventFdWriteFail(std::io::Error), + // Failed to EventFd try_clone. + EventFdTryCloneFail(std::io::Error), + // Failed to MpscRecv. + MpscRecvFail(mpsc::RecvError), + // Resize invalid argument + ResizeInval(String), + // Fail to resize trigger + ResizeTriggerFail(DeviceError), +} + +// Got from qemu/include/standard-headers/linux/virtio_mem.h +// Rust unions cannot derive Default, which mem.read_obj needs, so the plug, +// unplug and state payloads are flattened into this one struct. +// padding_1 spells out the 6 tail bytes that repr(C) would otherwise add. +#[repr(C)] +#[derive(Copy, Clone, Debug, Default)] +struct VirtioMemReq { + req_type: u16, + padding: [u16; 3], + addr: u64, + nb_blocks: u16, + padding_1: [u16; 3], +} + +// Safe because it only has data and has no implicit padding. +unsafe impl ByteValued for VirtioMemReq {} + +// Got from qemu/include/standard-headers/linux/virtio_mem.h +#[repr(C)] +#[derive(Copy, Clone, Debug, Default)] +struct VirtioMemRespState { + state: u16, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug, Default)] +struct VirtioMemResp { + resp_type: u16, + padding: [u16; 3], + + state: VirtioMemRespState, +} + +// Safe because it only has data and has no implicit padding. +unsafe impl ByteValued for VirtioMemResp {} + +// Got from qemu/include/standard-headers/linux/virtio_mem.h +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, Default)] +struct VirtioMemConfig { + // Block size and alignment. Cannot change. + block_size: u32, + // Valid with VIRTIO_MEM_F_ACPI_PXM. Cannot change. + node_id: u16, + padding: u16, + // Start address of the memory region. Cannot change. 
+ addr: u64, + // Region size (maximum). Cannot change. + region_size: u64, + // Currently usable region size. Can grow up to region_size. Can + // shrink due to VIRTIO_MEM_REQ_UNPLUG_ALL (in which case no config + // update will be sent). + usable_region_size: u64, + // Currently used size. Changes due to plug/unplug requests, but no + // config updates will be sent. + plugged_size: u64, + // Requested size. New plug requests cannot exceed it. Can change. + requested_size: u64, +} + +// Safe because it only has data and has no implicit padding. +unsafe impl ByteValued for VirtioMemConfig {} + +struct Request { + req: VirtioMemReq, + status_addr: GuestAddress, +} + +impl Request { + fn parse( + avail_desc: &DescriptorChain, + mem: &GuestMemoryMmap, + ) -> result::Result { + // The head contains the request type which MUST be readable. + if avail_desc.is_write_only() { + return Err(Error::UnexpectedWriteOnlyDescriptor); + } + if avail_desc.len as usize != size_of::() { + return Err(Error::InvalidRequest); + } + let req: VirtioMemReq = mem.read_obj(avail_desc.addr).map_err(Error::GuestMemory)?; + + let status_desc = avail_desc + .next_descriptor() + .ok_or(Error::DescriptorChainTooShort)?; + + // The status MUST always be writable + if !status_desc.is_write_only() { + return Err(Error::UnexpectedReadOnlyDescriptor); + } + + if (status_desc.len as usize) < size_of::() { + return Err(Error::BufferLengthTooSmall); + } + + Ok(Request { + req, + status_addr: status_desc.addr, + }) + } +} + +pub struct Resize { + size: Arc, + tx: mpsc::Sender>, + rx: Option>>, + evt: EventFd, +} + +impl Resize { + pub fn new() -> io::Result { + let (tx, rx) = mpsc::channel(); + + Ok(Resize { + size: Arc::new(AtomicU64::new(0)), + tx, + rx: Some(rx), + evt: EventFd::new(EFD_NONBLOCK)?, + }) + } + + pub fn try_clone(&self) -> Result { + Ok(Resize { + size: self.size.clone(), + tx: self.tx.clone(), + rx: None, + evt: self.evt.try_clone().map_err(Error::EventFdTryCloneFail)?, + }) + } + + pub 
fn work(&self, size: u64) -> Result<(), Error> { + if let Some(rx) = &self.rx { + self.size.store(size, Ordering::SeqCst); + self.evt.write(1).map_err(Error::EventFdWriteFail)?; + rx.recv().map_err(Error::MpscRecvFail)? + } else { + panic!("work should not work with cloned resize") + } + } + + fn get_size(&self) -> u64 { + self.size.load(Ordering::SeqCst) + } + + fn send(&self, r: Result<(), Error>) { + self.tx.send(r).unwrap(); + } +} + +struct MemEpollHandler { + host_addr: u64, + host_fd: Option, + mem_state: Vec, + config: Arc>, + resize: Resize, + queue: Queue, + mem: GuestMemoryAtomic, + interrupt_cb: Arc, + queue_evt: EventFd, + kill_evt: EventFd, + pause_evt: EventFd, +} + +struct StateChangeRequest<'a> { + config: VirtioMemConfig, + addr: u64, + size: u64, + nb_blocks: u16, + mem_state: &'a mut Vec, + host_addr: u64, + host_fd: Option, + plug: bool, +} + +impl MemEpollHandler { + fn virtio_mem_valid_range(config: &VirtioMemConfig, addr: u64, size: u64) -> bool { + // address properly aligned? + if addr % config.block_size as u64 != 0 { + return false; + } + + // reasonable size: non-zero, and addr + size must not wrap around u64 + if size == 0 || addr.checked_add(size).is_none() { + return false; + } + + // start address in usable range? + if addr < config.addr || addr >= config.addr + config.usable_region_size { + return false; + } + + // end address in usable range? 
+ if addr + size > config.addr + config.usable_region_size { + return false; + } + + true + } + + fn virtio_mem_check_bitmap( + bit_index: usize, + nb_blocks: u16, + mem_state: &[bool], + plug: bool, + ) -> bool { + for state in mem_state.iter().skip(bit_index).take(nb_blocks as usize) { + if *state != plug { + return false; + } + } + true + } + + fn virtio_mem_set_bitmap( + bit_index: usize, + nb_blocks: u16, + mem_state: &mut Vec, + plug: bool, + ) { + for state in mem_state + .iter_mut() + .skip(bit_index) + .take(nb_blocks as usize) + { + *state = plug; + } + } + + fn virtio_mem_state_change_request(r: StateChangeRequest) -> u16 { + if r.plug && (r.config.plugged_size + r.size > r.config.requested_size) { + return VIRTIO_MEM_RESP_NACK; + } + if !MemEpollHandler::virtio_mem_valid_range(&r.config, r.addr, r.size) { + return VIRTIO_MEM_RESP_ERROR; + } + + let offset = r.addr - r.config.addr; + + let bit_index = (offset / r.config.block_size as u64) as usize; + if !MemEpollHandler::virtio_mem_check_bitmap(bit_index, r.nb_blocks, r.mem_state, !r.plug) { + return VIRTIO_MEM_RESP_ERROR; + } + + if !r.plug { + if let Some(fd) = r.host_fd { + let res = unsafe { + libc::fallocate64( + fd, + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + offset as libc::off64_t, + r.size as libc::off64_t, + ) + }; + if res != 0 { + error!("fallocate64 get error {}", io::Error::last_os_error()); + return VIRTIO_MEM_RESP_ERROR; + } + } + let res = unsafe { + libc::madvise( + (r.host_addr + offset) as *mut libc::c_void, + r.size as libc::size_t, + libc::MADV_DONTNEED, + ) + }; + if res != 0 { + error!("madvise get error {}", io::Error::last_os_error()); + return VIRTIO_MEM_RESP_ERROR; + } + } + + MemEpollHandler::virtio_mem_set_bitmap(bit_index, r.nb_blocks, r.mem_state, r.plug); + + VIRTIO_MEM_RESP_ACK + } + + fn virtio_mem_unplug_all( + config: VirtioMemConfig, + mem_state: &mut Vec, + host_addr: u64, + host_fd: Option, + ) -> u16 { + for x in 0..(config.region_size / 
config.block_size as u64) as usize { + if mem_state[x] { + let resp_type = + MemEpollHandler::virtio_mem_state_change_request(StateChangeRequest { + config, + addr: config.addr + x as u64 * config.block_size as u64, + size: config.block_size as u64, + nb_blocks: 1, + mem_state, + host_addr, + host_fd, + plug: false, + }); + if resp_type != VIRTIO_MEM_RESP_ACK { + return resp_type; + } + mem_state[x] = false; + } + } + + VIRTIO_MEM_RESP_ACK + } + + fn virtio_mem_state_request( + config: VirtioMemConfig, + addr: u64, + nb_blocks: u16, + mem_state: &mut Vec, + ) -> (u16, u16) { + let size: u64 = nb_blocks as u64 * config.block_size as u64; + if !MemEpollHandler::virtio_mem_valid_range(&config, addr, size) { + // Bail out before the offset arithmetic: addr may be below config.addr. + return (VIRTIO_MEM_RESP_ERROR, 0u16); + } + let resp_type = VIRTIO_MEM_RESP_ACK; + + let offset = addr - config.addr; + let bit_index = (offset / config.block_size as u64) as usize; + let resp_state = + if MemEpollHandler::virtio_mem_check_bitmap(bit_index, nb_blocks, mem_state, true) { + VIRTIO_MEM_STATE_PLUGGED + } else if MemEpollHandler::virtio_mem_check_bitmap( + bit_index, nb_blocks, mem_state, false, + ) { + VIRTIO_MEM_STATE_UNPLUGGED + } else { + VIRTIO_MEM_STATE_MIXED + }; + + (resp_type, resp_state) + } + + fn virtio_mem_send_response( + mem: &GuestMemoryMmap, + resp_type: u16, + resp_state: u16, + status_addr: GuestAddress, + ) -> u32 { + let mut resp = VirtioMemResp::default(); + resp.resp_type = resp_type; + resp.state.state = resp_state; + match mem.write_obj(resp, status_addr) { + Ok(_) => size_of::() as u32, + Err(e) => { + error!("bad guest memory address: {}", e); + 0 + } + } + } + + fn signal(&self, int_type: &VirtioInterruptType) -> result::Result<(), DeviceError> { + self.interrupt_cb + .trigger(int_type, Some(&self.queue)) + .map_err(|e| { + error!("Failed to signal used queue: {:?}", e); + DeviceError::FailedSignalingUsedQueue(e) + }) + } + + fn process_queue(&mut self) -> bool { + let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize]; + let 
mut used_count = 0; + let mem = self.mem.memory(); + for avail_desc in self.queue.iter(&mem) { + let len = match Request::parse(&avail_desc, &mem) { + Err(e) => { + error!("failed parse VirtioMemReq: {:?}", e); + 0 + } + Ok(r) => { + let mut config = self.config.lock().unwrap(); + match r.req.req_type { + VIRTIO_MEM_REQ_PLUG => { + let size: u64 = r.req.nb_blocks as u64 * config.block_size as u64; + let resp_type = MemEpollHandler::virtio_mem_state_change_request( + StateChangeRequest { + config: *config, + addr: r.req.addr, + size, + nb_blocks: r.req.nb_blocks, + mem_state: &mut self.mem_state, + host_addr: self.host_addr, + host_fd: self.host_fd, + plug: true, + }, + ); + if resp_type == VIRTIO_MEM_RESP_ACK { + config.plugged_size += size; + } + MemEpollHandler::virtio_mem_send_response( + &mem, + resp_type, + 0u16, + r.status_addr, + ) + } + VIRTIO_MEM_REQ_UNPLUG => { + let size: u64 = r.req.nb_blocks as u64 * config.block_size as u64; + let resp_type = MemEpollHandler::virtio_mem_state_change_request( + StateChangeRequest { + config: *config, + addr: r.req.addr, + size, + nb_blocks: r.req.nb_blocks, + mem_state: &mut self.mem_state, + host_addr: self.host_addr, + host_fd: self.host_fd, + plug: false, + }, + ); + if resp_type == VIRTIO_MEM_RESP_ACK { + config.plugged_size -= size; + } + MemEpollHandler::virtio_mem_send_response( + &mem, + resp_type, + 0u16, + r.status_addr, + ) + } + VIRTIO_MEM_REQ_UNPLUG_ALL => { + let resp_type = MemEpollHandler::virtio_mem_unplug_all( + *config, + &mut self.mem_state, + self.host_addr, + self.host_fd, + ); + if resp_type == VIRTIO_MEM_RESP_ACK { + config.plugged_size = 0; + config.usable_region_size = cmp::min( + config.region_size, + config.requested_size + VIRTIO_MEM_USABLE_EXTENT, + ); + } + MemEpollHandler::virtio_mem_send_response( + &mem, + resp_type, + 0u16, + r.status_addr, + ) + } + VIRTIO_MEM_REQ_STATE => { + let (resp_type, resp_state) = MemEpollHandler::virtio_mem_state_request( + *config, + r.req.addr, + 
r.req.nb_blocks, + &mut self.mem_state, + ); + MemEpollHandler::virtio_mem_send_response( + &mem, + resp_type, + resp_state, + r.status_addr, + ) + } + _ => { + error!("VirtioMemReq unknown request type {:?}", r.req.req_type); + 0 + } + } + } + }; + used_desc_heads[used_count] = (avail_desc.index, len); + used_count += 1; + } + + for &(desc_index, len) in &used_desc_heads[..used_count] { + self.queue.add_used(&mem, desc_index, len); + } + used_count > 0 + } + + fn run(&mut self, paused: Arc) -> result::Result<(), DeviceError> { + // Create the epoll file descriptor + let epoll_fd = epoll::create(true).map_err(DeviceError::EpollCreateFd)?; + + // Add events + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + self.resize.evt.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, u64::from(RESIZE_EVENT)), + ) + .map_err(DeviceError::EpollCtl)?; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + self.queue_evt.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, u64::from(QUEUE_AVAIL_EVENT)), + ) + .map_err(DeviceError::EpollCtl)?; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + self.kill_evt.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, u64::from(KILL_EVENT)), + ) + .map_err(DeviceError::EpollCtl)?; + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + self.pause_evt.as_raw_fd(), + epoll::Event::new(epoll::Events::EPOLLIN, u64::from(PAUSE_EVENT)), + ) + .map_err(DeviceError::EpollCtl)?; + + const EPOLL_EVENTS_LEN: usize = 100; + let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; + + 'epoll: loop { + let num_events = match epoll::wait(epoll_fd, -1, &mut events[..]) { + Ok(res) => res, + Err(e) => { + if e.kind() == io::ErrorKind::Interrupted { + // It's well defined from the epoll_wait() syscall + // documentation that the epoll loop can be interrupted + // before any of the requested events occurred or the + // timeout expired. 
In both those cases, epoll_wait() + // returns an error of type EINTR, but this should not + // be considered as a regular error. Instead it is more + // appropriate to retry, by calling into epoll_wait(). + continue; + } + return Err(DeviceError::EpollWait(e)); + } + }; + + for event in events.iter().take(num_events) { + let ev_type = event.data as u16; + + match ev_type { + RESIZE_EVENT => { + if let Err(e) = self.resize.evt.read() { + error!("Failed to get resize event: {:?}", e); + break 'epoll; + } else { + let size = self.resize.get_size(); + let mut config = self.config.lock().unwrap(); + let mut need_break = false; + let r = if config.requested_size == size { + Err(Error::ResizeInval(format!("Virtio-mem resize {} is same with current config.requested_size", size))) + } else if size > config.region_size { + let region_size = config.region_size; + Err(Error::ResizeInval(format!( + "Virtio-mem resize {} is bigger than config.region_size {}", + size, region_size + ))) + } else if size % (config.block_size as u64) != 0 { + let block_size = config.block_size; + Err(Error::ResizeInval(format!( + "Virtio-mem resize {} is not aligned with config.block_size {}", + size, block_size + ))) + } else { + config.requested_size = size; + let tmp_size = cmp::min( + config.region_size, + config.requested_size + VIRTIO_MEM_USABLE_EXTENT, + ); + config.usable_region_size = + cmp::max(config.usable_region_size, tmp_size); + if let Err(e) = self.signal(&VirtioInterruptType::Config) { + need_break = true; + Err(Error::ResizeTriggerFail(e)) + } else { + Ok(()) + } + }; + if let Err(e) = &r { + error!("{:?}", e); + } + self.resize.send(r); + if need_break { + break 'epoll; + } + } + } + QUEUE_AVAIL_EVENT => { + if let Err(e) = self.queue_evt.read() { + error!("Failed to get queue event: {:?}", e); + break 'epoll; + } else if self.process_queue() { + if let Err(e) = self.signal(&VirtioInterruptType::Queue) { + error!("Failed to signal used queue: {:?}", e); + break 'epoll; + } + } + 
} + KILL_EVENT => { + debug!("kill_evt received, stopping epoll loop"); + break 'epoll; + } + PAUSE_EVENT => { + debug!("PAUSE_EVENT received, pausing virtio-mem epoll loop"); + // We loop here to handle spurious park() returns. + // Until we are resumed, the paused boolean will + // be true. + while paused.load(Ordering::SeqCst) { + thread::park(); + } + } + _ => { + error!("Unknown event for virtio-mem"); + } + } + } + } + + Ok(()) + } +} + +// Virtio device for hot plugging and unplugging guest memory through virtio. +pub struct Mem { + resize: Resize, + kill_evt: Option, + pause_evt: Option, + avail_features: u64, + pub acked_features: u64, + host_addr: u64, + host_fd: Option, + config: Arc>, + queue_evts: Option>, + interrupt_cb: Option>, + epoll_threads: Option>>>, + paused: Arc, +} + +impl Mem { + // Create a new virtio-mem device. + pub fn new(region: &Arc, resize: Resize) -> io::Result { + let region_len = region.len(); + + if region_len != region_len / VIRTIO_MEM_DEFAULT_BLOCK_SIZE * VIRTIO_MEM_DEFAULT_BLOCK_SIZE + { + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "Virtio-mem size is not aligned with {}", + VIRTIO_MEM_DEFAULT_BLOCK_SIZE + ), + )); + } + + // Fixme: Not support VIRTIO_MEM_F_ACPI_PXM + let avail_features = 1u64 << VIRTIO_F_VERSION_1; + + let mut config = VirtioMemConfig::default(); + config.block_size = VIRTIO_MEM_DEFAULT_BLOCK_SIZE as u32; + config.addr = region.start_addr().raw_value(); + config.region_size = region.len(); + config.usable_region_size = cmp::min( + config.region_size, + config.requested_size + VIRTIO_MEM_USABLE_EXTENT, + ); + + let host_fd = if let Some(f_offset) = region.file_offset() { + Some(f_offset.file().as_raw_fd()) + } else { + None + }; + + Ok(Mem { + resize, + kill_evt: None, + pause_evt: None, + avail_features, + acked_features: 0u64, + host_addr: region.as_ptr() as u64, + host_fd, + config: Arc::new(Mutex::new(config)), + queue_evts: None, + interrupt_cb: None, + epoll_threads: None, + paused: 
Arc::new(AtomicBool::new(false)), + }) + } +} + +impl Drop for Mem { + fn drop(&mut self) { + if let Some(kill_evt) = self.kill_evt.take() { + // Ignore the result because there is nothing we can do about it. + let _ = kill_evt.write(1); + } + } +} + +impl VirtioDevice for Mem { + fn device_type(&self) -> u32 { + VirtioDeviceType::TYPE_MEM as u32 + } + + fn queue_max_sizes(&self) -> &[u16] { + QUEUE_SIZES + } + + fn features(&self) -> u64 { + self.avail_features + } + + fn ack_features(&mut self, value: u64) { + let mut v = value; + // Check if the guest is ACK'ing a feature that we didn't claim to have. + let unrequested_features = v & !self.avail_features; + if unrequested_features != 0 { + warn!("Received acknowledge request for unknown feature."); + + // Don't count these features as acked. + v &= !unrequested_features; + } + self.acked_features |= v; + } + + fn read_config(&self, offset: u64, mut data: &mut [u8]) { + let config = self.config.lock().unwrap(); + let config_slice = config.as_slice(); + let config_len = config_slice.len() as u64; + if offset >= config_len { + error!("Failed to read config space"); + return; + } + if let Some(end) = offset.checked_add(data.len() as u64) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&config_slice[offset as usize..cmp::min(end, config_len) as usize]) + .unwrap(); + } + } + + fn write_config(&mut self, _offset: u64, _data: &[u8]) { + warn!("virtio-mem device configuration is read-only"); + } + + fn activate( + &mut self, + mem: GuestMemoryAtomic, + interrupt_cb: Arc, + mut queues: Vec, + mut queue_evts: Vec, + ) -> ActivateResult { + if queues.len() != NUM_QUEUES || queue_evts.len() != NUM_QUEUES { + error!( + "Cannot perform activate. 
Expected {} queue(s), got {}", + NUM_QUEUES, + queues.len() + ); + return Err(ActivateError::BadActivate); + } + + let (self_kill_evt, kill_evt) = EventFd::new(EFD_NONBLOCK) + .and_then(|e| Ok((e.try_clone()?, e))) + .map_err(|e| { + error!("failed creating kill EventFd pair: {}", e); + ActivateError::BadActivate + })?; + self.kill_evt = Some(self_kill_evt); + + let (self_pause_evt, pause_evt) = EventFd::new(EFD_NONBLOCK) + .and_then(|e| Ok((e.try_clone()?, e))) + .map_err(|e| { + error!("failed creating pause EventFd pair: {}", e); + ActivateError::BadActivate + })?; + self.pause_evt = Some(self_pause_evt); + + self.interrupt_cb = Some(interrupt_cb.clone()); + + let mut tmp_queue_evts: Vec = Vec::new(); + for queue_evt in queue_evts.iter() { + // Save the queue EventFD as we need to return it on reset + // but clone it to pass into the thread. + tmp_queue_evts.push(queue_evt.try_clone().map_err(|e| { + error!("failed to clone queue EventFd: {}", e); + ActivateError::BadActivate + })?); + } + self.queue_evts = Some(tmp_queue_evts); + + let config = self.config.lock().unwrap(); + let mut handler = MemEpollHandler { + host_addr: self.host_addr, + host_fd: self.host_fd, + mem_state: vec![false; config.region_size as usize / config.block_size as usize], + config: self.config.clone(), + resize: self.resize.try_clone().map_err(|e| { + error!("failed to clone resize EventFd: {:?}", e); + ActivateError::BadActivate + })?, + queue: queues.remove(0), + mem, + interrupt_cb, + queue_evt: queue_evts.remove(0), + kill_evt, + pause_evt, + }; + + let paused = self.paused.clone(); + let mut epoll_threads = Vec::new(); + thread::Builder::new() + .name("virtio_mem".to_string()) + .spawn(move || handler.run(paused)) + .map(|thread| epoll_threads.push(thread)) + .map_err(|e| { + error!("failed to spawn virtio-mem epoll thread: {}", e); + ActivateError::BadActivate + })?; + self.epoll_threads = Some(epoll_threads); + + Ok(()) + } + + fn reset(&mut self) -> Option<(Arc, Vec)> { + // We 
first must resume the virtio thread if it was paused. + if self.pause_evt.take().is_some() { + self.resume().ok()?; + } + + if let Some(kill_evt) = self.kill_evt.take() { + // Ignore the result because there is nothing we can do about it. + let _ = kill_evt.write(1); + } + + // Return the interrupt and queue EventFDs + Some(( + self.interrupt_cb.take().unwrap(), + self.queue_evts.take().unwrap(), + )) + } +} + +virtio_pausable!(Mem); +impl Snapshotable for Mem {} +impl Migratable for Mem {}