From 42522a88c0226a50b49beaaf22436aec12c61d21 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 13 Jun 2025 14:27:00 -0400 Subject: [PATCH] misc: do not use u64 to represent host pointers To ensure that struct sizes are the same on 32-bit and 64-bit, various kernel APIs use __u64 (Rust u64) to represent userspace pointers. Userspace is expected to cast pointers to __u64 before passing them to the kernel, and cast kernel-provided __u64 to a pointer before using them. However, various safe APIs in Cloud Hypervisor took caller-provided u64 values and passed them to syscalls that interpret them as userspace addresses. Therefore, passing bad u64 values would cause memory disclosure or corruption. Fix the bug by using usize and pointer types as appropriate. To make soundness of the code easier to reason about, the PCI code gains a new MmapRegion abstraction that ensures the validity of pointers. The rest of the code already has an MmapRegion abstraction it can use. To avoid having to reason about whether something is keeping the MmapRegion alive, reference counting is added. MmapRegion cannot hold references to other objects, so the reference counting cannot introduce cycles. 
Signed-off-by: Demi Marie Obenour --- devices/src/ivshmem.rs | 12 +- fuzz/fuzz_targets/pmem.rs | 4 +- hypervisor/src/kvm/mod.rs | 16 +- hypervisor/src/mshv/mod.rs | 16 +- hypervisor/src/vm.rs | 8 +- pci/src/lib.rs | 1 + pci/src/mmap.rs | 89 +++++++++ pci/src/vfio.rs | 209 ++++++++++++--------- pci/src/vfio_user.rs | 96 ++++------ virtio-devices/src/device.rs | 7 +- virtio-devices/src/pmem.rs | 10 +- virtio-devices/src/transport/pci_device.rs | 2 +- virtio-devices/src/vhost_user/fs.rs | 3 +- vm-device/src/lib.rs | 8 +- vmm/src/device_manager.rs | 62 +++--- vmm/src/memory_manager.rs | 94 ++++++--- vmm/src/vm.rs | 8 +- 17 files changed, 382 insertions(+), 263 deletions(-) create mode 100644 pci/src/mmap.rs diff --git a/devices/src/ivshmem.rs b/devices/src/ivshmem.rs index 5997e10a2..d6f5b5d46 100644 --- a/devices/src/ivshmem.rs +++ b/devices/src/ivshmem.rs @@ -33,7 +33,7 @@ const IVSHMEM_DEVICE_ID: u16 = 0x1110; const IVSHMEM_REG_BAR_SIZE: u64 = 0x100; -type GuestRegionMmap = vm_memory::GuestRegionMmap; +type MmapRegion = vm_memory::MmapRegion; #[derive(Debug, Error)] pub enum IvshmemError { @@ -68,7 +68,7 @@ pub trait IvshmemOps: Send + Sync { start_addr: u64, size: usize, backing_file: Option, - ) -> Result<(Arc, UserspaceMapping), IvshmemError>; + ) -> Result<(Arc, UserspaceMapping), IvshmemError>; fn unmap_ram_region(&mut self, mapping: UserspaceMapping) -> Result<(), IvshmemError>; } @@ -95,7 +95,7 @@ pub struct IvshmemDevice { region_size: u64, ivshmem_ops: Arc>, backend_file: Option, - region: Option>, + region: Option>, userspace_mapping: Option, } @@ -180,11 +180,7 @@ impl IvshmemDevice { Ok(device) } - pub fn set_region( - &mut self, - region: Arc, - userspace_mapping: UserspaceMapping, - ) { + pub fn set_region(&mut self, region: Arc, userspace_mapping: UserspaceMapping) { self.region = Some(region); self.userspace_mapping = Some(userspace_mapping); } diff --git a/fuzz/fuzz_targets/pmem.rs b/fuzz/fuzz_targets/pmem.rs index e9247fb63..a8fcb7a77 100644 --- 
a/fuzz/fuzz_targets/pmem.rs +++ b/fuzz/fuzz_targets/pmem.rs @@ -115,10 +115,9 @@ fn create_dummy_pmem() -> Pmem { .unwrap(); let guest_addr = GuestAddress(0); let dummy_user_mapping = UserspaceMapping { - host_addr: dummy_mmap_region.as_ptr() as u64, mem_slot: 0, addr: guest_addr, - len: dummy_mapping_size as u64, + mapping: Arc::new(dummy_mmap_region), mergeable: false, }; @@ -127,7 +126,6 @@ fn create_dummy_pmem() -> Pmem { file, guest_addr, dummy_user_mapping, - dummy_mmap_region, false, SeccompAction::Allow, EventFd::new(EFD_NONBLOCK).unwrap(), diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 53de9bad3..d9dc9b754 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -723,8 +723,8 @@ impl vm::Vm for KvmVm { &self, slot: u32, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, + userspace_addr: *mut u8, readonly: bool, log_dirty_pages: bool, ) -> vm::Result<()> { @@ -741,8 +741,8 @@ impl vm::Vm for KvmVm { let mut region = kvm_userspace_memory_region { slot, guest_phys_addr, - memory_size, - userspace_addr, + memory_size: memory_size as u64, + userspace_addr: userspace_addr as usize as u64, flags, }; @@ -787,8 +787,8 @@ impl vm::Vm for KvmVm { &self, slot: u32, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, + userspace_addr: *mut u8, readonly: bool, log_dirty_pages: bool, ) -> vm::Result<()> { @@ -805,8 +805,8 @@ impl vm::Vm for KvmVm { let mut region = kvm_userspace_memory_region { slot, guest_phys_addr, - memory_size, - userspace_addr, + memory_size: memory_size as u64, + userspace_addr: userspace_addr as usize as u64, flags, }; diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 1fe60d368..3ca3d846f 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -1886,8 +1886,8 @@ impl vm::Vm for MshvVm { &self, _slot: u32, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, 
+ userspace_addr: *mut u8, readonly: bool, _log_dirty_pages: bool, ) -> vm::Result<()> { @@ -1899,8 +1899,8 @@ impl vm::Vm for MshvVm { let user_memory_region = mshv_user_mem_region { flags, guest_pfn: guest_phys_addr >> PAGE_SHIFT, - size: memory_size, - userspace_addr, + size: memory_size.try_into().unwrap(), + userspace_addr: (userspace_addr as usize).try_into().unwrap(), ..Default::default() }; // No matter read only or not we keep track the slots. @@ -1930,8 +1930,8 @@ impl vm::Vm for MshvVm { &self, _slot: u32, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, + userspace_addr: *mut u8, readonly: bool, _log_dirty_pages: bool, ) -> vm::Result<()> { @@ -1943,8 +1943,8 @@ impl vm::Vm for MshvVm { let user_memory_region = mshv_user_mem_region { flags, guest_pfn: guest_phys_addr >> PAGE_SHIFT, - size: memory_size, - userspace_addr, + size: memory_size.try_into().unwrap(), + userspace_addr: (userspace_addr as usize).try_into().unwrap(), ..Default::default() }; // Remove the corresponding entry from "self.dirty_log_slots" if needed diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index a01e499a7..9d4bf5c3d 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -345,8 +345,8 @@ pub trait Vm: Send + Sync + Any { &self, slot: u32, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, + userspace_addr: *mut u8, readonly: bool, log_dirty_pages: bool, ) -> Result<()>; @@ -359,8 +359,8 @@ pub trait Vm: Send + Sync + Any { &self, slot: u32, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, + userspace_addr: *mut u8, readonly: bool, log_dirty_pages: bool, ) -> Result<()>; diff --git a/pci/src/lib.rs b/pci/src/lib.rs index c95a38b33..8c1532491 100644 --- a/pci/src/lib.rs +++ b/pci/src/lib.rs @@ -11,6 +11,7 @@ extern crate log; mod bus; mod configuration; mod device; +mod mmap; mod msi; mod msix; mod vfio; diff --git a/pci/src/mmap.rs b/pci/src/mmap.rs 
new file mode 100644 index 000000000..79d3ab20b --- /dev/null +++ b/pci/src/mmap.rs @@ -0,0 +1,89 @@ +// Copyright © 2025 Demi Marie Obenour +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Helpers for `mmap()` + +use core::ffi::c_int; +use core::ptr::null_mut; +use std::io::{Error, ErrorKind}; +use std::os::fd::{AsRawFd as _, BorrowedFd}; + +use libc::size_t; + +/// A region of `mmap()`-allocated memory that calls `munmap()` when dropped. +/// This guarantees that the buffer is valid and that its address space +/// will be reserved. The address space is not guaranteed to be accessible. +/// Atomic access to the data will not cause undefined behavior but might +/// cause SIGSEGV or SIGBUS. Non-atomic access will generally cause data +/// races and thus Undefined Behavior. +#[derive(Debug)] +pub struct MmapRegion { + addr: *mut u8, + len: size_t, +} + +impl Drop for MmapRegion { + fn drop(&mut self) { + // SAFETY: guaranteed by type validity invariant + unsafe { assert_eq!(libc::munmap(self.addr as *mut _, self.len), 0) } + } +} +// SAFETY: the caller is responsible for avoiding data races +unsafe impl Send for MmapRegion {} +// SAFETY: the caller is responsible for avoiding data races +unsafe impl Sync for MmapRegion {} + +impl MmapRegion { + #[inline] + pub fn addr(&self) -> *mut u8 { + self.addr + } + + /// Return the length of the region. + /// This function promises that the return value fits in [`libc::size_t`] + /// and in [`isize`] and `unsafe` code can rely on this. + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Create an [`MmapRegion`] using `mmap` of a file descriptor. 
+ pub fn mmap( + len: u64, + prot: c_int, + fd: BorrowedFd, + offset1: u64, + offset2: u64, + ) -> std::io::Result { + const BAD_LENGTH: &str = "Mapping length must fit \ +in both isize and libc::size_t"; + const BAD_OFFSET: &str = "Offsets must fit in libc::off_t"; + let Some(offset) = offset1.checked_add(offset2) else { + return Err(Error::new(ErrorKind::InvalidInput, BAD_OFFSET)); + }; + let Ok(offset) = libc::off_t::try_from(offset) else { + return Err(Error::new(ErrorKind::InvalidInput, BAD_OFFSET)); + }; + if isize::try_from(len).is_err() { + return Err(Error::new(ErrorKind::InvalidInput, BAD_LENGTH)); + } + let Ok(len) = libc::size_t::try_from(len) else { + return Err(Error::new(ErrorKind::InvalidInput, BAD_LENGTH)); + }; + + assert!( + (prot & !(libc::PROT_READ | libc::PROT_WRITE | libc::PROT_EXEC)) == 0, + "bad protection" + ); + let flags = libc::MAP_SHARED; + // SAFETY: FFI call with correct parameters. + let addr = unsafe { libc::mmap(null_mut(), len, prot, flags, fd.as_raw_fd(), offset) }; + if addr == libc::MAP_FAILED { + Err(Error::last_os_error()) + } else { + let addr = addr as _; + Ok(Self { addr, len }) + } + } +} diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 3d59293ae..7c85f557c 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -6,9 +6,9 @@ use std::any::Any; use std::collections::{BTreeMap, HashMap}; use std::io; +use std::os::fd::BorrowedFd; use std::os::unix::io::AsRawFd; use std::path::PathBuf; -use std::ptr::null_mut; use std::sync::{Arc, Barrier, Mutex}; use anyhow::anyhow; @@ -34,6 +34,7 @@ use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsiz use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vmm_sys_util::eventfd::EventFd; +use crate::mmap::MmapRegion; use crate::msi::{MSI_CONFIG_ID, MsiConfigState}; use crate::msix::MsixConfigState; use crate::{ @@ -258,12 +259,11 @@ impl Interrupt { } } -#[derive(Copy, Clone)] +#[derive(Clone)] pub struct
UserMemoryRegion { pub slot: u32, pub start: u64, - pub size: u64, - pub host_addr: u64, + pub mapping: Arc, } #[derive(Clone)] @@ -275,12 +275,17 @@ pub struct MmioRegion { pub(crate) user_memory_regions: Vec, } -trait MmioRegionRange { +/// # Safety +/// +/// [`Self::find_user_address`] must always either return `Err` +/// or a pointer to `size` bytes of valid memory. +unsafe trait MmioRegionRange { fn check_range(&self, guest_addr: u64, size: u64) -> bool; - fn find_user_address(&self, guest_addr: u64) -> Result; + fn find_user_address(&self, guest_addr: u64, size: u64) -> Result<*mut u8, io::Error>; } -impl MmioRegionRange for Vec { +// SAFETY: See the comment in `find_user_address`. +unsafe impl MmioRegionRange for Vec { // Check if a guest address is within the range of mmio regions fn check_range(&self, guest_addr: u64, size: u64) -> bool { for region in self.iter() { @@ -298,14 +303,33 @@ impl MmioRegionRange for Vec { } // Locate the user region address for a guest address within all mmio regions - fn find_user_address(&self, guest_addr: u64) -> Result { + fn find_user_address(&self, guest_addr: u64, size: u64) -> Result<*mut u8, io::Error> { for region in self.iter() { for user_region in region.user_memory_regions.iter() { - if guest_addr >= user_region.start - && guest_addr < user_region.start + user_region.size - { - return Ok(user_region.host_addr + (guest_addr - user_region.start)); + let mapping: &MmapRegion = &user_region.mapping; + let start: u64 = user_region.start; + let len: u64 = mapping.len().try_into().unwrap(); + // See if the guest address is inside the region. + let Some(offset_from_start) = guest_addr.checked_sub(start) else { + continue; + }; + if offset_from_start >= len { + continue; } + // Check that the size is in bounds. + // This enforces the invariant promised by implementing MmioRegionRange. 
+ assert!( + size <= len - offset_from_start, + "Attempt to read {size} bytes at offset {offset_from_start} into \ +a region of size {len}" + ); + // SAFETY: MmapRegion guarantees that mapping.addr points to at least + // mapping.len() bytes of valid memory. offset_from_start is equal + // to guest_addr - start, which was checked to be less than mapping.len(). + // Therefore, the returned pointer is still in the range of valid memory. + // Also, since mapping.len() fit in usize, offset_from_start must as well, + // so the cast is safe. + return Ok(unsafe { mapping.addr().add(offset_from_start as usize) }); } } @@ -1585,7 +1609,8 @@ impl VfioPciDevice { /// * `mem_slot` - The closure to return a memory slot. pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> { let fd = self.device.as_raw_fd(); - + // SAFETY: fd is guaranteed valid + let fd = unsafe { BorrowedFd::borrow_raw(fd) }; for region in self.common.mmio_regions.iter_mut() { let region_flags = self.device.get_region_flags(region.index); if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 { @@ -1625,71 +1650,72 @@ impl VfioPciDevice { self.common.interrupt.msix.as_ref(), )?; - for area in sparse_areas.iter() { - // SAFETY: FFI call with correct arguments - let host_addr = unsafe { - libc::mmap( - null_mut(), - area.size as usize, - prot, - libc::MAP_SHARED, - fd, - mmap_offset as libc::off_t + area.offset as libc::off_t, - ) - }; - - if std::ptr::eq(host_addr, libc::MAP_FAILED) { + for area in &sparse_areas { + if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) { error!( - "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}", - area.offset, - area.size, - std::io::Error::last_os_error() + "Could not mmap sparse area that is not page size aligned \ +(offset = 0x{:x}, size = 0x{:x})", + area.offset, area.size ); return Err(VfioPciError::MmapArea); } + } - if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) { - warn!( - "Could not mmap sparse area 
that is not page size aligned (offset = 0x{:x}, size = 0x{:x})", - area.offset, area.size, - ); - return Ok(()); - } + for area in sparse_areas.iter() { + let mapping = match MmapRegion::mmap( + area.size, + prot, + fd, + mmap_offset, + area.offset, + ) { + Ok(mapping) => mapping, + Err(e) => { + error!( + "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}", + area.offset, + area.size, + e + ); + return Err(VfioPciError::MmapArea); + } + }; let user_memory_region = UserMemoryRegion { slot: self.memory_slot_allocator.next_memory_slot(), start: region.start.0 + area.offset, - size: area.size, - host_addr: host_addr as u64, + mapping: Arc::new(mapping), }; - - // SAFETY: host_addr was allocated by mmap() and points to size - // bytes of memory. + // SAFETY: MmapRegion invariants guarantee that + // user_memory_region.mapping.addr() points to + // user_memory_region.mapping.len() bytes of + // valid memory that will only be unmapped with munmap().
unsafe { self.vm.create_user_memory_region( user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + user_memory_region.mapping.len(), + user_memory_region.mapping.addr(), false, false, ) } .map_err(VfioPciError::CreateUserMemoryRegion)?; - region.user_memory_regions.push(user_memory_region); - if !self.iommu_attached { self.container .vfio_dma_map( user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + user_memory_region.mapping.len().try_into().unwrap(), + (user_memory_region.mapping.addr() as usize) + .try_into() + .unwrap(), ) .map_err(|e| { VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf) })?; } + region.user_memory_regions.push(user_memory_region); } } } @@ -1698,19 +1724,21 @@ impl VfioPciDevice { } pub fn unmap_mmio_regions(&mut self) { - for region in self.common.mmio_regions.iter() { - for user_memory_region in region.user_memory_regions.iter() { + for region in self.common.mmio_regions.iter_mut() { + for user_memory_region in region.user_memory_regions.drain(..) 
{ + let len = user_memory_region.mapping.len(); + let host_addr = user_memory_region.mapping.addr(); // Unmap from vfio container if !self.iommu_attached && let Err(e) = self .container - .vfio_dma_unmap(user_memory_region.start, user_memory_region.size) + .vfio_dma_unmap(user_memory_region.start, len.try_into().unwrap()) .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf)) { error!( "Could not unmap mmio region from vfio container: \ iova 0x{:x}, size 0x{:x}: {}, ", - user_memory_region.start, user_memory_region.size, e + user_memory_region.start, len, e ); } @@ -1721,8 +1749,8 @@ impl VfioPciDevice { self.vm.remove_user_memory_region( user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + len, + host_addr, false, false, ) @@ -1732,21 +1760,6 @@ impl VfioPciDevice { self.memory_slot_allocator .free_memory_slot(user_memory_region.slot); - - // SAFETY: FFI call with correct arguments - let ret = unsafe { - libc::munmap( - user_memory_region.host_addr as *mut libc::c_void, - user_memory_region.size as usize, - ) - }; - if ret != 0 { - error!( - "Could not unmap region {}, error:{}", - region.index, - io::Error::last_os_error() - ); - } } } } @@ -1886,29 +1899,31 @@ impl PciDevice for VfioPciDevice { region.start = GuestAddress(new_base); for user_memory_region in region.user_memory_regions.iter_mut() { + let len = user_memory_region.mapping.len(); + let host_addr = user_memory_region.mapping.addr(); // Unmap the old MMIO region from vfio container if !self.iommu_attached && let Err(e) = self .container - .vfio_dma_unmap(user_memory_region.start, user_memory_region.size) + .vfio_dma_unmap(user_memory_region.start, len.try_into().unwrap()) .map_err(|e| { VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf) }) { error!( "Could not unmap mmio region from vfio container: \ - iova 0x{:x}, size 0x{:x}: {}, ", - user_memory_region.start, user_memory_region.size, e +iova 0x{:x}, size 
0x{:x}: {}, ", + user_memory_region.start, len, e ); } // Remove old region - // SAFETY: user_memory_regions has valid entries + // SAFETY: validity of len and host_addr guaranteed by hypervisor::mmap::MmapRegion unsafe { self.vm.remove_user_memory_region( user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + len, + host_addr, false, false, ) @@ -1923,13 +1938,13 @@ impl PciDevice for VfioPciDevice { } // Insert new region - // SAFETY: mmio_regions only has valid values + // SAFETY: validity of len and host_addr guaranteed by hypervisor::mmap::MmapRegion unsafe { self.vm.create_user_memory_region( user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + len, + host_addr, false, false, ) @@ -1941,8 +1956,8 @@ impl PciDevice for VfioPciDevice { self.container .vfio_dma_map( user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + len.try_into().unwrap(), + (host_addr as usize).try_into().unwrap(), ) .map_err(|e| { VfioPciError::DmaMap(e, self.device_path.clone(), self.bdf) @@ -1950,8 +1965,8 @@ impl PciDevice for VfioPciDevice { .map_err(|e| { io::Error::other(format!( "Could not map mmio region to vfio container: \ - iova 0x{:x}, size 0x{:x}: {}, ", - user_memory_region.start, user_memory_region.size, e +iova 0x{:x}, size 0x{:x}: {}, ", + user_memory_region.start, len, e )) })?; } @@ -1987,6 +2002,7 @@ impl Snapshottable for VfioPciDevice { Ok(vfio_pci_dev_snapshot) } } + impl Transportable for VfioPciDevice {} impl Migratable for VfioPciDevice {} @@ -2020,11 +2036,17 @@ impl VfioDmaMapping { impl ExternalDmaMapping for VfioDmaMapping { fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> { + let Ok(usize_size): Result = size.try_into() else { + return Err(io::Error::other(format!("size {size} overflows usize"))); + }; let mem = self.memory.memory(); let guest_addr = GuestAddress(gpa); - let 
user_addr = if mem.check_range(guest_addr, size as usize) { - match mem.get_host_address(guest_addr) { - Ok(t) => t as u64, + let user_addr = if mem.check_range(guest_addr, usize_size) { + match mem.get_slice(guest_addr, usize_size) { + Ok(t) => { + assert!(t.len() >= usize_size); + Ok(t.ptr_guard_mut()) + } Err(e) => { return Err(io::Error::other(format!( "unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}" @@ -2032,15 +2054,26 @@ impl ExternalDmaMapping for VfioDmaMapping p.as_ptr(), + Err(p) => p, + }; + // SAFETY: find_user_address and GuestMemory::get_slice() guarantee that + // the returned pointer is valid for up to `usize_size` bytes. + // `usize_size` is always equal to `size` due to the above `try_into()` call. self.container - .vfio_dma_map(iova, size, user_addr) + .vfio_dma_map(iova, size, (user_addr as usize).try_into().unwrap()) .map_err(|e| { io::Error::other(format!( "failed to map memory for VFIO container, \ diff --git a/pci/src/vfio_user.rs b/pci/src/vfio_user.rs index 79b563872..3ce9ebdb0 100644 --- a/pci/src/vfio_user.rs +++ b/pci/src/vfio_user.rs @@ -4,8 +4,8 @@ // use std::any::Any; +use std::os::fd::AsFd; use std::os::unix::prelude::AsRawFd; -use std::ptr::null_mut; use std::sync::{Arc, Barrier, Mutex}; use hypervisor::HypervisorVmError; @@ -24,6 +24,7 @@ use vm_memory::{ use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vmm_sys_util::eventfd::EventFd; +use crate::mmap::MmapRegion; use crate::vfio::{UserMemoryRegion, VFIO_COMMON_ID, Vfio, VfioCommon, VfioError}; use crate::{ BarReprogrammingParams, PciBarConfiguration, PciBdf, PciDevice, PciDeviceError, PciSubclass, @@ -52,6 +53,8 @@ pub enum VfioUserPciDeviceError { InitializeLegacyInterrupts(#[source] VfioPciError), #[error("Failed to create VfioCommon")] CreateVfioCommon(#[source] VfioPciError), + #[error("Other OS error")] + Other(#[source] std::io::Error), } #[derive(Copy, Clone)] @@ -110,10 +113,8 @@ impl 
VfioUserPciDevice { }) } - /// # Safety - /// - /// Not known yet (TODO) - pub unsafe fn map_mmio_regions(&mut self) -> Result<(), VfioUserPciDeviceError> { + /// Map all of the MMIO regions. + pub fn map_mmio_regions(&mut self) -> Result<(), VfioUserPciDeviceError> { for mmio_region in &mut self.common.mmio_regions { let region_flags = self .client @@ -158,43 +159,39 @@ impl VfioUserPciDevice { sparse_areas }; - for s in mmaps.iter() { - // SAFETY: FFI call with correct arguments - let host_addr = unsafe { - libc::mmap( - null_mut(), - s.size as usize, - prot, - libc::MAP_SHARED, - file_offset.as_ref().unwrap().file().as_raw_fd(), - file_offset.as_ref().unwrap().start() as libc::off_t - + s.offset as libc::off_t, - ) - }; + let file_offset = file_offset.as_ref().unwrap(); - if std::ptr::eq(host_addr, libc::MAP_FAILED) { - error!( - "Could not mmap regions, error:{}", - std::io::Error::last_os_error() - ); - continue; - } + for s in mmaps.iter() { + let mapping = match MmapRegion::mmap( + s.size, + prot, + file_offset.file().as_fd(), + file_offset.start(), + s.offset, + ) { + Ok(mapping) => Arc::new(mapping), + Err(e) => { + error!( + "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}", + s.offset, s.size, e + ); + return Err(VfioUserPciDeviceError::Other(e)); + } + }; let user_memory_region = UserMemoryRegion { slot: self.memory_slot_allocator.next_memory_slot(), start: mmio_region.start.0 + s.offset, - size: s.size, - host_addr: host_addr as u64, + mapping, }; - // SAFETY: host_addr was just allocated with mmap() - // and points to size bytes of valid address. 
+ // SAFETY: validity of len and host_addr guaranteed by hypervisor::mmap::MmapRegion unsafe { self.vm.create_user_memory_region( user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + user_memory_region.mapping.len(), + user_memory_region.mapping.addr(), false, false, ) @@ -209,17 +206,17 @@ impl VfioUserPciDevice { Ok(()) } - pub fn unmap_mmio_regions(&mut self) { - for mmio_region in self.common.mmio_regions.iter() { - for user_memory_region in mmio_region.user_memory_regions.iter() { + fn unmap_mmio_regions(&mut self) { + for mmio_region in self.common.mmio_regions.iter_mut() { + for user_memory_region in mmio_region.user_memory_regions.drain(..) { // Remove region - // SAFETY: only valid regions are in user_memory_regions + // SAFETY: guaranteed by hypervisor::mmap::MmapRegion invariants if let Err(e) = unsafe { self.vm.remove_user_memory_region( user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + user_memory_region.mapping.len(), + user_memory_region.mapping.addr(), false, false, ) @@ -229,22 +226,7 @@ impl VfioUserPciDevice { self.memory_slot_allocator .free_memory_slot(user_memory_region.slot); - - // Remove mmaps - // SAFETY: FFI call with correct arguments - let ret = unsafe { - libc::munmap( - user_memory_region.host_addr as *mut libc::c_void, - user_memory_region.size as usize, - ) - }; - if ret != 0 { - error!( - "Could not unmap region {}, error:{}", - mmio_region.index, - std::io::Error::last_os_error() - ); - } + // memory will be unmapped on drop } } } @@ -465,8 +447,8 @@ impl PciDevice for VfioUserPciDevice { self.vm.remove_user_memory_region( user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + user_memory_region.mapping.len(), + user_memory_region.mapping.addr(), false, false, ) @@ -486,8 +468,8 @@ impl PciDevice for VfioUserPciDevice { self.vm.create_user_memory_region( 
user_memory_region.slot, user_memory_region.start, - user_memory_region.size, - user_memory_region.host_addr, + user_memory_region.mapping.len(), + user_memory_region.mapping.addr(), false, false, ) diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index a30ad6064..cd9a8fea7 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -16,13 +16,13 @@ use std::thread; use libc::EFD_NONBLOCK; use virtio_queue::Queue; use vm_device::UserspaceMapping; -use vm_memory::{GuestAddress, GuestMemoryAtomic, GuestUsize}; +use vm_memory::{GuestAddress, GuestMemoryAtomic}; use vm_migration::{MigratableError, Pausable}; use vm_virtio::{AccessPlatform, VirtioDeviceType}; use vmm_sys_util::eventfd::EventFd; use crate::{ - ActivateError, ActivateResult, Error, GuestMemoryMmap, GuestRegionMmap, + ActivateError, ActivateResult, Error, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_RING_INDIRECT_DESC, }; @@ -46,10 +46,9 @@ pub struct VirtioSharedMemory { #[derive(Clone)] pub struct VirtioSharedMemoryList { - pub host_addr: u64, pub mem_slot: u32, pub addr: GuestAddress, - pub len: GuestUsize, + pub mapping: Arc, pub region_list: Vec, } diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index d76655847..ff8ad296b 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -34,7 +34,7 @@ use super::{ }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; -use crate::{GuestMemoryMmap, MmapRegion, VirtioInterrupt, VirtioInterruptType}; +use crate::{GuestMemoryMmap, VirtioInterrupt, VirtioInterruptType}; const QUEUE_SIZE: u16 = 256; const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE]; @@ -267,10 +267,6 @@ pub struct Pmem { mapping: UserspaceMapping, seccomp_action: SeccompAction, exit_evt: EventFd, - - // Hold ownership of the memory that is allocated for the device - // which will be automatically dropped when the device is dropped - _region: MmapRegion, } #[derive(Serialize, 
Deserialize)] @@ -287,7 +283,6 @@ impl Pmem { disk: File, addr: GuestAddress, mapping: UserspaceMapping, - _region: MmapRegion, iommu: bool, seccomp_action: SeccompAction, exit_evt: EventFd, @@ -304,7 +299,7 @@ impl Pmem { } else { let config = VirtioPmemConfig { start: addr.raw_value().to_le(), - size: (_region.size() as u64).to_le(), + size: (mapping.mapping.size() as u64).to_le(), }; let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; @@ -331,7 +326,6 @@ impl Pmem { config, mapping, seccomp_action, - _region, exit_evt, }) } diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index da54ef64d..ca60c2c06 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -1028,7 +1028,7 @@ impl PciDevice for VirtioPciDevice { let bar = PciBarConfiguration::default() .set_index(VIRTIO_SHM_BAR_INDEX) .set_address(shm_list.addr.raw_value()) - .set_size(shm_list.len); + .set_size(shm_list.mapping.size() as _); // The creation of the PCI BAR and its associated capabilities must // happen only during the creation of a brand new VM. 
When a VM is diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 2374a1cc7..b8d728920 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -356,10 +356,9 @@ impl VirtioDevice for Fs { let mut mappings = Vec::new(); if let Some(cache) = self.cache.as_ref() { mappings.push(UserspaceMapping { - host_addr: cache.0.host_addr, mem_slot: cache.0.mem_slot, addr: cache.0.addr, - len: cache.0.len, + mapping: cache.0.mapping.clone(), mergeable: false, }); } diff --git a/vm-device/src/lib.rs b/vm-device/src/lib.rs index f484e9f14..61e765950 100644 --- a/vm-device/src/lib.rs +++ b/vm-device/src/lib.rs @@ -3,8 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::sync::Arc; + use serde::{Deserialize, Serialize}; -use vm_memory::{GuestAddress, GuestUsize}; +use vm_memory::bitmap::AtomicBitmap; +use vm_memory::{GuestAddress, MmapRegion}; mod bus; pub mod dma_mapping; @@ -62,9 +65,8 @@ pub enum Resource { #[derive(Clone)] pub struct UserspaceMapping { - pub host_addr: u64, pub mem_slot: u32, pub addr: GuestAddress, - pub len: GuestUsize, + pub mapping: Arc>, pub mergeable: bool, } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 1293a0daa..622c183f0 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -97,8 +97,10 @@ use vm_device::interrupt::{ InterruptIndex, InterruptManager, LegacyIrqGroupConfig, MsiIrqGroupConfig, }; use vm_device::{Bus, BusDevice, BusDeviceSync, Resource, UserspaceMapping}; +#[cfg(feature = "ivshmem")] +use vm_memory::bitmap::AtomicBitmap; use vm_memory::guest_memory::FileOffset; -use vm_memory::{Address, GuestAddress, GuestMemoryRegion, GuestUsize, MmapRegion}; +use vm_memory::{Address, GuestAddress, GuestMemoryRegion, GuestUsize, MmapRegion, VolatileMemory}; #[cfg(target_arch = "x86_64")] use vm_memory::{GuestAddressSpace, GuestMemory}; use vm_migration::protocol::MemoryRangeTable; @@ -817,15 +819,15 @@ impl DeviceRelocation 
for AddressManager { if let Some(mut shm_regions) = virtio_dev.get_shm_regions() && shm_regions.addr.raw_value() == old_base { - // SAFETY: TODO what are the invariants here? + // SAFETY: guaranteed by MmapRegion invariants unsafe { // Remove old mapping self.vm .remove_user_memory_region( shm_regions.mem_slot, old_base, - shm_regions.len, - shm_regions.host_addr, + shm_regions.mapping.len(), + shm_regions.mapping.as_ptr(), false, false, ) @@ -834,17 +836,14 @@ impl DeviceRelocation for AddressManager { "failed to remove user memory region: {e:?}" )) })?; - } - // SAFETY: TODO what are the invariants here? - unsafe { // Create new mapping by inserting new region to KVM. self.vm .create_user_memory_region( shm_regions.mem_slot, new_base, - shm_regions.len, - shm_regions.host_addr, + shm_regions.mapping.len(), + shm_regions.mapping.as_ptr(), false, false, ) @@ -3249,10 +3248,11 @@ impl DeviceManager { }, ) .map_err(DeviceManagerError::NewMmapRegion)?; - let host_addr: u64 = mmap_region.as_ptr() as u64; + let host_addr = mmap_region.as_ptr(); // SAFETY: host_addr points to region_size bytes of mmap-allocated memory. 
let mem_slot = unsafe { + let region_size = region_size.try_into().unwrap(); self.memory_manager .lock() .unwrap() @@ -3261,10 +3261,9 @@ impl DeviceManager { }?; let mapping = UserspaceMapping { - host_addr, mem_slot, addr: GuestAddress(region_base), - len: region_size, + mapping: Arc::new(mmap_region), mergeable: false, }; @@ -3274,7 +3273,6 @@ impl DeviceManager { file, GuestAddress(region_base), mapping, - mmap_region, self.force_iommu | pmem_cfg.iommu, self.seccomp_action.clone(), self.exit_evt @@ -4010,16 +4008,13 @@ impl DeviceManager { resources, )?; - // SAFETY: TODO // Note it is required to call 'add_pci_device()' in advance to have the list of // mmio regions provisioned correctly - unsafe { - vfio_user_pci_device - .lock() - .unwrap() - .map_mmio_regions() - .map_err(DeviceManagerError::VfioUserMapRegion) - }?; + vfio_user_pci_device + .lock() + .unwrap() + .map_mmio_regions() + .map_err(DeviceManagerError::VfioUserMapRegion)?; let mut node = device_node!(vfio_user_name, vfio_user_pci_device); @@ -4747,8 +4742,8 @@ impl DeviceManager { .unwrap() .remove_userspace_mapping( mapping.addr.raw_value(), - mapping.len, - mapping.host_addr, + mapping.mapping.size(), + mapping.mapping.as_ptr() as _, mapping.mergeable, mapping.mem_slot, ) @@ -5002,13 +4997,12 @@ impl IvshmemOps for IvshmemHandler { start_addr: u64, size: usize, backing_file: Option, - ) -> Result<(Arc, UserspaceMapping), IvshmemError> { + ) -> Result<(Arc>, UserspaceMapping), IvshmemError> { info!("Creating ivshmem mem region at 0x{start_addr:x}"); - let region: Arc = MemoryManager::create_ram_region( + let region = MemoryManager::create_ram_region_raw( &backing_file, 0, - GuestAddress(start_addr), size, false, true, @@ -5021,12 +5015,12 @@ impl IvshmemOps for IvshmemHandler { .map_err(|_| IvshmemError::CreateUserMemoryRegion)?; let mem_slot = { let mut manager = self.memory_manager.lock().unwrap(); - // SAFETY: guaranteed by GuestRegionMmap invariants + // SAFETY: guaranteed by MmapRegion 
invariants unsafe { manager.create_userspace_mapping( - region.start_addr().0, + start_addr, region.len(), - region.as_ptr() as u64, + region.as_ptr(), false, false, false, @@ -5034,11 +5028,11 @@ impl IvshmemOps for IvshmemHandler { } } .map_err(|_| IvshmemError::CreateUserspaceMapping)?; + let region = Arc::new(region); let mapping = UserspaceMapping { - host_addr: region.as_ptr() as u64, + mapping: region.clone(), mem_slot, - addr: GuestAddress(region.start_addr().0), - len: region.len(), + addr: GuestAddress(start_addr), mergeable: false, }; Ok((region, mapping)) @@ -5050,8 +5044,8 @@ impl IvshmemOps for IvshmemHandler { unsafe { manager.remove_userspace_mapping( mapping.addr.raw_value(), - mapping.len, - mapping.host_addr, + mapping.mapping.len(), + mapping.mapping.as_ptr(), mapping.mergeable, mapping.mem_slot, ) diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index 146ea610e..a9efbe99a 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -8,7 +8,7 @@ use std::collections::BTreeMap; use std::collections::HashMap; use std::fs::{File, OpenOptions}; use std::io::{self}; -use std::ops::{BitAnd, Deref, Not, Sub}; +use std::ops::{BitAnd, Not, Sub}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use std::os::fd::AsFd; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; @@ -900,13 +900,12 @@ impl MemoryManager { for (zone_id, regions) in list { for (region, virtio_mem) in regions { - // SAFETY: regions only holds valid addresses. - // TODO: encapsulate this unsafety in a small part of the file. 
+ // SAFETY: guaranteed by GuestRegionMmap invariants let slot = unsafe { self.create_userspace_mapping( region.start_addr().raw_value(), - region.len(), - region.as_ptr() as u64, + region.len().try_into().unwrap(), + region.as_ptr(), self.mergeable, false, self.log_dirty, @@ -962,13 +961,16 @@ impl MemoryManager { arch::layout::UEFI_START, ) .unwrap(); + const _: () = assert!(core::mem::size_of::() == core::mem::size_of::()); + + // SAFETY: guaranteed by GuestRegionMmap unsafe { self.vm .create_user_memory_region( uefi_mem_slot, uefi_region.start_addr().raw_value(), - uefi_region.len(), - uefi_region.as_ptr() as u64, + uefi_region.len() as usize, + uefi_region.as_ptr(), false, false, ) @@ -1364,10 +1366,9 @@ impl MemoryManager { } #[allow(clippy::too_many_arguments)] - pub fn create_ram_region( + pub fn create_ram_region_raw( backing_file: &Option, file_offset: u64, - start_addr: GuestAddress, size: usize, prefault: bool, shared: bool, @@ -1376,7 +1377,7 @@ impl MemoryManager { host_numa_node: Option, existing_memory_file: Option, thp: bool, - ) -> Result, Error> { + ) -> Result, Error> { let mut mmap_flags = libc::MAP_NORESERVE; // The duplication of mmap_flags ORing here is unfortunate but it also makes @@ -1403,17 +1404,13 @@ impl MemoryManager { None }; - let region = GuestRegionMmap::new( - MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) - .map_err(Error::GuestMemoryRegion)?, - start_addr, - ) - .map_err(Error::GuestMemory)?; + let region = MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) + .map_err(Error::GuestMemoryRegion)?; // Apply NUMA policy if needed. 
if let Some(node) = host_numa_node { - let addr = region.deref().as_ptr(); - let len = region.deref().size() as u64; + let addr = region.as_ptr(); + let len = region.size() as u64; let mode = MPOL_BIND; let mut nodemask: Vec = Vec::new(); let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; @@ -1498,7 +1495,39 @@ impl MemoryManager { } } - Ok(Arc::new(region)) + Ok(region) + } + + #[allow(clippy::too_many_arguments)] + pub fn create_ram_region( + backing_file: &Option, + file_offset: u64, + start_addr: GuestAddress, + size: usize, + prefault: bool, + shared: bool, + hugepages: bool, + hugepage_size: Option, + host_numa_node: Option, + existing_memory_file: Option, + thp: bool, + ) -> Result, Error> { + let r = Self::create_ram_region_raw( + backing_file, + file_offset, + size, + prefault, + shared, + hugepages, + hugepage_size, + host_numa_node, + existing_memory_file, + thp, + )?; + + Ok(Arc::new( + GuestRegionMmap::new(r, start_addr).map_err(Error::GuestMemory)?, + )) } // Duplicate of `memory_zone_get_align_size` that does not require a `zone` @@ -1612,12 +1641,12 @@ impl MemoryManager { )?; // Map it into the guest - // SAFETY: create_ram_region only produces valid mappings. 
+ // SAFETY: guaranteed by GuestMmapRegion invariants let slot = unsafe { self.create_userspace_mapping( region.start_addr().0, - region.len(), - region.as_ptr() as u64, + region.len().try_into().unwrap(), + region.as_ptr(), self.mergeable, false, self.log_dirty, @@ -1722,8 +1751,8 @@ impl MemoryManager { pub unsafe fn create_userspace_mapping( &mut self, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, + userspace_addr: *mut u8, mergeable: bool, readonly: bool, log_dirty: bool, @@ -1731,10 +1760,11 @@ impl MemoryManager { let slot = self.allocate_memory_slot(); info!( - "Creating userspace mapping: {guest_phys_addr:x} -> {userspace_addr:x} {memory_size:x}, slot {slot}" + "Creating userspace mapping: {guest_phys_addr:x} -> {userspace_addr_:x} {memory_size:x}, slot {slot}", + userspace_addr_ = userspace_addr as u64 ); - // SAFETY: promised by caller + // SAFETY: caller promises parameters are correct. unsafe { self.vm .create_user_memory_region( @@ -1788,7 +1818,8 @@ impl MemoryManager { } info!( - "Created userspace mapping: {guest_phys_addr:x} -> {userspace_addr:x} {memory_size:x}" + "Created userspace mapping: {guest_phys_addr:x} -> {userspace_addr_:x} {memory_size:x}", + userspace_addr_ = userspace_addr as u64 ); Ok(slot) @@ -1806,12 +1837,12 @@ impl MemoryManager { pub unsafe fn remove_userspace_mapping( &mut self, guest_phys_addr: u64, - memory_size: u64, - userspace_addr: u64, + memory_size: usize, + userspace_addr: *mut u8, mergeable: bool, slot: u32, ) -> Result<(), Error> { - // SAFETY: The caller promises that the parameters are correct. + // SAFETY: Caller promises parameters are correct. 
unsafe { self.vm .remove_user_memory_region( @@ -1852,7 +1883,8 @@ impl MemoryManager { } info!( - "Removed userspace mapping: {guest_phys_addr:x} -> {userspace_addr:x} {memory_size:x}" + "Removed userspace mapping: {guest_phys_addr:x} -> {userspace_addr_:x} {memory_size:x}", + userspace_addr_ = userspace_addr as u64 ); Ok(()) diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 89255e28b..8803f68aa 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -3469,8 +3469,8 @@ mod unit_tests { vm.create_user_memory_region( index as u32, region.start_addr().raw_value(), - region.len(), - region.as_ptr() as u64, + region.len().try_into().unwrap(), + region.as_ptr(), false, false, ) @@ -3607,8 +3607,8 @@ pub fn test_vm() { vm.create_user_memory_region( index as u32, region.start_addr().raw_value(), - region.len(), - region.as_ptr() as u64, + region.len().try_into().unwrap(), + region.as_ptr() as _, false, false, )