diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index f84cd40cc..02dc09a4e 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -19,7 +19,8 @@ use std::{io, result}; use anyhow::anyhow; use block::async_io::{AsyncIo, AsyncIoError, DiskFile}; -use block::{build_serial, Request, RequestType, VirtioBlockConfig}; +use block::fcntl::{get_lock_state, LockError, LockType}; +use block::{build_serial, fcntl, Request, RequestType, VirtioBlockConfig}; use rate_limiter::group::{RateLimiterGroup, RateLimiterGroupHandle}; use rate_limiter::TokenType; use seccompiler::SeccompAction; @@ -80,6 +81,16 @@ pub enum Error { RequestStatus(GuestMemoryError), #[error("Failed to enable notification: {0}")] QueueEnableNotification(virtio_queue::Error), + #[error("Failed to get {lock_type:?} lock for disk image {path}: {error}")] + LockDiskImage { + /// The underlying error. + #[source] + error: LockError, + /// The requested lock type. + lock_type: LockType, + /// The path of the disk image. + path: PathBuf, + }, } pub type Result = result::Result; @@ -703,6 +714,53 @@ impl Block { }) } + /// Tries to set an advisory lock for the corresponding disk image. + pub fn try_lock_image(&mut self) -> Result<()> { + let lock_type = match self.read_only { + true => LockType::Read, + false => LockType::Write, + }; + log::debug!( + "Attempting to acquire {lock_type:?} lock for disk image id={},path={}", + self.id, + self.disk_path.display() + ); + let fd = self.disk_image.fd(); + fcntl::try_acquire_lock(fd, lock_type).map_err(|error| { + let current_lock = get_lock_state(fd); + // Don't propagate the error to the outside, as it is not useful at all. Instead, + // we try to log additional help to the user. + if let Ok(current_lock) = current_lock { + log::error!("Can't get {lock_type:?} lock for {} as there is already a {current_lock:?} lock", self.disk_path.display()); + } else { + log::error!("Can't get {lock_type:?} lock for {}, but also can't determine the current lock state", self.disk_path.display()); + } + Error::LockDiskImage { + path: self.disk_path.clone(), + error, + lock_type, + } + })?; + log::info!( + "Acquired {lock_type:?} lock for disk image id={},path={}", + self.id, + self.disk_path.display() + ); + Ok(()) + } + + /// Releases the advisory lock held for the corresponding disk image. + pub fn unlock_image(&mut self) -> Result<()> { + // It is very unlikely that this fails; + // Should we remove the Result to simplify the error propagation on + // higher levels? + fcntl::clear_lock(self.disk_image.fd()).map_err(|error| Error::LockDiskImage { + path: self.disk_path.clone(), + error, + lock_type: LockType::Unlock, + }) + } + fn state(&self) -> BlockState { BlockState { disk_path: self.disk_path.to_str().unwrap().to_owned(), diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index b22bc1c0b..3e843c26f 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -48,6 +48,9 @@ pub enum MigratableError { #[error("Failed to complete migration for migratable component: {0}")] CompleteMigration(#[source] anyhow::Error), + + #[error("Failed to release a disk lock before the migration: {0}")] + UnlockError(#[source] anyhow::Error), } /// A Pausable component can be paused and resumed. diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index b6eafdb2f..adf4d26ac 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -79,7 +79,7 @@ use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd}; use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport}; use virtio_devices::vhost_user::VhostUserConfig; use virtio_devices::{ - AccessPlatformMapping, ActivateError, Endpoint, IommuMapping, VdpaDmaMapping, + AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, VdpaDmaMapping, VirtioMemMappingSource, }; use vm_allocator::{AddressAllocator, SystemAllocator}; @@ -517,6 +517,9 @@ pub enum DeviceManagerError { // Invalid console fd InvalidConsoleFd, + + /// Cannot lock images of all block devices. + DiskLockError(virtio_devices::block::Error), } pub type DeviceManagerResult = result::Result; @@ -830,6 +833,9 @@ pub struct DeviceManager { // The virtio devices on the system virtio_devices: Vec, + /// All disks. Needed for locking and unlocking the images. + block_devices: Vec>>, + // List of bus devices // Let the DeviceManager keep strong references to the BusDevice devices. // This allows the IO and MMIO buses to be provided with Weak references, @@ -1158,6 +1164,7 @@ impl DeviceManager { memory_manager, cpu_manager, virtio_devices: Vec::new(), + block_devices: vec![], bus_devices: Vec::new(), device_id_cnt, msi_interrupt_manager, @@ -2292,6 +2299,35 @@ impl DeviceManager { Ok(tpm) } + /// Tries to acquire advisory locks for all disk images. + /// + /// This should only be called when a VM boots or VM state is restored. + /// For live-migration, the locks must be released on the destination side + /// before they are acquired again by the receiving side. + pub fn try_lock_disks(&self) -> DeviceManagerResult<()> { + for dev in &self.block_devices { + let mut dev = dev.lock().unwrap(); + dev.try_lock_image() + .map_err(DeviceManagerError::DiskLockError)?; + } + Ok(()) + } + + /// Release all advisory locks held for the disk images. + /// + /// This should only be called when the VM is stopped and the VMM supposed + /// to shut down. A new VMM, either after a live migration or a + /// state save/resume cycle, should then acquire all locks before the VM + /// starts to run. + pub fn release_disk_locks(&self) -> DeviceManagerResult<()> { + for dev in &self.block_devices { + let mut dev = dev.lock().unwrap(); + dev.unlock_image() + .map_err(DeviceManagerError::DiskLockError)?; + } + Ok(()) + } + fn make_virtio_devices(&mut self) -> DeviceManagerResult> { let mut devices: Vec = Vec::new(); @@ -2345,9 +2381,20 @@ impl DeviceManager { supported } + /// Creates a [`MetaVirtioDevice`] from the provided [`DiskConfig`]. + /// + /// Depending on the config, this is a [`vhost_user::Blk`] device or a [`virtio_devices::Block`] + /// device. + /// + /// # Arguments + /// - `disk_cfg`: The [`DiskConfig`] used to create the block device. + /// - `is_hotplug`: Whether the device is being hotplugged and the lock for the disk image + /// should be acquired right away. Locking will only happen for normal block devices, and not + /// vhost-user devices. fn make_virtio_block_device( &mut self, disk_cfg: &mut DiskConfig, + is_hotplug: bool, ) -> DeviceManagerResult { let id = if let Some(id) = &disk_cfg.id { id.clone() @@ -2360,6 +2407,9 @@ impl DeviceManager { info!("Creating virtio-block device: {:?}", disk_cfg); let (virtio_device, migratable_device) = if disk_cfg.vhost_user { + if is_hotplug { + log::debug!("Acquiring image lock for vhost-user block device not supported"); + } let socket = disk_cfg.vhost_socket.as_ref().unwrap().clone(); let vu_cfg = VhostUserConfig { socket, @@ -2516,31 +2566,43 @@ impl DeviceManager { BTreeMap::new() }; - let virtio_block = Arc::new(Mutex::new( - virtio_devices::Block::new( - id.clone(), - image, - disk_cfg - .path - .as_ref() - .ok_or(DeviceManagerError::NoDiskPath)? - .clone(), - disk_cfg.readonly, - self.force_iommu | disk_cfg.iommu, - disk_cfg.num_queues, - disk_cfg.queue_size, - disk_cfg.serial.clone(), - self.seccomp_action.clone(), - rate_limit_group, - self.exit_evt - .try_clone() - .map_err(DeviceManagerError::EventFd)?, - state_from_id(self.snapshot.as_ref(), id.as_str()) - .map_err(DeviceManagerError::RestoreGetState)?, - queue_affinity, - ) - .map_err(DeviceManagerError::CreateVirtioBlock)?, - )); + let mut virtio_block = virtio_devices::Block::new( + id.clone(), + image, + disk_cfg + .path + .as_ref() + .ok_or(DeviceManagerError::NoDiskPath)? + .clone(), + disk_cfg.readonly, + self.force_iommu | disk_cfg.iommu, + disk_cfg.num_queues, + disk_cfg.queue_size, + disk_cfg.serial.clone(), + self.seccomp_action.clone(), + rate_limit_group, + self.exit_evt + .try_clone() + .map_err(DeviceManagerError::EventFd)?, + state_from_id(self.snapshot.as_ref(), id.as_str()) + .map_err(DeviceManagerError::RestoreGetState)?, + queue_affinity, + ) + .map_err(DeviceManagerError::CreateVirtioBlock)?; + + // We lock the file here only for hotplugging. In normal operation, + // state save/resume, and live-migration, locking is part of the outer control flow + // to ensure proper order of (un)locking. + if is_hotplug { + log::debug!("Acquiring lock for hotplugged image"); + virtio_block + .try_lock_image() + .map_err(DeviceManagerError::DiskLockError)?; + } + + let virtio_block = Arc::new(Mutex::new(virtio_block)); + + self.block_devices.push(virtio_block.clone()); ( Arc::clone(&virtio_block) as Arc>, @@ -2571,7 +2633,7 @@ impl DeviceManager { let mut block_devices = self.config.lock().unwrap().disks.clone(); if let Some(disk_list_cfg) = &mut block_devices { for disk_cfg in disk_list_cfg.iter_mut() { - devices.push(self.make_virtio_block_device(disk_cfg)?); + devices.push(self.make_virtio_block_device(disk_cfg, false)?); } } self.config.lock().unwrap().disks = block_devices; @@ -4141,7 +4203,24 @@ impl DeviceManager { let device_tree = self.device_tree.lock().unwrap(); let node = device_tree .get(&id) - .ok_or(DeviceManagerError::UnknownDeviceId(id))?; + .ok_or(DeviceManagerError::UnknownDeviceId(id.clone()))?; + + // Release advisory locks by dropping all references. + // Linux automatically releases all locks of that file if the last open FD is closed. + { + let maybe_block_device_index = self + .block_devices + .iter() + .enumerate() + .find(|(_, dev)| { + let dev = dev.lock().unwrap(); + dev.id() == id + }) + .map(|(i, _)| i); + if let Some(index) = maybe_block_device_index { + let _ = self.block_devices.swap_remove(index); + } + } let pci_device_node = if node.pci_bdf.is_some() && node.pci_device_handle.is_some() { node @@ -4454,7 +4533,7 @@ impl DeviceManager { return Err(DeviceManagerError::InvalidIommuHotplug); } - let device = self.make_virtio_block_device(disk_cfg)?; + let device = self.make_virtio_block_device(disk_cfg, true)?; self.hotplug_virtio_pci_device(device) } @@ -4858,7 +4937,6 @@ impl Pausable for DeviceManager { migratable.lock().unwrap().resume()?; } } - Ok(()) } } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index e96b6e3fa..f7f31b8d3 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -1210,6 +1210,12 @@ impl Vmm { // Send last batch of dirty pages Self::vm_maybe_send_dirty_pages(vm, &mut socket)?; } + + // We release the locks early to enable locking them on the destination host. + // The VM is already stopped. + vm.release_disk_locks() + .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; + // Capture snapshot and send it let vm_snapshot = vm.snapshot()?; let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap(); @@ -1222,6 +1228,7 @@ impl Vmm { MigratableError::MigrateSend(anyhow!("Error during state migration")), )?; // Complete the migration + // At this step, the receiving VMM will acquire disk locks again. Request::complete().write_to(&mut socket)?; Response::read_from(&mut socket)?.ok_or_abandon( &mut socket, diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 6214c4fac..21dedfd18 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -332,6 +332,9 @@ pub enum Error { #[error("Error creating console devices")] CreateConsoleDevices(ConsoleDeviceError), + + #[error("Error locking the disk images")] + LockingError(DeviceManagerError), } pub type Result = result::Result; @@ -2234,6 +2237,14 @@ impl Vm { return self.resume().map_err(Error::Resume); } + // We acquire all advisory disk image locks here and not on device creation + // to enable live-migration without locking issues. + self.device_manager + .lock() + .unwrap() + .try_lock_disks() + .map_err(Error::LockingError)?; + let new_state = if self.stop_on_boot { VmState::BreakPoint } else { @@ -2369,6 +2380,13 @@ impl Vm { pub fn restore(&mut self) -> Result<()> { event!("vm", "restoring"); + // We acquire all advisory disk image locks again. + self.device_manager + .lock() + .unwrap() + .try_lock_disks() + .map_err(Error::LockingError)?; + // Now we can start all vCPUs from here. self.cpu_manager .lock() @@ -2482,6 +2500,21 @@ impl Vm { self.device_manager.lock().unwrap().device_tree() } + /// Release all advisory locks held for the disk images. + /// + /// This should only be called when the VM is stopped and the VMM supposed + /// to shut down. A new VMM, either after a live migration or a + /// state save/resume cycle, should then acquire all locks before the VM + /// starts to run. + pub fn release_disk_locks(&self) -> Result<()> { + self.device_manager + .lock() + .unwrap() + .release_disk_locks() + .map_err(Error::LockingError)?; + Ok(()) + } + pub fn activate_virtio_devices(&self) -> Result<()> { self.device_manager .lock()