block: introduce advisory locks for disk image files
# What
This commit introduces file-based advisory locking for the files backing
up the block devices by using the fcntl() syscall with OFD locks. The
per-open-file-descriptor (OFD) locks are more robust than traditional
POSIX locks (F_SETLK) as they are not tied to process IDs and avoid
common issues in multithreaded or multi-fd scenarios [1]. Therefore,
we don't use `std::fs::File::try_lock()`, which is backed by F_SETLKW.
The locking mechanism is aware of the `readonly` property and allows
`n` readers or `1` writer (exclusive mode).
As the locks are advisory, multiple cloud-hypervisor processes can
prevent themselves from writing to the same file. However, this is not
a system-wide file-system level locking mechanism preventing to open()
a file.
The introduced new locking mechanism does not cover vhost-user devices.
# Why
To prevent misconfiguration and improve safety, it is good practice to
protect disk image files with a locking mechanism. Experience and common
best practices suggest that advisory locks are preferable over mandatory
locks due to better compatibility and fewer pitfalls (in fs space).
The introduced functionality is aligned with the approach taken by
QEMU [0], and is also recommended in [1].
# Implementation Details
We need to ensure that not only normal operation keeps working but also
state save/resume and live-migration. Especially for live migration,
it is crucial that the sender VMM releases the locks when the VM stops
so the receiver VMM can acquire them right after that.
Therefore, the locking and releasing happen directly on the block
device struct. The device manager knows all block devices and can
forward requests to these types.
Last but not least, this commit uses on explicit lock acquiring
but implicit lock releasing (FD close). It only explicitly releases
the locks where this integrates more smoothly into the existing
code.
# Testing
I tested
- normal operation
- state save/resume,
- device hot plugging,
- and live-migration
with read/shared and write/exclusive locks.
One can use the `fcntl-tool` to test if locks are actually acquired
or released [2].
# Links
[0] 825b96dbce/util/osdep.c (L266)
[1] https://apenwarr.ca/log/20101213
[2] https://crates.io/crates/fcntl-tool
Signed-off-by: Philipp Schuster <philipp.schuster@cyberus-technology.de>
On-behalf-of: SAP philipp.schuster@sap.com
This commit is contained in:
parent
71a36e0c69
commit
05968f5c2c
5 changed files with 210 additions and 31 deletions
|
|
@ -19,7 +19,8 @@ use std::{io, result};
|
|||
|
||||
use anyhow::anyhow;
|
||||
use block::async_io::{AsyncIo, AsyncIoError, DiskFile};
|
||||
use block::{build_serial, Request, RequestType, VirtioBlockConfig};
|
||||
use block::fcntl::{get_lock_state, LockError, LockType};
|
||||
use block::{build_serial, fcntl, Request, RequestType, VirtioBlockConfig};
|
||||
use rate_limiter::group::{RateLimiterGroup, RateLimiterGroupHandle};
|
||||
use rate_limiter::TokenType;
|
||||
use seccompiler::SeccompAction;
|
||||
|
|
@ -80,6 +81,16 @@ pub enum Error {
|
|||
RequestStatus(GuestMemoryError),
|
||||
#[error("Failed to enable notification: {0}")]
|
||||
QueueEnableNotification(virtio_queue::Error),
|
||||
#[error("Failed to get {lock_type:?} lock for disk image {path}: {error}")]
|
||||
LockDiskImage {
|
||||
/// The underlying error.
|
||||
#[source]
|
||||
error: LockError,
|
||||
/// The requested lock type.
|
||||
lock_type: LockType,
|
||||
/// The path of the disk image.
|
||||
path: PathBuf,
|
||||
},
|
||||
}
|
||||
|
||||
pub type Result<T> = result::Result<T, Error>;
|
||||
|
|
@ -703,6 +714,53 @@ impl Block {
|
|||
})
|
||||
}
|
||||
|
||||
/// Tries to set an advisory lock for the corresponding disk image.
|
||||
pub fn try_lock_image(&mut self) -> Result<()> {
|
||||
let lock_type = match self.read_only {
|
||||
true => LockType::Read,
|
||||
false => LockType::Write,
|
||||
};
|
||||
log::debug!(
|
||||
"Attempting to acquire {lock_type:?} lock for disk image id={},path={}",
|
||||
self.id,
|
||||
self.disk_path.display()
|
||||
);
|
||||
let fd = self.disk_image.fd();
|
||||
fcntl::try_acquire_lock(fd, lock_type).map_err(|error| {
|
||||
let current_lock = get_lock_state(fd);
|
||||
// Don't propagate the error to the outside, as it is not useful at all. Instead,
|
||||
// we try to log additional help to the user.
|
||||
if let Ok(current_lock) = current_lock {
|
||||
log::error!("Can't get {lock_type:?} lock for {} as there is already a {current_lock:?} lock", self.disk_path.display());
|
||||
} else {
|
||||
log::error!("Can't get {lock_type:?} lock for {}, but also can't determine the current lock state", self.disk_path.display());
|
||||
}
|
||||
Error::LockDiskImage {
|
||||
path: self.disk_path.clone(),
|
||||
error,
|
||||
lock_type,
|
||||
}
|
||||
})?;
|
||||
log::info!(
|
||||
"Acquired {lock_type:?} lock for disk image id={},path={}",
|
||||
self.id,
|
||||
self.disk_path.display()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Releases the advisory lock held for the corresponding disk image.
|
||||
pub fn unlock_image(&mut self) -> Result<()> {
|
||||
// It is very unlikely that this fails;
|
||||
// Should we remove the Result to simplify the error propagation on
|
||||
// higher levels?
|
||||
fcntl::clear_lock(self.disk_image.fd()).map_err(|error| Error::LockDiskImage {
|
||||
path: self.disk_path.clone(),
|
||||
error,
|
||||
lock_type: LockType::Unlock,
|
||||
})
|
||||
}
|
||||
|
||||
fn state(&self) -> BlockState {
|
||||
BlockState {
|
||||
disk_path: self.disk_path.to_str().unwrap().to_owned(),
|
||||
|
|
|
|||
|
|
@ -48,6 +48,9 @@ pub enum MigratableError {
|
|||
|
||||
#[error("Failed to complete migration for migratable component: {0}")]
|
||||
CompleteMigration(#[source] anyhow::Error),
|
||||
|
||||
#[error("Failed to release a disk lock before the migration: {0}")]
|
||||
UnlockError(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
/// A Pausable component can be paused and resumed.
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd};
|
|||
use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport};
|
||||
use virtio_devices::vhost_user::VhostUserConfig;
|
||||
use virtio_devices::{
|
||||
AccessPlatformMapping, ActivateError, Endpoint, IommuMapping, VdpaDmaMapping,
|
||||
AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, VdpaDmaMapping,
|
||||
VirtioMemMappingSource,
|
||||
};
|
||||
use vm_allocator::{AddressAllocator, SystemAllocator};
|
||||
|
|
@ -517,6 +517,9 @@ pub enum DeviceManagerError {
|
|||
|
||||
// Invalid console fd
|
||||
InvalidConsoleFd,
|
||||
|
||||
/// Cannot lock images of all block devices.
|
||||
DiskLockError(virtio_devices::block::Error),
|
||||
}
|
||||
|
||||
pub type DeviceManagerResult<T> = result::Result<T, DeviceManagerError>;
|
||||
|
|
@ -830,6 +833,9 @@ pub struct DeviceManager {
|
|||
// The virtio devices on the system
|
||||
virtio_devices: Vec<MetaVirtioDevice>,
|
||||
|
||||
/// All disks. Needed for locking and unlocking the images.
|
||||
block_devices: Vec<Arc<Mutex<Block>>>,
|
||||
|
||||
// List of bus devices
|
||||
// Let the DeviceManager keep strong references to the BusDevice devices.
|
||||
// This allows the IO and MMIO buses to be provided with Weak references,
|
||||
|
|
@ -1158,6 +1164,7 @@ impl DeviceManager {
|
|||
memory_manager,
|
||||
cpu_manager,
|
||||
virtio_devices: Vec::new(),
|
||||
block_devices: vec![],
|
||||
bus_devices: Vec::new(),
|
||||
device_id_cnt,
|
||||
msi_interrupt_manager,
|
||||
|
|
@ -2292,6 +2299,35 @@ impl DeviceManager {
|
|||
Ok(tpm)
|
||||
}
|
||||
|
||||
/// Tries to acquire advisory locks for all disk images.
|
||||
///
|
||||
/// This should only be called when a VM boots or VM state is restored.
|
||||
/// For live-migration, the locks must be released on the destination side
|
||||
/// before they are acquired again by the receiving side.
|
||||
pub fn try_lock_disks(&self) -> DeviceManagerResult<()> {
|
||||
for dev in &self.block_devices {
|
||||
let mut dev = dev.lock().unwrap();
|
||||
dev.try_lock_image()
|
||||
.map_err(DeviceManagerError::DiskLockError)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Release all advisory locks held for the disk images.
|
||||
///
|
||||
/// This should only be called when the VM is stopped and the VMM supposed
|
||||
/// to shut down. A new VMM, either after a live migration or a
|
||||
/// state save/resume cycle, should then acquire all locks before the VM
|
||||
/// starts to run.
|
||||
pub fn release_disk_locks(&self) -> DeviceManagerResult<()> {
|
||||
for dev in &self.block_devices {
|
||||
let mut dev = dev.lock().unwrap();
|
||||
dev.unlock_image()
|
||||
.map_err(DeviceManagerError::DiskLockError)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_virtio_devices(&mut self) -> DeviceManagerResult<Vec<MetaVirtioDevice>> {
|
||||
let mut devices: Vec<MetaVirtioDevice> = Vec::new();
|
||||
|
||||
|
|
@ -2345,9 +2381,20 @@ impl DeviceManager {
|
|||
supported
|
||||
}
|
||||
|
||||
/// Creates a [`MetaVirtioDevice`] from the provided [`DiskConfig`].
|
||||
///
|
||||
/// Depending on the config, this is a [`vhost_user::Blk`] device or a [`virtio_devices::Block`]
|
||||
/// device.
|
||||
///
|
||||
/// # Arguments
|
||||
/// - `disk_cfg`: The [`DiskConfig`] used to create the block device.
|
||||
/// - `is_hotplug`: Whether the device is being hotplugged and the lock for the disk image
|
||||
/// should be acquired right away. Locking will only happen for normal block devices, and not
|
||||
/// vhost-user devices.
|
||||
fn make_virtio_block_device(
|
||||
&mut self,
|
||||
disk_cfg: &mut DiskConfig,
|
||||
is_hotplug: bool,
|
||||
) -> DeviceManagerResult<MetaVirtioDevice> {
|
||||
let id = if let Some(id) = &disk_cfg.id {
|
||||
id.clone()
|
||||
|
|
@ -2360,6 +2407,9 @@ impl DeviceManager {
|
|||
info!("Creating virtio-block device: {:?}", disk_cfg);
|
||||
|
||||
let (virtio_device, migratable_device) = if disk_cfg.vhost_user {
|
||||
if is_hotplug {
|
||||
log::debug!("Acquiring image lock for vhost-user block device not supported");
|
||||
}
|
||||
let socket = disk_cfg.vhost_socket.as_ref().unwrap().clone();
|
||||
let vu_cfg = VhostUserConfig {
|
||||
socket,
|
||||
|
|
@ -2516,31 +2566,43 @@ impl DeviceManager {
|
|||
BTreeMap::new()
|
||||
};
|
||||
|
||||
let virtio_block = Arc::new(Mutex::new(
|
||||
virtio_devices::Block::new(
|
||||
id.clone(),
|
||||
image,
|
||||
disk_cfg
|
||||
.path
|
||||
.as_ref()
|
||||
.ok_or(DeviceManagerError::NoDiskPath)?
|
||||
.clone(),
|
||||
disk_cfg.readonly,
|
||||
self.force_iommu | disk_cfg.iommu,
|
||||
disk_cfg.num_queues,
|
||||
disk_cfg.queue_size,
|
||||
disk_cfg.serial.clone(),
|
||||
self.seccomp_action.clone(),
|
||||
rate_limit_group,
|
||||
self.exit_evt
|
||||
.try_clone()
|
||||
.map_err(DeviceManagerError::EventFd)?,
|
||||
state_from_id(self.snapshot.as_ref(), id.as_str())
|
||||
.map_err(DeviceManagerError::RestoreGetState)?,
|
||||
queue_affinity,
|
||||
)
|
||||
.map_err(DeviceManagerError::CreateVirtioBlock)?,
|
||||
));
|
||||
let mut virtio_block = virtio_devices::Block::new(
|
||||
id.clone(),
|
||||
image,
|
||||
disk_cfg
|
||||
.path
|
||||
.as_ref()
|
||||
.ok_or(DeviceManagerError::NoDiskPath)?
|
||||
.clone(),
|
||||
disk_cfg.readonly,
|
||||
self.force_iommu | disk_cfg.iommu,
|
||||
disk_cfg.num_queues,
|
||||
disk_cfg.queue_size,
|
||||
disk_cfg.serial.clone(),
|
||||
self.seccomp_action.clone(),
|
||||
rate_limit_group,
|
||||
self.exit_evt
|
||||
.try_clone()
|
||||
.map_err(DeviceManagerError::EventFd)?,
|
||||
state_from_id(self.snapshot.as_ref(), id.as_str())
|
||||
.map_err(DeviceManagerError::RestoreGetState)?,
|
||||
queue_affinity,
|
||||
)
|
||||
.map_err(DeviceManagerError::CreateVirtioBlock)?;
|
||||
|
||||
// We lock the file here only for hotplugging. In normal operation,
|
||||
// state save/resume, and live-migration, locking is part of the outer control flow
|
||||
// to ensure proper order of (un)locking.
|
||||
if is_hotplug {
|
||||
log::debug!("Acquiring lock for hotplugged image");
|
||||
virtio_block
|
||||
.try_lock_image()
|
||||
.map_err(DeviceManagerError::DiskLockError)?;
|
||||
}
|
||||
|
||||
let virtio_block = Arc::new(Mutex::new(virtio_block));
|
||||
|
||||
self.block_devices.push(virtio_block.clone());
|
||||
|
||||
(
|
||||
Arc::clone(&virtio_block) as Arc<Mutex<dyn virtio_devices::VirtioDevice>>,
|
||||
|
|
@ -2571,7 +2633,7 @@ impl DeviceManager {
|
|||
let mut block_devices = self.config.lock().unwrap().disks.clone();
|
||||
if let Some(disk_list_cfg) = &mut block_devices {
|
||||
for disk_cfg in disk_list_cfg.iter_mut() {
|
||||
devices.push(self.make_virtio_block_device(disk_cfg)?);
|
||||
devices.push(self.make_virtio_block_device(disk_cfg, false)?);
|
||||
}
|
||||
}
|
||||
self.config.lock().unwrap().disks = block_devices;
|
||||
|
|
@ -4141,7 +4203,24 @@ impl DeviceManager {
|
|||
let device_tree = self.device_tree.lock().unwrap();
|
||||
let node = device_tree
|
||||
.get(&id)
|
||||
.ok_or(DeviceManagerError::UnknownDeviceId(id))?;
|
||||
.ok_or(DeviceManagerError::UnknownDeviceId(id.clone()))?;
|
||||
|
||||
// Release advisory locks by dropping all references.
|
||||
// Linux automatically releases all locks of that file if the last open FD is closed.
|
||||
{
|
||||
let maybe_block_device_index = self
|
||||
.block_devices
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find(|(_, dev)| {
|
||||
let dev = dev.lock().unwrap();
|
||||
dev.id() == id
|
||||
})
|
||||
.map(|(i, _)| i);
|
||||
if let Some(index) = maybe_block_device_index {
|
||||
let _ = self.block_devices.swap_remove(index);
|
||||
}
|
||||
}
|
||||
|
||||
let pci_device_node = if node.pci_bdf.is_some() && node.pci_device_handle.is_some() {
|
||||
node
|
||||
|
|
@ -4454,7 +4533,7 @@ impl DeviceManager {
|
|||
return Err(DeviceManagerError::InvalidIommuHotplug);
|
||||
}
|
||||
|
||||
let device = self.make_virtio_block_device(disk_cfg)?;
|
||||
let device = self.make_virtio_block_device(disk_cfg, true)?;
|
||||
self.hotplug_virtio_pci_device(device)
|
||||
}
|
||||
|
||||
|
|
@ -4858,7 +4937,6 @@ impl Pausable for DeviceManager {
|
|||
migratable.lock().unwrap().resume()?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1210,6 +1210,12 @@ impl Vmm {
|
|||
// Send last batch of dirty pages
|
||||
Self::vm_maybe_send_dirty_pages(vm, &mut socket)?;
|
||||
}
|
||||
|
||||
// We release the locks early to enable locking them on the destination host.
|
||||
// The VM is already stopped.
|
||||
vm.release_disk_locks()
|
||||
.map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?;
|
||||
|
||||
// Capture snapshot and send it
|
||||
let vm_snapshot = vm.snapshot()?;
|
||||
let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap();
|
||||
|
|
@ -1222,6 +1228,7 @@ impl Vmm {
|
|||
MigratableError::MigrateSend(anyhow!("Error during state migration")),
|
||||
)?;
|
||||
// Complete the migration
|
||||
// At this step, the receiving VMM will acquire disk locks again.
|
||||
Request::complete().write_to(&mut socket)?;
|
||||
Response::read_from(&mut socket)?.ok_or_abandon(
|
||||
&mut socket,
|
||||
|
|
|
|||
|
|
@ -332,6 +332,9 @@ pub enum Error {
|
|||
|
||||
#[error("Error creating console devices")]
|
||||
CreateConsoleDevices(ConsoleDeviceError),
|
||||
|
||||
#[error("Error locking the disk images")]
|
||||
LockingError(DeviceManagerError),
|
||||
}
|
||||
pub type Result<T> = result::Result<T, Error>;
|
||||
|
||||
|
|
@ -2234,6 +2237,14 @@ impl Vm {
|
|||
return self.resume().map_err(Error::Resume);
|
||||
}
|
||||
|
||||
// We acquire all advisory disk image locks here and not on device creation
|
||||
// to enable live-migration without locking issues.
|
||||
self.device_manager
|
||||
.lock()
|
||||
.unwrap()
|
||||
.try_lock_disks()
|
||||
.map_err(Error::LockingError)?;
|
||||
|
||||
let new_state = if self.stop_on_boot {
|
||||
VmState::BreakPoint
|
||||
} else {
|
||||
|
|
@ -2369,6 +2380,13 @@ impl Vm {
|
|||
pub fn restore(&mut self) -> Result<()> {
|
||||
event!("vm", "restoring");
|
||||
|
||||
// We acquire all advisory disk image locks again.
|
||||
self.device_manager
|
||||
.lock()
|
||||
.unwrap()
|
||||
.try_lock_disks()
|
||||
.map_err(Error::LockingError)?;
|
||||
|
||||
// Now we can start all vCPUs from here.
|
||||
self.cpu_manager
|
||||
.lock()
|
||||
|
|
@ -2482,6 +2500,21 @@ impl Vm {
|
|||
self.device_manager.lock().unwrap().device_tree()
|
||||
}
|
||||
|
||||
/// Release all advisory locks held for the disk images.
|
||||
///
|
||||
/// This should only be called when the VM is stopped and the VMM supposed
|
||||
/// to shut down. A new VMM, either after a live migration or a
|
||||
/// state save/resume cycle, should then acquire all locks before the VM
|
||||
/// starts to run.
|
||||
pub fn release_disk_locks(&self) -> Result<()> {
|
||||
self.device_manager
|
||||
.lock()
|
||||
.unwrap()
|
||||
.release_disk_locks()
|
||||
.map_err(Error::LockingError)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn activate_virtio_devices(&self) -> Result<()> {
|
||||
self.device_manager
|
||||
.lock()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue