block: advisory locks: use byte-range locks to match QEMU behavior

The lock granularity has significant implications in typical cloud deployments with network storage. The Linux kernel will sync advisory locks to network file systems, but these backends may have different policies and handle locks differently. For example, NetApp speaks an NFS API but will treat advisory OFD locks for the whole file as mandatory locks, whereas byte-range locks spanning the whole file will remain advisory [0]. We want to prevent multiple CHV instances from accessing the same disk, but disk management software (e.g., Cinder in OpenStack) should still be able to snapshot disks while VMs are running, so we need special control over the lock granularity. Therefore, this patch locks the whole byte range of a disk image without technically locking the whole file - to get the best of both worlds. This also brings CHV's behavior in line with QEMU [1]. Whole-file locks remain a valid use case and could be supported later. This patch only provides the necessary groundwork; making it configurable is out of scope for now. [0] https://kb.netapp.com/on-prem/ontap/da/NAS/NAS-KBs/How_is_Mandatory_Locking_supported_for_NFSv4_on_ONTAP_9 [1] <qemu>/util/osdep.c::qemu_lock_fcntl() Signed-off-by: Philipp Schuster <philipp.schuster@cyberus-technology.de> On-behalf-of: SAP philipp.schuster@sap.com
2025-10-21 10:02:09 +02:00 · 2025-10-21 10:02:09 +02:00 · f5d2973546
commit f5d2973546
parent 2be304b392
2 changed files with 87 additions and 14 deletions
--- a/block/src/fcntl.rs
+++ b/block/src/fcntl.rs
@ -101,13 +101,52 @@ impl LockState {
    }
 }

+/// The granularity of the advisory lock.
+///
+/// The granularity has significant implications in typical cloud deployments
+/// with network storage. The Linux kernel will sync advisory locks to network
+/// file systems, but these backends may have different policies and handle
+/// locks differently. For example, NetApp speaks an NFS API but will treat
+/// advisory OFD locks for the whole file as mandatory locks, whereas byte-range
+/// locks spanning the whole file will remain advisory [0].
+///
+/// As it is a valid use case to prevent multiple CHV instances from accessing
+/// the same disk but disk management software (e.g., Cinder in OpenStack)
+/// should be able to snapshot disks while VMs are running, we need special
+/// control over the lock granularity. Therefore, it is a valid use case to lock
+/// the whole byte range of a disk image without technically locking the whole
+/// file - to get the best of both worlds.
+///
+/// [0] https://kb.netapp.com/on-prem/ontap/da/NAS/NAS-KBs/How_is_Mandatory_Locking_supported_for_NFSv4_on_ONTAP_9
+#[derive(Clone, Copy, Debug)]
+pub enum LockGranularity {
+    WholeFile,
+    ByteRange(u64 /* from, inclusive */, u64 /* len */),
+}
+
+impl LockGranularity {
+    const fn l_start(self) -> u64 {
+        match self {
+            LockGranularity::WholeFile => 0,
+            LockGranularity::ByteRange(start, _) => start,
+        }
+    }
+
+    const fn l_len(self) -> u64 {
+        match self {
+            LockGranularity::WholeFile => 0, /* EOF */
+            LockGranularity::ByteRange(_, len) => len,
+        }
+    }
+}
+
 /// Returns a [`struct@libc::flock`] structure for the given [`LockGranularity`].
-const fn get_flock(lock_type: LockType) -> libc::flock {
+const fn get_flock(lock_type: LockType, granularity: LockGranularity) -> libc::flock {
    libc::flock {
        l_type: lock_type.to_libc_val() as libc::c_short,
        l_whence: libc::SEEK_SET as libc::c_short,
-        l_start: 0,
-        l_len: 0, /* EOF */
+        l_start: granularity.l_start() as libc::c_long,
+        l_len: granularity.l_len() as libc::c_long,
        l_pid: 0, /* filled by callee */
    }
 }
@ -122,8 +161,13 @@ const fn get_flock(lock_type: LockType) -> libc::flock {
 /// - `file`: The file to acquire a lock for [`LockType`]. The file's state will
 ///   be logically mutated, but not technically.
 /// - `lock_type`: The [`LockType`]
-pub fn try_acquire_lock<Fd: AsRawFd>(file: Fd, lock_type: LockType) -> Result<(), LockError> {
-    let flock = get_flock(lock_type);
+/// - `granularity`: The [`LockGranularity`].
+pub fn try_acquire_lock<Fd: AsRawFd>(
+    file: Fd,
+    lock_type: LockType,
+    granularity: LockGranularity,
+) -> Result<(), LockError> {
+    let flock = get_flock(lock_type, granularity);

    let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_SETLK(&flock));
    match res {
@ -146,8 +190,9 @@ pub fn try_acquire_lock<Fd: AsRawFd>(file: Fd, lock_type: LockType) -> Result<()
 ///
 /// # Parameters
 /// - `file`: The file to clear all locks for [`LockType`].
-pub fn clear_lock<Fd: AsRawFd>(file: Fd) -> Result<(), LockError> {
-    try_acquire_lock(file, LockType::Unlock)
+/// - `granularity`: The [`LockGranularity`].
+pub fn clear_lock<Fd: AsRawFd>(file: Fd, granularity: LockGranularity) -> Result<(), LockError> {
+    try_acquire_lock(file, LockType::Unlock, granularity)
 }

 /// Returns the current lock state using [`fcntl`] with respect to the given
@ -155,8 +200,12 @@ pub fn clear_lock<Fd: AsRawFd>(file: Fd) -> Result<(), LockError> {
 ///
 /// # Parameters
 /// - `file`: The file for which to get the lock state.
-pub fn get_lock_state<Fd: AsRawFd>(file: Fd) -> Result<LockState, LockError> {
-    let mut flock = get_flock(LockType::Write);
+/// - `granularity`: The [`LockGranularity`].
+pub fn get_lock_state<Fd: AsRawFd>(
+    file: Fd,
+    granularity: LockGranularity,
+) -> Result<LockState, LockError> {
+    let mut flock = get_flock(LockType::Write, granularity);
    let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_GETLK(&mut flock));
    match res {
        0 => {
--- a/virtio-devices/src/block.rs
+++ b/virtio-devices/src/block.rs
@ -19,7 +19,7 @@ use std::{io, result};

 use anyhow::anyhow;
 use block::async_io::{AsyncIo, AsyncIoError, DiskFile};
-use block::fcntl::{LockError, LockType, get_lock_state};
+use block::fcntl::{LockError, LockGranularity, LockType, get_lock_state};
 use block::{
    ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl,
 };
@ -767,20 +767,42 @@ impl Block {
        has_feature(self.features(), VIRTIO_BLK_F_RO.into())
    }

+    /// Returns the granularity for the advisory lock for this disk.
+    // TODO: In the future, we could add a `lock_granularity=` configuration to the CLI.
+    // For now, we stick to QEMU behavior.
+    fn lock_granularity(&mut self) -> LockGranularity {
+        let fallback = LockGranularity::WholeFile;
+
+        self.disk_image
+            .size()
+            .map(|size| LockGranularity::ByteRange(0, size))
+            // use a safe fallback
+            .unwrap_or_else(|e| {
+                log::warn!(
+                    "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}",
+                    self.id,
+                    self.disk_path.display(),
+                    fallback
+                );
+                fallback
+            })
+    }
+
    /// Tries to set an advisory lock for the corresponding disk image.
    pub fn try_lock_image(&mut self) -> Result<()> {
        let lock_type = match self.read_only() {
            true => LockType::Read,
            false => LockType::Write,
        };
+        let granularity = self.lock_granularity();
        log::debug!(
-            "Attempting to acquire {lock_type:?} lock for disk image id={},path={}",
+            "Attempting to acquire {lock_type:?} lock for disk image: id={},path={},granularity={granularity:?}",
            self.id,
            self.disk_path.display()
        );
        let fd = self.disk_image.fd();
-        fcntl::try_acquire_lock(fd, lock_type).map_err(|error| {
-            let current_lock = get_lock_state(fd);
+        fcntl::try_acquire_lock(fd, lock_type, granularity).map_err(|error| {
+            let current_lock = get_lock_state(fd, granularity);
            // Don't propagate the error to the outside, as it is not useful at all. Instead,
            // we try to log additional help to the user.
            if let Ok(current_lock) = current_lock {
@ -804,10 +826,12 @@ impl Block {

    /// Releases the advisory lock held for the corresponding disk image.
    pub fn unlock_image(&mut self) -> Result<()> {
+        let granularity = self.lock_granularity();
+
        // It is very unlikely that this fails;
        // Should we remove the Result to simplify the error propagation on
        // higher levels?
-        fcntl::clear_lock(self.disk_image.fd()).map_err(|error| Error::LockDiskImage {
+        fcntl::clear_lock(self.disk_image.fd(), granularity).map_err(|error| Error::LockDiskImage {
            path: self.disk_path.clone(),
            error,
            lock_type: LockType::Unlock,