From bf6f0f8352f9e414d008eef66ce86338ba310f83 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 9 Feb 2026 03:29:10 +0000 Subject: [PATCH] virtio-devices: vsock: Accept multi-descriptor TX packets Since kernel commit 6693731487a8 ("vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers"), a large vsock packet can be split into multiple descriptors. If we encounter such TX packets, pull the content into an owned buffer. Fixes: #7672 Signed-off-by: Wei Liu --- virtio-devices/src/vsock/packet.rs | 212 +++++++++++++++++++++-------- 1 file changed, 156 insertions(+), 56 deletions(-) diff --git a/virtio-devices/src/vsock/packet.rs b/virtio-devices/src/vsock/packet.rs index 65d41b06b..57218a5b8 100644 --- a/virtio-devices/src/vsock/packet.rs +++ b/virtio-devices/src/vsock/packet.rs @@ -3,14 +3,13 @@ // //! `VsockPacket` provides a thin wrapper over the buffers exchanged via virtio queues. -//! There are two components to a vsock packet, each using its own descriptor in a -//! virtio queue: +//! There are two components to a vsock packet, each described by a virtio descriptor chain: //! - the packet header; and //! - the packet data/buffer. //! //! There is a 1:1 relation between descriptor chains and packets: the first (chain head) holds -//! the header, and an optional second descriptor holds the data. The second descriptor is only -//! present for data packets (VSOCK_OP_RW). +//! the header, and the remaining descriptors (if any) hold the data. The data descriptors are +//! only present for data packets (VSOCK_OP_RW). //! //! `VsockPacket` wraps these two buffers and provides direct access to the data stored //! in guest memory. This is done to avoid unnecessarily copying data from guest memory @@ -91,17 +90,26 @@ const HDROFF_BUF_ALLOC: usize = 36; // we have successfully written to a backing Unix socket. const HDROFF_FWD_CNT: usize = 40; +/// The packet data buffer, which may be either: +/// - a borrowed slice of guest memory, if the packet data is stored in one contiguous buffer +/// described by a single virtq descriptor; +/// - an owned, linear buffer, if the packet data is stored in multiple buffers described by +/// multiple virtq descriptors. +enum PacketBuffer { + Borrowed { ptr: *mut u8, len: usize }, + Owned(Box<[u8]>), +} + /// The vsock packet, implemented as a wrapper over a virtq descriptor chain: /// - the chain head, holding the packet header; and -/// - (an optional) data/buffer descriptor, only present for data packets (VSOCK_OP_RW). +/// - (optional) buffer, only present for data packets (VSOCK_OP_RW). /// pub struct VsockPacket { // We still hold the header address in guest memory. We need to write back the modified // header in RX buffers. guest_hdr_addr: GuestAddress, hdr: [u8; VSOCK_PKT_HDR_SIZE], - buf: Option<*mut u8>, - buf_size: usize, + buf: Option, } impl VsockPacket { @@ -150,7 +158,6 @@ impl VsockPacket { guest_hdr_addr, hdr, buf: None, - buf_size: 0, }; // No point looking for a data/buffer descriptor, if the packet is zero-length. @@ -164,44 +171,85 @@ impl VsockPacket { return Err(VsockError::InvalidPktLen(pkt.len())); } - // Prior to Linux v6.3 there are two descriptors - if head.has_next() { - let buf_desc = desc_chain.next().ok_or(VsockError::BufDescMissing)?; + // For small packets, the data may be stored in the same descriptor as the header. + if !head.has_next() { + let buf_size: usize = head.len() as usize - VSOCK_PKT_HDR_SIZE; + let buf_ptr = get_host_address_range( + desc_chain.memory(), + head.addr() + .checked_add(VSOCK_PKT_HDR_SIZE as u64) + .unwrap() + .translate_gva(access_platform, buf_size), + buf_size, + ) + .ok_or(VsockError::GuestMemory)?; + pkt.buf = Some(PacketBuffer::Borrowed { + ptr: buf_ptr, + len: buf_size, + }); - // TX data should be read-only. - if buf_desc.is_write_only() { - return Err(VsockError::UnreadableDescriptor); + return Ok(pkt); + } + + // We have separate header and data descriptors. + let buf_desc = desc_chain.next().ok_or(VsockError::BufDescMissing)?; + + // TX data should be read-only. + if buf_desc.is_write_only() { + return Err(VsockError::UnreadableDescriptor); + } + + if buf_desc.has_next() { + // Multiple data descriptors -- copy into a linear buffer. + let total_len = pkt.len() as usize; + let mut owned = vec![0u8; total_len]; + let mut offset = 0usize; + let mut cur_desc = Some(buf_desc); + + while let Some(desc) = cur_desc { + if desc.is_write_only() { + return Err(VsockError::UnreadableDescriptor); + } + + let desc_len = desc.len() as usize; + if desc_len > 0 && offset < total_len { + let to_copy = std::cmp::min(desc_len, total_len - offset); + let desc_addr = desc.addr().translate_gva(access_platform, desc_len); + desc_chain + .memory() + .read_slice(&mut owned[offset..offset + to_copy], desc_addr) + .map_err(|_| VsockError::GuestMemory)?; + offset += to_copy; + } + + cur_desc = if desc.has_next() { + Some(desc_chain.next().ok_or(VsockError::BufDescMissing)?) + } else { + None + }; } + if offset < total_len { + return Err(VsockError::BufDescTooSmall); + } + pkt.buf = Some(PacketBuffer::Owned(owned.into_boxed_slice())); + } else { // The data buffer should be large enough to fit the size of the data, as described by // the header descriptor. if buf_desc.len() < pkt.len() { return Err(VsockError::BufDescTooSmall); } let buf_size = buf_desc.len() as usize; - pkt.buf_size = buf_size; - pkt.buf = Some( - get_host_address_range( - desc_chain.memory(), - buf_desc.addr().translate_gva(access_platform, buf_size), - pkt.buf_size, - ) - .ok_or(VsockError::GuestMemory)?, - ); - } else { - let buf_size: usize = head.len() as usize - VSOCK_PKT_HDR_SIZE; - pkt.buf_size = buf_size; - pkt.buf = Some( - get_host_address_range( - desc_chain.memory(), - head.addr() - .checked_add(VSOCK_PKT_HDR_SIZE as u64) - .unwrap() - .translate_gva(access_platform, buf_size), - buf_size, - ) - .ok_or(VsockError::GuestMemory)?, - ); + let buf_ptr = get_host_address_range( + desc_chain.memory(), + buf_desc.addr().translate_gva(access_platform, buf_size), + buf_size, + ) + .ok_or(VsockError::GuestMemory)?; + pkt.buf = Some(PacketBuffer::Borrowed { + ptr: buf_ptr, + len: buf_size, + }); } Ok(pkt) @@ -252,26 +300,33 @@ impl VsockPacket { let buf_desc = desc_chain.next().ok_or(VsockError::BufDescMissing)?; let buf_size = buf_desc.len() as usize; + // TODO: We still assume that there are at most two descriptors. We should probably + // support multi-descriptor RX packets as well, like we do for TX. This means we should + // add a function to commit the owned buffer back to guest memory. + if buf_desc.has_next() { + return Err(VsockError::BufDescTooSmall); + } + Ok(Self { guest_hdr_addr, hdr, - buf: Some( - get_host_address_range( + buf: Some(PacketBuffer::Borrowed { + ptr: get_host_address_range( desc_chain.memory(), buf_desc.addr().translate_gva(access_platform, buf_size), buf_size, ) .ok_or(VsockError::GuestMemory)?, - ), - buf_size, + len: buf_size, + }), }) } else { let buf_size: usize = head.len() as usize - VSOCK_PKT_HDR_SIZE; Ok(Self { guest_hdr_addr, hdr, - buf: Some( - get_host_address_range( + buf: Some(PacketBuffer::Borrowed { + ptr: get_host_address_range( desc_chain.memory(), head.addr() .checked_add(VSOCK_PKT_HDR_SIZE as u64) @@ -280,8 +335,8 @@ impl VsockPacket { buf_size, ) .ok_or(VsockError::GuestMemory)?, - ), - buf_size, + len: buf_size, + }), }) } } @@ -320,11 +375,14 @@ impl VsockPacket { /// (and often is) larger than the length of the packet data. The packet data length /// is stored in the packet header, and accessible via `VsockPacket::len()`. pub fn buf(&self) -> Option<&[u8]> { - self.buf.map(|ptr| { - // SAFETY: bound checks have already been performed when creating the packet - // from the virtq descriptor. - unsafe { std::slice::from_raw_parts(ptr as *const u8, self.buf_size) } - }) + match self.buf.as_ref()? { + PacketBuffer::Owned(owned) => Some(owned), + PacketBuffer::Borrowed { ptr, len } => { + // SAFETY: bound checks have already been performed when creating the packet + // from the virtq descriptor. + Some(unsafe { std::slice::from_raw_parts(*ptr as *const u8, *len) }) + } + } } /// Provides in-place, byte-slice, mutable access to the vsock packet data buffer. @@ -335,11 +393,14 @@ impl VsockPacket { /// (and often is) larger than the length of the packet data. The packet data length /// is stored in the packet header, and accessible via `VsockPacket::len()`. pub fn buf_mut(&mut self) -> Option<&mut [u8]> { - self.buf.map(|ptr| { - // SAFETY: bound checks have already been performed when creating the packet - // from the virtq descriptor. - unsafe { std::slice::from_raw_parts_mut(ptr, self.buf_size) } - }) + match self.buf.as_mut()? { + PacketBuffer::Owned(owned) => Some(owned), + PacketBuffer::Borrowed { ptr, len } => { + // SAFETY: bound checks have already been performed when creating the packet + // from the virtq descriptor. + Some(unsafe { std::slice::from_raw_parts_mut(*ptr, *len) }) + } + } } pub fn src_cid(&self) -> u64 { @@ -447,7 +508,7 @@ mod unit_tests { use virtio_bindings::virtio_ring::VRING_DESC_F_WRITE; use virtio_queue::QueueOwnedT; use vm_memory::GuestAddress; - use vm_virtio::queue::testing::VirtqDesc as GuestQDesc; + use vm_virtio::queue::testing::{VirtQueue as GuestQ, VirtqDesc as GuestQDesc}; use super::super::unit_tests::TestContext; use super::*; @@ -583,6 +644,45 @@ mod unit_tests { } } + #[test] + fn test_tx_packet_assembly_multi_desc() { + const QSIZE: u16 = 4; + let test_ctx = TestContext::new(); + let guest_txvq = GuestQ::new(GuestAddress(0x0060_0000), &test_ctx.mem, QSIZE); + let mut queue = guest_txvq.create_queue(); + + guest_txvq.dtable[0].set( + 0x0061_0000, + VSOCK_PKT_HDR_SIZE as u32, + virtio_bindings::virtio_ring::VRING_DESC_F_NEXT + .try_into() + .unwrap(), + 1, + ); + guest_txvq.dtable[1].set( + 0x0061_1000, + 4 * 1024, + virtio_bindings::virtio_ring::VRING_DESC_F_NEXT + .try_into() + .unwrap(), + 2, + ); + guest_txvq.dtable[2].set(0x0061_2000, 4 * 1024, 0, 0); + guest_txvq.avail.ring[0].set(0); + guest_txvq.avail.idx.set(1); + + set_pkt_len(8 * 1024, &guest_txvq.dtable[0], &test_ctx.mem); + + let pkt = VsockPacket::from_tx_virtq_head( + &mut queue.iter(&test_ctx.mem).unwrap().next().unwrap(), + None, + ) + .unwrap(); + + assert_eq!(pkt.len(), 8 * 1024); + assert_eq!(pkt.buf().unwrap().len(), 8 * 1024); + } + #[test] fn test_rx_packet_assembly() { // Test case: successful RX packet assembly.