block: qcow: support compressed clusters (zlib, zstd)

Add support for reading and writing compressed clusters, using either
zlib or zstd compression.

L2 cache: store entire L2 entries, not only standard cluster addresses.

Read path. A single file offset cannot address data stored inside a
compressed cluster, so QcowFile.file_offset_read() is replaced with
QcowFile.file_read(). This method reads the cluster, decompresses it if
necessary and returns the data to the caller.

Write path. QcowFile.file_offset_write(): since writing into compressed
clusters is not generally possible, allocate a new standard
(non-compressed) cluster when a compressed L2 entry is encountered; then
decompress the compressed cluster into the new cluster; then return the
offset inside the new cluster to the caller. Processing of standard
clusters is not changed.

Signed-off-by: Eugene Korenevsky <ekorenevsky@aliyun.com>
This commit is contained in:
Eugene Korenevsky 2025-11-04 03:34:56 +03:00 committed by Rob Bradford
parent 4d79709b5e
commit aa67250049
4 changed files with 311 additions and 72 deletions

59
Cargo.lock generated
View file

@ -321,6 +321,7 @@ version = "0.1.0"
dependencies = [
"byteorder",
"crc-any",
"flate2",
"io-uring",
"libc",
"log",
@ -334,6 +335,7 @@ dependencies = [
"vm-memory",
"vm-virtio",
"vmm-sys-util",
"zstd",
]
[[package]]
@ -368,6 +370,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7"
dependencies = [
"find-msvc-tools",
"jobserver",
"libc",
"shlex",
]
@ -728,6 +732,16 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
[[package]]
name = "flate2"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "flume"
version = "0.11.1"
@ -1076,6 +1090,16 @@ dependencies = [
"syn",
]
[[package]]
name = "jobserver"
version = "0.1.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
dependencies = [
"getrandom 0.3.3",
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.77"
@ -1235,6 +1259,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
dependencies = [
"adler2",
"simd-adler32",
]
[[package]]
@ -1965,6 +1990,12 @@ dependencies = [
"libc",
]
[[package]]
name = "simd-adler32"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
[[package]]
name = "slab"
version = "0.4.11"
@ -2809,6 +2840,34 @@ dependencies = [
"syn",
]
[[package]]
name = "zstd"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "7.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
dependencies = [
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.16+zstd.1.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
dependencies = [
"cc",
"pkg-config",
]
[[package]]
name = "zvariant"
version = "5.7.0"

View file

@ -11,6 +11,7 @@ io_uring = ["dep:io-uring"]
[dependencies]
byteorder = { workspace = true }
crc-any = "2.5.0"
flate2 = "1.0"
io-uring = { version = "0.7.10", optional = true }
libc = { workspace = true }
log = { workspace = true }
@ -28,6 +29,7 @@ vm-memory = { workspace = true, features = [
] }
vm-virtio = { path = "../vm-virtio" }
vmm-sys-util = { workspace = true }
zstd = "0.13"
[lints]
workspace = true

87
block/src/qcow/decoder.rs Normal file
View file

@ -0,0 +1,87 @@
// Copyright 2025 The Cloud Hypervisor Authors. All rights reserved.
//
// SPDX-License-Identifier: Apache-2.0
use thiserror::Error;

/// Errors that can occur while decompressing a qcow2 compressed cluster.
#[derive(Debug, Error)]
pub enum Error {
    /// flate2 rejected the deflate stream as malformed.
    #[error("Zlib decompress error")]
    ZlibDecompress(#[source] flate2::DecompressError),
    /// Decompression finished without reaching the deflate stream end
    /// (e.g. truncated input or an output buffer that is too small).
    #[error("Zlib unexpected status: {0:?}")]
    ZlibUnexpectedStatus(flate2::Status),
    /// Failed to construct the zstd stream decoder (e.g. bad frame header).
    #[error("Zstd decompress error")]
    ZstdDecompress(#[source] std::io::Error),
    /// Reading decompressed bytes out of the zstd decoder failed.
    #[error("Zstd: failed to fill buffer")]
    ZstdFillBuffer(#[source] std::io::Error),
}

/// Result alias for decoder operations.
pub type Result<T> = std::result::Result<T, Error>;
/// Generic trait for decoding zlib/zstd formats
pub trait Decoder {
    /// Decompress `input` into `output`, returning the number of bytes
    /// written to `output` on success.
    fn decode(&self, input: &[u8], output: &mut [u8]) -> Result<usize>;
}
#[derive(Default)]
pub struct ZlibDecoder {}
impl Decoder for ZlibDecoder {
fn decode(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
use flate2::{Decompress, FlushDecompress, Status};
let mut decompressor = Decompress::new(false);
let status = decompressor
.decompress(input, output, FlushDecompress::Finish)
.map_err(Error::ZlibDecompress)?;
if status == Status::StreamEnd {
Ok(decompressor.total_out() as usize)
} else {
Err(Error::ZlibUnexpectedStatus(status))
}
}
}
#[derive(Default)]
pub struct ZstdDecoder {}
impl Decoder for ZstdDecoder {
fn decode(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
use std::io::Read;
let mut decoder = zstd::stream::read::Decoder::new(input).map_err(Error::ZstdDecompress)?;
let decoded_size = decoder.read(output).map_err(Error::ZstdFillBuffer)?;
Ok(decoded_size)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_zlib_decode() {
        let d = ZlibDecoder::default();
        // Raw deflate stream encoding the bytes 0x00 0x01 0x02 0x03.
        let valid_input = vec![99, 96, 100, 98, 6, 0];
        let mut output1 = vec![0; 4];
        // Verify the reported decompressed size as well as the payload.
        let decoded = d.decode(&valid_input, &mut output1).unwrap();
        assert_eq!(decoded, 4);
        assert_eq!(&output1, b"\x00\x01\x02\x03");
        // Garbage input must be rejected rather than silently decoded.
        let invalid_input = vec![1, 2, 3, 4];
        let mut output2 = vec![0; 1024];
        d.decode(&invalid_input, &mut output2).unwrap_err();
    }

    #[test]
    fn test_zstd_decode() {
        let d = ZstdDecoder::default();
        // Minimal zstd frame encoding the bytes 0x01 0xfe.
        let valid_input = vec![40, 181, 47, 253, 32, 2, 17, 0, 0, 1, 254];
        let mut output1 = vec![0; 2];
        let decoded = d.decode(&valid_input, &mut output1).unwrap();
        assert_eq!(decoded, 2);
        assert_eq!(&output1, b"\x01\xfe");
        // An invalid frame header must produce an error.
        let invalid_input = vec![1, 2, 3, 4];
        let mut output2 = vec![0; 1024];
        d.decode(&invalid_input, &mut output2).unwrap_err();
    }
}

View file

@ -4,6 +4,7 @@
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
mod decoder;
mod qcow_raw_file;
mod raw_file;
mod refcount;
@ -17,7 +18,7 @@ use std::os::fd::{AsRawFd, RawFd};
use std::str;
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use libc::{EINVAL, ENOSPC, ENOTSUP};
use libc::{EINVAL, EIO, ENOSPC};
use remain::sorted;
use thiserror::Error;
use vmm_sys_util::file_traits::{FileSetLen, FileSync};
@ -25,6 +26,7 @@ use vmm_sys_util::seek_hole::SeekHole;
use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt};
use crate::BlockBackend;
use crate::qcow::decoder::{Decoder, ZlibDecoder, ZstdDecoder};
use crate::qcow::qcow_raw_file::QcowRawFile;
pub use crate::qcow::raw_file::RawFile;
use crate::qcow::refcount::RefCount;
@ -42,8 +44,6 @@ pub enum Error {
BackingFileOpen(#[source] Box<Error>),
#[error("Backing file name is too long: {0} bytes over")]
BackingFileTooLong(usize),
#[error("Compressed blocks not supported")]
CompressedBlocksNotSupported,
#[error("Failed to evict cache")]
EvictingCache(#[source] io::Error),
#[error("File larger than max of {MAX_QCOW_FILE_SIZE}: {0}")]
@ -110,6 +110,8 @@ pub enum Error {
TooManyL1Entries(u64),
#[error("Ref count table too large: {0}")]
TooManyRefcounts(u64),
#[error("Unsupported compression type")]
UnsupportedCompressionType,
#[error("Unsupported refcount order")]
UnsupportedRefcountOrder,
#[error("Unsupported version: {0}")]
@ -127,6 +129,12 @@ pub enum ImageType {
Qcow2,
}
#[derive(Clone, Debug)]
pub enum CompressionType {
Zlib,
Zstd,
}
// Maximum data size supported.
const MAX_QCOW_FILE_SIZE: u64 = 0x01 << 44; // 16 TB.
@ -153,15 +161,50 @@ const L1_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00;
const L2_TABLE_OFFSET_MASK: u64 = 0x00ff_ffff_ffff_fe00;
// Flags
const COMPRESSED_FLAG: u64 = 1 << 62;
const COMPRESSED_SECTOR_SIZE: u64 = 512;
const CLUSTER_USED_FLAG: u64 = 1 << 63;
const COMPATIBLE_FEATURES_LAZY_REFCOUNTS: u64 = 1;
// Compression types as defined in https://www.qemu.org/docs/master/interop/qcow2.html
const COMPRESSION_TYPE_ZLIB: u64 = 0; // zlib/deflate <https://www.ietf.org/rfc/rfc1951.txt>
const COMPRESSION_TYPE_ZSTD: u64 = 1; // zstd <http://github.com/facebook/zstd>
// The format supports a "header extension area", that crosvm does not use.
const QCOW_EMPTY_HEADER_EXTENSION_SIZE: u32 = 8;
// Defined by the specification
const MAX_BACKING_FILE_SIZE: u32 = 1023;
// An all-zero L2 entry denotes an unallocated cluster.
fn l2_entry_is_empty(l2_entry: u64) -> bool {
    matches!(l2_entry, 0)
}
fn l2_entry_is_compressed(l2_entry: u64) -> bool {
l2_entry & COMPRESSED_FLAG != 0
}
// Get file offset and size of compressed cluster data.
//
// A compressed L2 entry packs, below the two flag bits, a sector count and
// the host-file offset of the compressed data (see the "Compressed Clusters
// Descriptor" in the qcow2 specification). The data need not start on a
// sector boundary.
fn l2_entry_compressed_cluster_layout(l2_entry: u64, cluster_bits: u32) -> (u64, usize) {
    // The size field occupies (cluster_bits - 8) bits directly below bit 62.
    let compressed_size_shift = 62 - (cluster_bits - 8);
    let compressed_size_mask = (1 << (cluster_bits - 8)) - 1;
    // All bits below the size field form the host offset of the data.
    let compressed_cluster_addr = l2_entry & ((1 << compressed_size_shift) - 1);
    // The stored value is the number of additional 512-byte sectors, i.e.
    // sector count minus one.
    let nsectors = (l2_entry >> compressed_size_shift & compressed_size_mask) + 1;
    // Subtract the offset within the first sector to get the byte length of
    // the compressed data span to read from the file.
    let compressed_cluster_size = ((nsectors * COMPRESSED_SECTOR_SIZE)
        - (compressed_cluster_addr & (COMPRESSED_SECTOR_SIZE - 1)))
        as usize;
    (compressed_cluster_addr, compressed_cluster_size)
}
// Extract the host-file offset of a standard (non-compressed) cluster from
// its L2 entry by masking off the flag and reserved bits.
fn l2_entry_std_cluster_addr(l2_entry: u64) -> u64 {
    L2_TABLE_OFFSET_MASK & l2_entry
}
// Build an L2 entry for a standard (non-compressed) cluster: the cluster's
// host offset with the "used" (copied) flag set.
fn l2_entry_make_std(cluster_addr: u64) -> u64 {
    CLUSTER_USED_FLAG | (cluster_addr & L2_TABLE_OFFSET_MASK)
}
/// Contains the information from the header of a qcow file.
#[derive(Clone, Debug)]
pub struct QcowHeader {
@ -190,6 +233,7 @@ pub struct QcowHeader {
pub autoclear_features: u64,
pub refcount_order: u32,
pub header_size: u32,
pub compression_type: CompressionType,
// Post-header entries
pub backing_file_path: Option<String>,
@ -255,8 +299,19 @@ impl QcowHeader {
} else {
read_u32_from_file(f)?
},
compression_type: CompressionType::Zlib,
backing_file_path: None,
};
if version == 3 && header.header_size > V3_BARE_HEADER_SIZE {
let raw_compression_type = read_u64_from_file(f)? >> (64 - 8);
header.compression_type = if raw_compression_type == COMPRESSION_TYPE_ZLIB {
Ok(CompressionType::Zlib)
} else if raw_compression_type == COMPRESSION_TYPE_ZSTD {
Ok(CompressionType::Zstd)
} else {
Err(Error::UnsupportedCompressionType)
}?;
}
if header.backing_file_size > MAX_BACKING_FILE_SIZE {
return Err(Error::BackingFileTooLong(header.backing_file_size as usize));
}
@ -274,6 +329,13 @@ impl QcowHeader {
Ok(header)
}
/// Return the decoder matching the image's compression type as parsed from
/// the header extension (defaults to zlib when the field is absent).
pub fn get_decoder(&self) -> Box<dyn Decoder> {
    match self.compression_type {
        CompressionType::Zlib => Box::new(ZlibDecoder {}),
        CompressionType::Zstd => Box::new(ZstdDecoder {}),
    }
}
pub fn create_for_size_and_path(
version: u32,
size: u64,
@ -337,6 +399,7 @@ impl QcowHeader {
autoclear_features: 0,
refcount_order: DEFAULT_REFCOUNT_ORDER,
header_size,
compression_type: CompressionType::Zlib,
backing_file_path: backing_file.map(String::from),
})
}
@ -588,17 +651,6 @@ impl QcowFile {
let l2_entries = cluster_size / size_of::<u64>() as u64;
// Check for compressed blocks
for l2_addr_disk in l1_table.get_values() {
if *l2_addr_disk != 0
&& let Err(e) = Self::read_l2_cluster(&mut raw_file, *l2_addr_disk)
&& let Some(os_error) = e.raw_os_error()
&& os_error == ENOTSUP
{
return Err(Error::CompressedBlocksNotSupported);
}
}
let mut qcow = QcowFile {
raw_file,
header,
@ -714,11 +766,7 @@ impl QcowFile {
let raw_file = &mut self.raw_file;
self.l2_cache
.insert(l1_index, table, |index, evicted| {
raw_file.write_pointer_table(
l1_table[index],
evicted.get_values(),
CLUSTER_USED_FLAG,
)
raw_file.write_pointer_table(l1_table[index], evicted.get_values(), 0)
})
.map_err(Error::EvictingCache)?;
}
@ -1050,11 +1098,40 @@ impl QcowFile {
(address / self.raw_file.cluster_size()) % self.l2_entries
}
// Gets the offset of the given guest address in the host file. If L1, L2, or data clusters have
// yet to be allocated, return None.
fn file_offset_read(&mut self, address: u64) -> std::io::Result<Option<u64>> {
// Decompress the cluster referenced by a compressed L2 entry, returning the
// full decompressed cluster contents. Any read or decode failure, or a
// result shorter than one cluster, is reported as EIO.
fn decompress_l2_cluster(&mut self, l2_entry: u64) -> std::io::Result<Vec<u8>> {
    // Locate the compressed data span described by the L2 entry.
    let (compressed_cluster_addr, compressed_cluster_size) =
        l2_entry_compressed_cluster_layout(l2_entry, self.header.cluster_bits);
    // Read compressed cluster from raw file
    self.raw_file
        .file_mut()
        .seek(SeekFrom::Start(compressed_cluster_addr))?;
    let mut compressed_cluster = vec![0; compressed_cluster_size];
    self.raw_file
        .file_mut()
        .read_exact(&mut compressed_cluster)?;
    // Pick the zlib/zstd decoder according to the image header.
    let decoder = self.header.get_decoder();
    // Decompress
    let cluster_size = self.raw_file.cluster_size() as usize;
    let mut decompressed_cluster = vec![0; cluster_size];
    let decompressed_size = decoder
        .decode(&compressed_cluster, &mut decompressed_cluster)
        .map_err(|_| std::io::Error::from_raw_os_error(EIO))?;
    // A compressed cluster must decompress to exactly one cluster.
    if decompressed_size as u64 != self.raw_file.cluster_size() {
        return Err(std::io::Error::from_raw_os_error(EIO));
    }
    Ok(decompressed_cluster)
}
fn file_read(
&mut self,
address: u64,
count: usize,
buf: &mut [u8],
) -> std::io::Result<Option<()>> {
let err_inval = std::io::Error::from_raw_os_error(EINVAL);
if address >= self.virtual_size() {
return Err(std::io::Error::from_raw_os_error(EINVAL));
return Err(err_inval);
}
let l1_index = self.l1_table_index(address) as usize;
@ -1072,11 +1149,28 @@ impl QcowFile {
self.cache_l2_cluster(l1_index, l2_addr_disk, false)?;
let cluster_addr = self.l2_cache.get(l1_index).unwrap()[l2_index];
if cluster_addr == 0 {
let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index];
if l2_entry_is_empty(l2_entry) {
// Reading from an unallocated cluster will return zeros.
return Ok(None);
} else if l2_entry_is_compressed(l2_entry) {
// Compressed cluster.
// Read it, decompress, then return slice from decompressed data.
let mut decompressed_cluster = self.decompress_l2_cluster(l2_entry)?;
decompressed_cluster.resize(self.raw_file.cluster_size() as usize, 0);
let start = self.raw_file.cluster_offset(address) as usize;
let end = start.checked_add(count);
if end.is_none() || end.unwrap() > decompressed_cluster.len() {
return Err(err_inval);
}
buf[..count].copy_from_slice(&decompressed_cluster[start..end.unwrap()]);
} else {
let start = l2_entry_std_cluster_addr(l2_entry) + self.raw_file.cluster_offset(address);
let raw_file = self.raw_file.file_mut();
raw_file.seek(SeekFrom::Start(start))?;
raw_file.read_exact(buf)?;
}
Ok(Some(cluster_addr + self.raw_file.cluster_offset(address)))
Ok(Some(()))
}
// Gets the offset of the given guest address in the host file. If L1, L2, or data clusters need
@ -1100,24 +1194,39 @@ impl QcowFile {
set_refcounts.push((new_addr, 1));
}
let cluster_addr = match self.l2_cache.get(l1_index).unwrap()[l2_index] {
0 => {
let initial_data = if let Some(backing) = self.backing_file.as_mut() {
let cluster_size = self.raw_file.cluster_size();
let cluster_begin = address - (address % cluster_size);
let mut cluster_data = vec![0u8; cluster_size as usize];
backing.seek(SeekFrom::Start(cluster_begin))?;
backing.read_exact(&mut cluster_data)?;
Some(cluster_data)
} else {
None
};
// Need to allocate a data cluster
let cluster_addr = self.append_data_cluster(initial_data)?;
self.update_cluster_addr(l1_index, l2_index, cluster_addr, &mut set_refcounts)?;
cluster_addr
let l2_entry = self.l2_cache.get(l1_index).unwrap()[l2_index];
let cluster_addr = if l2_entry_is_compressed(l2_entry) {
// Writing to compressed cluster.
// Allocate new cluster, decompress into new cluster, then use
// offset of new cluster.
let decompressed_cluster = self.decompress_l2_cluster(l2_entry)?;
let cluster_addr = self.append_data_cluster(None)?;
self.update_cluster_addr(l1_index, l2_index, cluster_addr, &mut set_refcounts)?;
self.raw_file
.file_mut()
.seek(SeekFrom::Start(cluster_addr))?;
let nwritten = self.raw_file.file_mut().write(&decompressed_cluster)?;
if nwritten != decompressed_cluster.len() {
return Err(std::io::Error::from_raw_os_error(EIO));
}
a => a,
cluster_addr
} else if l2_entry_is_empty(l2_entry) {
let initial_data = if let Some(backing) = self.backing_file.as_mut() {
let cluster_size = self.raw_file.cluster_size();
let cluster_begin = address - (address % cluster_size);
let mut cluster_data = vec![0u8; cluster_size as usize];
backing.seek(SeekFrom::Start(cluster_begin))?;
backing.read_exact(&mut cluster_data)?;
Some(cluster_data)
} else {
None
};
// Need to allocate a data cluster
let cluster_addr = self.append_data_cluster(initial_data)?;
self.update_cluster_addr(l1_index, l2_index, cluster_addr, &mut set_refcounts)?;
cluster_addr
} else {
l2_entry_std_cluster_addr(l2_entry)
};
for (addr, count) in set_refcounts {
@ -1157,7 +1266,7 @@ impl QcowFile {
self.l1_table[l1_index] = new_addr;
}
// 'unwrap' is OK because it was just added.
self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = cluster_addr;
self.l2_cache.get_mut(l1_index).unwrap()[l2_index] = l2_entry_make_std(cluster_addr);
Ok(())
}
@ -1334,10 +1443,9 @@ impl QcowFile {
// Partial cluster - zero out the relevant bytes if it was allocated.
// Any space in unallocated clusters can be left alone, since
// unallocated clusters already read back as zeroes.
if let Some(offset) = self.file_offset_read(curr_addr)? {
// Partial cluster - zero it out.
self.raw_file.file_mut().write_zeroes_at(offset, count)?;
}
let offset = self.file_offset_write(curr_addr)?;
// Partial cluster - zero it out.
self.raw_file.file_mut().write_zeroes_at(offset, count)?;
}
nwritten += count;
@ -1348,14 +1456,8 @@ impl QcowFile {
// Reads an L2 cluster from the disk, returning an error if the file can't be read.
// Compressed entries are kept in the table as-is and handled by the callers.
fn read_l2_cluster(raw_file: &mut QcowRawFile, cluster_addr: u64) -> std::io::Result<Vec<u64>> {
let file_values = raw_file.read_pointer_cluster(cluster_addr, None)?;
if file_values.iter().any(|entry| entry & COMPRESSED_FLAG != 0) {
return Err(std::io::Error::from_raw_os_error(ENOTSUP));
}
Ok(file_values
.iter()
.map(|entry| *entry & L2_TABLE_OFFSET_MASK)
.collect())
let l2_table = raw_file.read_pointer_cluster(cluster_addr, None)?;
Ok(l2_table)
}
// Put an L2 cluster to the cache with evicting less-used cluster
@ -1383,11 +1485,7 @@ impl QcowFile {
let l1_table = &self.l1_table;
let raw_file = &mut self.raw_file;
self.l2_cache.insert(l1_index, l2_table, |index, evicted| {
raw_file.write_pointer_table(
l1_table[index],
evicted.get_values(),
CLUSTER_USED_FLAG,
)
raw_file.write_pointer_table(l1_table[index], evicted.get_values(), 0)
})?;
}
Ok(new_cluster)
@ -1456,11 +1554,8 @@ impl QcowFile {
// The index must be valid from when we inserted it.
let addr = self.l1_table[*l1_index];
if addr != 0 {
self.raw_file.write_pointer_table(
addr,
l2_table.get_values(),
CLUSTER_USED_FLAG,
)?;
self.raw_file
.write_pointer_table(addr, l2_table.get_values(), 0)?;
} else {
return Err(std::io::Error::from_raw_os_error(EINVAL));
}
@ -1512,14 +1607,10 @@ impl Read for QcowFile {
let mut nread: usize = 0;
while nread < read_count {
let curr_addr = address + nread as u64;
let file_offset = self.file_offset_read(curr_addr)?;
let count = self.limit_range_cluster(curr_addr, read_count - nread);
if let Some(offset) = file_offset {
self.raw_file.file_mut().seek(SeekFrom::Start(offset))?;
self.raw_file
.file_mut()
.read_exact(&mut buf[nread..(nread + count)])?;
if (self.file_read(curr_addr, count, &mut buf[nread..(nread + count)])?).is_some() {
// Data is successfully read from the cluster
} else if let Some(backing) = self.backing_file.as_mut() {
backing.seek(SeekFrom::Start(curr_addr))?;
backing.read_exact(&mut buf[nread..(nread + count)])?;