hypervisor: AMX state snapshot and restore support

The TILE data state of AMX may require 8KB or more of space, so calling the
legacy KVM_GET_XSAVE fails because KVM_GET_XSAVE can only return 4KB of
data. This patch adds KVM_GET_XSAVE2 support so that the larger state can be
captured in a snapshot and restored.

Fixes: #7533

Signed-off-by: Songqian Li <sionli@tencent.com>
This commit is contained in:
Songqian Li 2025-12-03 17:35:10 +08:00 committed by Rob Bradford
parent 5c5f33050c
commit 4b4954ff86
3 changed files with 152 additions and 16 deletions

View file

@ -315,12 +315,19 @@ pub struct MsrEntry {
/// Serializable copy of a vCPU's xsave state.
pub struct XsaveState {
/// The fixed-size part of the state: 1024 `u32` words (4096 bytes).
#[serde_as(as = "[_; 1024usize]")]
pub region: [u32; 1024usize],
/// Extra data to support xsave2: the words that follow `region` when the state
/// does not fit in the fixed-size part. Empty when there is no such data.
/// The field is omitted from serialized output when empty and defaults to an
/// empty vector when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub extra: Vec<u32>,
}
impl Default for XsaveState {
fn default() -> Self {
// SAFETY: this is plain old data structure
unsafe { ::std::mem::zeroed() }
Self {
// SAFETY: this is plain old data structure
region: unsafe { std::mem::zeroed() },
extra: Vec::new(),
}
}
}

View file

@ -107,11 +107,13 @@ use kvm_bindings::{
use kvm_bindings::{KVM_REG_RISCV_CORE, kvm_riscv_core};
#[cfg(feature = "tdx")]
use kvm_bindings::{KVM_X86_DEFAULT_VM, KVM_X86_SW_PROTECTED_VM, KVMIO, kvm_run__bindgen_ty_1};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{Xsave as xsave2, kvm_xsave2};
pub use kvm_ioctls::{Cap, Kvm, VcpuExit};
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(target_arch = "x86_64")]
use vmm_sys_util::ioctl_io_nr;
use vmm_sys_util::{fam::FamStruct, ioctl_io_nr};
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_iowr_nr};
pub use {kvm_bindings, kvm_ioctls};
@ -120,6 +122,9 @@ pub use {kvm_bindings, kvm_ioctls};
use crate::RegList;
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use crate::kvm::x86_64::XsaveStateError;
#[cfg(target_arch = "x86_64")]
ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);
@ -564,6 +569,17 @@ impl vm::Vm for KvmVm {
.fd
.create_vcpu(id as u64)
.map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
#[cfg(target_arch = "x86_64")]
// Safety: `xsave_size` will not change after vcpu creation because:
// 1. `xsave_size` depends on cpuid
// 2. The only factor that affects cpuid is xsave permission, obtained via
// `ARCH_GET_XCOMP_GUEST_PERM`
// 3. This permission is already acquired before vcpu creation
// Therefore, cpuid remains unchanged after vcpu creation, and so does `xsave_size`.
//
// First vCPU allocation locks the permissions of `ARCH_GET_XCOMP_GUEST_PERM`.
let xsave_size = self.fd.check_extension_int(Cap::Xsave2);
let vcpu = KvmVcpu {
fd,
#[cfg(target_arch = "x86_64")]
@ -571,6 +587,8 @@ impl vm::Vm for KvmVm {
vm_ops,
#[cfg(target_arch = "x86_64")]
hyperv_synic: AtomicBool::new(false),
#[cfg(target_arch = "x86_64")]
xsave_size,
};
Ok(Box::new(vcpu))
}
@ -1332,6 +1350,8 @@ pub struct KvmVcpu {
vm_ops: Option<Arc<dyn vm::VmOps>>,
#[cfg(target_arch = "x86_64")]
hyperv_synic: AtomicBool,
#[cfg(target_arch = "x86_64")]
xsave_size: i32,
}
/// Implementation of Vcpu trait for KVM
@ -2329,7 +2349,11 @@ impl cpu::Vcpu for KvmVcpu {
let mp_state = self.get_mp_state()?.into();
let regs = self.get_regs()?;
let sregs = self.get_sregs()?;
let xsave = self.get_xsave()?;
let xsave = if self.xsave_size > 0 {
self.get_xsave2()?
} else {
self.get_xsave()?
};
let xcrs = self.get_xcrs()?;
let lapic_state = self.get_lapic()?;
let fpu = self.get_fpu()?;
@ -2566,7 +2590,11 @@ impl cpu::Vcpu for KvmVcpu {
self.set_mp_state(state.mp_state.into())?;
self.set_regs(&state.regs.into())?;
self.set_sregs(&state.sregs.into())?;
self.set_xsave(&state.xsave)?;
if self.xsave_size > 0 {
self.set_xsave2(&state.xsave)?;
} else {
self.set_xsave(&state.xsave)?;
}
self.set_xcrs(&state.xcrs)?;
self.set_lapic(&state.lapic_state)?;
self.set_fpu(&state.fpu)?;
@ -2856,7 +2884,10 @@ impl KvmVcpu {
/// X86 specific call that sets the vcpu's current "xsave struct".
///
fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
let xsave: kvm_bindings::kvm_xsave = (*xsave)
.clone()
.try_into()
.map_err(|e: XsaveStateError| cpu::HypervisorCpuError::GetXsaveState(e.into()))?;
// SAFETY: Here we trust the kernel not to read past the end of the kvm_xsave struct
// when calling the kvm-ioctl library function.
unsafe {
@ -2866,6 +2897,53 @@ impl KvmVcpu {
}
}
#[cfg(target_arch = "x86_64")]
/// X86 specific call that returns the vcpu's current "xsave struct" using the extended
/// xsave2 interface which supports larger state buffers (>4KB) for features like Intel AMX.
///
/// This method requires the KVM_CAP_XSAVE2 capability and uses the KVM_GET_XSAVE2 ioctl.
/// The receive buffer is allocated here and sized from `self.xsave_size`, the value
/// reported for KVM_CAP_XSAVE2 that was stored in this vCPU.
///
/// # Errors
///
/// Returns `HypervisorCpuError::GetXsaveState` if the buffer cannot be allocated or the
/// ioctl fails.
///
/// # Panics
///
/// Panics if `xsave_size` is not positive, i.e. KVM_CAP_XSAVE2 was not reported.
pub fn get_xsave2(&self) -> cpu::Result<XsaveState> {
    assert!(
        self.xsave_size > 0,
        "'xsave_size' must be initialized via 'KVM_CAP_XSAVE2' first"
    );
    // Number of flexible-array entries needed on top of the fixed `kvm_xsave` part.
    // `saturating_sub` keeps this at zero (instead of underflowing) should the reported
    // size ever be smaller than the fixed part.
    let fam_size = (self.xsave_size as usize)
        .saturating_sub(size_of::<kvm_bindings::kvm_xsave>())
        .div_ceil(size_of::<<kvm_xsave2 as FamStruct>::Entry>());
    let mut xsave =
        xsave2::new(fam_size).map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?;
    // SAFETY: `xsave` was allocated just above with room for at least `xsave_size`
    // bytes, which is the size reported for KVM_CAP_XSAVE2.
    unsafe { self.fd.get_xsave2(&mut xsave) }
        .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?;
    Ok((&xsave).into())
}
#[cfg(target_arch = "x86_64")]
/// X86 specific call that sets the vcpu's current "xsave struct" using the extended
/// xsave2 interface which supports larger state buffers (>4KB) for features like Intel AMX.
///
/// This method uses KVM_SET_XSAVE ioctl but with extended buffer support when
/// KVM_CAP_XSAVE2 is available.
pub fn set_xsave2(&self, xsave_state: &XsaveState) -> cpu::Result<()> {
assert!(
self.xsave_size > 0,
"'xsave_size' must be initialized via 'KVM_CAP_XSAVE2' first"
);
let xsave = xsave_state
.to_xsave2()
.map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))?;
// SAFETY: The caller guarantees that xsave contains valid data
unsafe {
self.fd
.set_xsave2(&xsave)
.map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
}
}
#[cfg(target_arch = "x86_64")]
///
/// X86 specific call that returns the vcpu's current "xcrs".

View file

@ -8,14 +8,17 @@
//
//
use log::error;
use serde::{Deserialize, Serialize};
use thiserror::Error;
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
kvm_bindings::CpuId, kvm_bindings::KVM_CPUID_FLAG_SIGNIFCANT_INDEX, kvm_bindings::MsrList,
kvm_bindings::Msrs as MsrEntries, kvm_bindings::kvm_cpuid_entry2, kvm_bindings::kvm_dtable,
kvm_bindings::kvm_fpu, kvm_bindings::kvm_lapic_state, kvm_bindings::kvm_mp_state as MpState,
kvm_bindings::Msrs as MsrEntries, kvm_bindings::Xsave as xsave2,
kvm_bindings::kvm_cpuid_entry2, kvm_bindings::kvm_dtable, kvm_bindings::kvm_fpu,
kvm_bindings::kvm_lapic_state, kvm_bindings::kvm_mp_state as MpState,
kvm_bindings::kvm_msr_entry, kvm_bindings::kvm_regs, kvm_bindings::kvm_segment,
kvm_bindings::kvm_sregs, kvm_bindings::kvm_vcpu_events as VcpuEvents,
kvm_bindings::kvm_xcrs as ExtendedControlRegisters, kvm_bindings::kvm_xsave,
@ -294,17 +297,65 @@ impl From<MsrEntry> for kvm_msr_entry {
}
}
impl From<kvm_xsave> for XsaveState {
fn from(s: kvm_xsave) -> Self {
Self { region: s.region }
}
/// Errors raised when converting an `XsaveState` into a `kvm_xsave`.
#[derive(Error, Debug)]
pub enum XsaveStateError {
/// The `XsaveState` being converted has a non-empty `extra` field, so the
/// conversion to `kvm_xsave` is refused.
#[error("kvm_xsave extra field is not empty")]
XsaveExtraFieldNotEmpty,
}
impl From<XsaveState> for kvm_xsave {
fn from(s: XsaveState) -> Self {
impl From<kvm_xsave> for XsaveState {
fn from(value: kvm_xsave) -> Self {
// Check if kvm_xsave struct size is larger than region size, indicating extra data exists
assert_eq!(
size_of_val(&value),
size_of_val(&value.region),
"kvm_xsave extra field is not empty"
);
Self {
region: s.region,
extra: Default::default(),
region: value.region,
extra: Vec::new(),
}
}
}
impl TryFrom<XsaveState> for kvm_xsave {
type Error = XsaveStateError;
fn try_from(value: XsaveState) -> Result<Self, Self::Error> {
if !value.extra.is_empty() {
error!("XsaveState extra field is not empty");
return Err(XsaveStateError::XsaveExtraFieldNotEmpty);
}
Ok(Self {
region: value.region,
extra: Default::default(),
})
}
}
impl From<&xsave2> for XsaveState {
    /// Copies an xsave2 buffer into an `XsaveState`.
    ///
    /// The `region` array of the embedded `kvm_xsave` becomes `region`, and the
    /// flexible-array entries of the wrapper become `extra`.
    fn from(xsave: &xsave2) -> Self {
        Self {
            // The safe reference accessor gives the same data as dereferencing the raw
            // pointer, so no `unsafe` is needed here.
            region: xsave.as_fam_struct_ref().xsave.region,
            extra: xsave.as_slice().to_vec(),
        }
    }
}
impl XsaveState {
    /// Builds an xsave2 buffer holding this state.
    ///
    /// The buffer is created with `self.extra.len()` flexible-array entries; `region`
    /// is copied into the embedded `kvm_xsave` and `extra` into those entries.
    ///
    /// # Errors
    ///
    /// Returns the `vmm_sys_util::fam::Error` reported when the buffer cannot be created.
    pub fn to_xsave2(&self) -> Result<xsave2, vmm_sys_util::fam::Error> {
        let mut out = xsave2::new(self.extra.len())?;
        // SAFETY: `out` was just created via `Xsave::new()` with valid allocated memory.
        unsafe {
            (*out.as_mut_fam_struct_ptr()).xsave.region = self.region;
        }
        out.as_mut_slice().copy_from_slice(&self.extra);
        Ok(out)
    }
}