This helps to uncover expensive and needless clones in the code base. For example, I prevented extensive clones in the snapshot path, where (nested) `BTreeMap`s were cloned over and over again. Furthermore, the lint helps developers reason much better about the ownership of parameters. All of these changes were made manually and with the necessary caution. A few structs that are cheap to clone are now `Copy`, so this lint won't trigger for them. I haven't enabled the lint yet, as it is a massive rabbit hole and needs many more fixes. Nevertheless, it is very useful. Signed-off-by: Philipp Schuster <philipp.schuster@cyberus-technology.de> On-behalf-of: SAP philipp.schuster@sap.com
1630 lines
55 KiB
Rust
1630 lines
55 KiB
Rust
// Copyright © 2020, Oracle and/or its affiliates.
|
|
//
|
|
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE-BSD-3-Clause file.
|
|
|
|
pub mod interrupts;
|
|
pub mod layout;
|
|
pub mod regs;
|
|
|
|
#[cfg(feature = "tdx")]
|
|
pub mod tdx;
|
|
|
|
mod mpspec;
|
|
mod mptable;
|
|
mod smbios;
|
|
|
|
use std::arch::x86_64;
|
|
use std::mem;
|
|
|
|
use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry};
|
|
use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError};
|
|
use linux_loader::loader::bootparam::{boot_params, setup_header};
|
|
use linux_loader::loader::elf::start_info::{
|
|
hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info,
|
|
};
|
|
use log::{debug, error, info};
|
|
use thiserror::Error;
|
|
use vm_memory::{
|
|
Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
|
|
GuestMemoryRegion,
|
|
};
|
|
|
|
use crate::{GuestMemoryMmap, InitramfsConfig, RegionType};
|
|
|
|
// While modern architectures support more than 255 CPUs via x2APIC,
// legacy devices such as mptable support at most 254 CPUs.
pub const MAX_SUPPORTED_CPUS_LEGACY: u32 = 254;

// CPUID feature bits

// TSC deadline timer bit in ECX.
#[cfg(feature = "kvm")]
const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24;
// Hypervisor bit in ECX.
const HYPERVISOR_ECX_BIT: u8 = 31;
// MTRR bit in EDX.
const MTRR_EDX_BIT: u8 = 12;
// Invariant TSC bit in EDX of leaf 0x8000_0007.
const INVARIANT_TSC_EDX_BIT: u8 = 8;
// AMX tile computation on bfloat16 numbers.
const AMX_BF16: u8 = 22;
// AMX tile load/store instructions.
const AMX_TILE: u8 = 24;
// AMX tile computation on 8-bit integers.
const AMX_INT8: u8 = 25;

// KVM feature bits

#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_BIT: u8 = 0;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE2_BIT: u8 = 3;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: u8 = 24;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_BIT: u8 = 4;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10;
#[cfg(feature = "tdx")]
const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5;

const KVM_FEATURE_MSI_EXT_DEST_ID: u8 = 15;

pub const _NSIG: i32 = 65;
|
|
|
|
#[derive(Debug, Copy, Clone)]
|
|
/// Specifies the entry point address where the guest must start
|
|
/// executing code, as well as which of the supported boot protocols
|
|
/// is to be used to configure the guest initial state.
|
|
pub struct EntryPoint {
|
|
/// Address in guest memory where the guest must start execution
|
|
pub entry_addr: GuestAddress,
|
|
/// This field is used for bzImage to fill the zero page
|
|
pub setup_header: Option<setup_header>,
|
|
}
|
|
|
|
// Entry type identifiers for the E820 memory map.
const E820_RAM: u32 = 1;
const E820_RESERVED: u32 = 2;
|
|
|
|
/// Knobs that shape the guest CPUID produced by `generate_common_cpuid()`.
pub struct CpuidConfig {
    /// Physical address size, reported in the low byte of EAX of leaf 0x8000_0008.
    pub phys_bits: u8,
    /// Replace the KVM hypervisor leaves with the Hyper-V emulation leaves.
    pub kvm_hyperv: bool,
    /// Apply the TDX specific CPUID restrictions.
    #[cfg(feature = "tdx")]
    pub tdx: bool,
    /// Keep the AMX feature bits; when `false` they are cleared from leaf 0x7.
    pub amx: bool,
}
|
|
|
|
#[derive(Debug, Error)]
|
|
pub enum Error {
|
|
/// Error writing MP table to memory.
|
|
#[error("Error writing MP table to memory")]
|
|
MpTableSetup(#[source] mptable::Error),
|
|
|
|
/// Error configuring the general purpose registers
|
|
#[error("Error configuring the general purpose registers")]
|
|
RegsConfiguration(#[source] regs::Error),
|
|
|
|
/// Error configuring the special registers
|
|
#[error("Error configuring the special registers")]
|
|
SregsConfiguration(#[source] regs::Error),
|
|
|
|
/// Error configuring the floating point related registers
|
|
#[error("Error configuring the floating point related registers")]
|
|
FpuConfiguration(#[source] regs::Error),
|
|
|
|
/// Error configuring the MSR registers
|
|
#[error("Error configuring the MSR registers")]
|
|
MsrsConfiguration(#[source] regs::Error),
|
|
|
|
/// Failed to set supported CPUs.
|
|
#[error("Failed to set supported CPUs")]
|
|
SetSupportedCpusFailed(#[source] anyhow::Error),
|
|
|
|
/// Cannot set the local interruption due to bad configuration.
|
|
#[error("Cannot set the local interruption due to bad configuration")]
|
|
LocalIntConfiguration(#[source] anyhow::Error),
|
|
|
|
/// Error setting up SMBIOS table
|
|
#[error("Error setting up SMBIOS table")]
|
|
SmbiosSetup(#[source] smbios::Error),
|
|
|
|
/// Error getting supported CPUID through the hypervisor (kvm/mshv) API
|
|
#[error("Error getting supported CPUID through the hypervisor API")]
|
|
CpuidGetSupported(#[source] HypervisorError),
|
|
|
|
/// Error populating CPUID with KVM HyperV emulation details
|
|
#[error("Error populating CPUID with KVM HyperV emulation details")]
|
|
CpuidKvmHyperV(#[source] vmm_sys_util::fam::Error),
|
|
|
|
/// Error populating CPUID with CPU identification
|
|
#[error("Error populating CPUID with CPU identification")]
|
|
CpuidIdentification(#[source] vmm_sys_util::fam::Error),
|
|
|
|
/// Error checking CPUID compatibility
|
|
#[error("Error checking CPUID compatibility")]
|
|
CpuidCheckCompatibility,
|
|
|
|
// Error writing EBDA address
|
|
#[error("Error writing EBDA address")]
|
|
EbdaSetup(#[source] vm_memory::GuestMemoryError),
|
|
|
|
// Error getting CPU TSC frequency
|
|
#[error("Error getting CPU TSC frequency")]
|
|
GetTscFrequency(#[source] HypervisorCpuError),
|
|
|
|
/// Error retrieving TDX capabilities through the hypervisor (kvm/mshv) API
|
|
#[cfg(feature = "tdx")]
|
|
#[error("Error retrieving TDX capabilities through the hypervisor API")]
|
|
TdxCapabilities(#[source] HypervisorError),
|
|
|
|
/// Failed to configure E820 map for bzImage
|
|
#[error("Failed to configure E820 map for bzImage")]
|
|
E820Configuration,
|
|
}
|
|
|
|
/// Computes the x2APIC ID of the vCPU with linear index `cpu_id`.
///
/// `topology` is `(threads per core, cores per die, dies per socket, sockets)`;
/// the last component is not needed here. The ID packs the thread, core, die
/// and socket IDs into consecutive bit fields, each just wide enough to hold
/// every ID of its level. Without a topology the ID is `cpu_id` itself.
///
/// # Panics
///
/// Panics if the thread, core or die count of `topology` is zero.
pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u16, u16, u16, u16)>) -> u32 {
    let Some((threads, cores, dies, _sockets)) = topology else {
        return cpu_id;
    };

    // Bits required to encode the IDs `0..count` of one topology level.
    let mask_width = |count: u16| u16::BITS - (count - 1).leading_zeros();
    let thread_mask_width = mask_width(threads);
    let core_mask_width = mask_width(cores);
    let die_mask_width = mask_width(dies);

    // Peel off one level at a time with `u32` arithmetic. Multiplying the
    // `u16` counts with each other first could overflow for large topologies.
    let threads = u32::from(threads);
    let cores = u32::from(cores);
    let dies = u32::from(dies);

    let thread_id = cpu_id % threads;
    let core_index = cpu_id / threads;
    let core_id = core_index % cores;
    let die_index = core_index / cores;
    let die_id = die_index % dies;
    let socket_id = die_index / dies;

    thread_id
        | (core_id << thread_mask_width)
        | (die_id << (thread_mask_width + core_mask_width))
        | (socket_id << (thread_mask_width + core_mask_width + die_mask_width))
}
|
|
|
|
pub fn get_max_x2apic_id(topology: (u16, u16, u16, u16)) -> u32 {
|
|
get_x2apic_id(
|
|
(topology.0 as u32 * topology.1 as u32 * topology.2 as u32 * topology.3 as u32) - 1,
|
|
Some(topology),
|
|
)
|
|
}
|
|
|
|
/// Selects one of the four output registers of a CPUID entry.
#[derive(Copy, Clone, Debug)]
pub enum CpuidReg {
    /// The EAX register.
    EAX,
    /// The EBX register.
    EBX,
    /// The ECX register.
    ECX,
    /// The EDX register.
    EDX,
}
|
|
|
|
/// Describes bits to set in the CPUID entry identified by `function` and `index`.
///
/// Every `*_bit` field that is `Some(n)` sets bit `n` of the corresponding
/// field of the matching entry; `None` leaves that field untouched.
pub struct CpuidPatch {
    /// CPUID function (leaf) of the entry to patch.
    pub function: u32,
    /// CPUID index (sub-leaf) of the entry to patch.
    pub index: u32,
    /// Bit to set in the entry's flags.
    pub flags_bit: Option<u8>,
    /// Bit to set in EAX.
    pub eax_bit: Option<u8>,
    /// Bit to set in EBX.
    pub ebx_bit: Option<u8>,
    /// Bit to set in ECX.
    pub ecx_bit: Option<u8>,
    /// Bit to set in EDX.
    pub edx_bit: Option<u8>,
}
|
|
|
|
impl CpuidPatch {
|
|
pub fn get_cpuid_reg(
|
|
cpuid: &[CpuIdEntry],
|
|
function: u32,
|
|
index: Option<u32>,
|
|
reg: CpuidReg,
|
|
) -> Option<u32> {
|
|
for entry in cpuid.iter() {
|
|
if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
|
|
return match reg {
|
|
CpuidReg::EAX => Some(entry.eax),
|
|
CpuidReg::EBX => Some(entry.ebx),
|
|
CpuidReg::ECX => Some(entry.ecx),
|
|
CpuidReg::EDX => Some(entry.edx),
|
|
};
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
pub fn set_cpuid_reg(
|
|
cpuid: &mut Vec<CpuIdEntry>,
|
|
function: u32,
|
|
index: Option<u32>,
|
|
reg: CpuidReg,
|
|
value: u32,
|
|
) {
|
|
let mut entry_found = false;
|
|
for entry in cpuid.iter_mut() {
|
|
if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
|
|
entry_found = true;
|
|
match reg {
|
|
CpuidReg::EAX => {
|
|
entry.eax = value;
|
|
}
|
|
CpuidReg::EBX => {
|
|
entry.ebx = value;
|
|
}
|
|
CpuidReg::ECX => {
|
|
entry.ecx = value;
|
|
}
|
|
CpuidReg::EDX => {
|
|
entry.edx = value;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if entry_found {
|
|
return;
|
|
}
|
|
|
|
// Entry not found, so let's add it.
|
|
if let Some(index) = index {
|
|
let mut entry = CpuIdEntry {
|
|
function,
|
|
index,
|
|
flags: CPUID_FLAG_VALID_INDEX,
|
|
..Default::default()
|
|
};
|
|
match reg {
|
|
CpuidReg::EAX => {
|
|
entry.eax = value;
|
|
}
|
|
CpuidReg::EBX => {
|
|
entry.ebx = value;
|
|
}
|
|
CpuidReg::ECX => {
|
|
entry.ecx = value;
|
|
}
|
|
CpuidReg::EDX => {
|
|
entry.edx = value;
|
|
}
|
|
}
|
|
|
|
cpuid.push(entry);
|
|
}
|
|
}
|
|
|
|
pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: &[CpuidPatch]) {
|
|
for entry in cpuid {
|
|
for patch in patches.iter() {
|
|
if entry.function == patch.function && entry.index == patch.index {
|
|
if let Some(flags_bit) = patch.flags_bit {
|
|
entry.flags |= 1 << flags_bit;
|
|
}
|
|
if let Some(eax_bit) = patch.eax_bit {
|
|
entry.eax |= 1 << eax_bit;
|
|
}
|
|
if let Some(ebx_bit) = patch.ebx_bit {
|
|
entry.ebx |= 1 << ebx_bit;
|
|
}
|
|
if let Some(ecx_bit) = patch.ecx_bit {
|
|
entry.ecx |= 1 << ecx_bit;
|
|
}
|
|
if let Some(edx_bit) = patch.edx_bit {
|
|
entry.edx |= 1 << edx_bit;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn is_feature_enabled(
|
|
cpuid: &[CpuIdEntry],
|
|
function: u32,
|
|
index: u32,
|
|
reg: CpuidReg,
|
|
feature_bit: usize,
|
|
) -> bool {
|
|
let mask = 1 << feature_bit;
|
|
|
|
for entry in cpuid {
|
|
if entry.function == function && entry.index == index {
|
|
let reg_val = match reg {
|
|
CpuidReg::EAX => entry.eax,
|
|
CpuidReg::EBX => entry.ebx,
|
|
CpuidReg::ECX => entry.ecx,
|
|
CpuidReg::EDX => entry.edx,
|
|
};
|
|
|
|
return (reg_val & mask) == mask;
|
|
}
|
|
}
|
|
|
|
false
|
|
}
|
|
}
|
|
|
|
/// How a source VM CPUID register value is compared against the destination's.
#[derive(Debug)]
enum CpuidCompatibleCheck {
    /// The source value must be a bitwise subset of the destination value.
    BitwiseSubset,
    /// Both values must be equal.
    Equal,
    /// The source value, read as a number, must be smaller or equal.
    NumNotGreater,
}
|
|
|
|
pub struct CpuidFeatureEntry {
|
|
function: u32,
|
|
index: u32,
|
|
feature_reg: CpuidReg,
|
|
compatible_check: CpuidCompatibleCheck,
|
|
}
|
|
|
|
impl CpuidFeatureEntry {
|
|
fn checked_feature_entry_list() -> Vec<CpuidFeatureEntry> {
|
|
vec![
|
|
// The following list includes all hardware features bits from
|
|
// the CPUID Wiki Page: https://en.wikipedia.org/wiki/CPUID
|
|
// Leaf 0x1, ECX/EDX, feature bits
|
|
CpuidFeatureEntry {
|
|
function: 1,
|
|
index: 0,
|
|
feature_reg: CpuidReg::ECX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 1,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EDX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
// Leaf 0x7, EAX/EBX/ECX/EDX, extended features
|
|
CpuidFeatureEntry {
|
|
function: 7,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EAX,
|
|
compatible_check: CpuidCompatibleCheck::NumNotGreater,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 7,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EBX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 7,
|
|
index: 0,
|
|
feature_reg: CpuidReg::ECX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 7,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EDX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
// Leaf 0x7 subleaf 0x1, EAX, extended features
|
|
CpuidFeatureEntry {
|
|
function: 7,
|
|
index: 1,
|
|
feature_reg: CpuidReg::EAX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
// Leaf 0x8000_0001, ECX/EDX, CPUID features bits
|
|
CpuidFeatureEntry {
|
|
function: 0x8000_0001,
|
|
index: 0,
|
|
feature_reg: CpuidReg::ECX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 0x8000_0001,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EDX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
// KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/x86/cpuid.html
|
|
// Leaf 0x4000_0000, EAX/EBX/ECX/EDX, KVM CPUID SIGNATURE
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0000,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EAX,
|
|
compatible_check: CpuidCompatibleCheck::NumNotGreater,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0000,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EBX,
|
|
compatible_check: CpuidCompatibleCheck::Equal,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0000,
|
|
index: 0,
|
|
feature_reg: CpuidReg::ECX,
|
|
compatible_check: CpuidCompatibleCheck::Equal,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0000,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EDX,
|
|
compatible_check: CpuidCompatibleCheck::Equal,
|
|
},
|
|
// Leaf 0x4000_0001, EAX/EBX/ECX/EDX, KVM CPUID features
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0001,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EAX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0001,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EBX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0001,
|
|
index: 0,
|
|
feature_reg: CpuidReg::ECX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
CpuidFeatureEntry {
|
|
function: 0x4000_0001,
|
|
index: 0,
|
|
feature_reg: CpuidReg::EDX,
|
|
compatible_check: CpuidCompatibleCheck::BitwiseSubset,
|
|
},
|
|
]
|
|
}
|
|
|
|
fn get_features_from_cpuid(
|
|
cpuid: &[CpuIdEntry],
|
|
feature_entry_list: &[CpuidFeatureEntry],
|
|
) -> Vec<u32> {
|
|
let mut features = vec![0; feature_entry_list.len()];
|
|
for (i, feature_entry) in feature_entry_list.iter().enumerate() {
|
|
for cpuid_entry in cpuid {
|
|
if cpuid_entry.function == feature_entry.function
|
|
&& cpuid_entry.index == feature_entry.index
|
|
{
|
|
match feature_entry.feature_reg {
|
|
CpuidReg::EAX => {
|
|
features[i] = cpuid_entry.eax;
|
|
}
|
|
CpuidReg::EBX => {
|
|
features[i] = cpuid_entry.ebx;
|
|
}
|
|
CpuidReg::ECX => {
|
|
features[i] = cpuid_entry.ecx;
|
|
}
|
|
CpuidReg::EDX => {
|
|
features[i] = cpuid_entry.edx;
|
|
}
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
features
|
|
}
|
|
|
|
// The function returns `Error` (a.k.a. "incompatible"), when the CPUID features from `src_vm_cpuid`
|
|
// is not a subset of those of the `dest_vm_cpuid`.
|
|
pub fn check_cpuid_compatibility(
|
|
src_vm_cpuid: &[CpuIdEntry],
|
|
dest_vm_cpuid: &[CpuIdEntry],
|
|
) -> Result<(), Error> {
|
|
let feature_entry_list = &Self::checked_feature_entry_list();
|
|
let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list);
|
|
let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list);
|
|
|
|
// Loop on feature bit and check if the 'source vm' feature is a subset
|
|
// of those of the 'destination vm' feature
|
|
let mut compatible = true;
|
|
for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features
|
|
.iter()
|
|
.zip(dest_vm_features.iter())
|
|
.enumerate()
|
|
{
|
|
let entry = &feature_entry_list[i];
|
|
let entry_compatible = match entry.compatible_check {
|
|
CpuidCompatibleCheck::BitwiseSubset => {
|
|
let different_feature_bits = src_vm_feature ^ dest_vm_feature;
|
|
let src_vm_feature_bits_only = different_feature_bits & src_vm_feature;
|
|
src_vm_feature_bits_only == 0
|
|
}
|
|
CpuidCompatibleCheck::Equal => src_vm_feature == dest_vm_feature,
|
|
CpuidCompatibleCheck::NumNotGreater => src_vm_feature <= dest_vm_feature,
|
|
};
|
|
if !entry_compatible {
|
|
error!(
|
|
"Detected incompatible CPUID entry: leaf={:#02x} (subleaf={:#02x}), register='{:?}', \
|
|
compatible_check='{:?}', source VM feature='{:#04x}', destination VM feature'{:#04x}'.",
|
|
entry.function,
|
|
entry.index,
|
|
entry.feature_reg,
|
|
entry.compatible_check,
|
|
src_vm_feature,
|
|
dest_vm_feature
|
|
);
|
|
|
|
compatible = false;
|
|
}
|
|
}
|
|
|
|
if compatible {
|
|
info!("No CPU incompatibility detected.");
|
|
Ok(())
|
|
} else {
|
|
Err(Error::CpuidCheckCompatibility)
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn generate_common_cpuid(
|
|
hypervisor: &dyn hypervisor::Hypervisor,
|
|
config: &CpuidConfig,
|
|
) -> super::Result<Vec<CpuIdEntry>> {
|
|
// SAFETY: cpuid called with valid leaves
|
|
if unsafe { x86_64::__cpuid(1) }.ecx & (1 << HYPERVISOR_ECX_BIT) == 1 << HYPERVISOR_ECX_BIT {
|
|
// SAFETY: cpuid called with valid leaves
|
|
let hypervisor_cpuid = unsafe { x86_64::__cpuid(0x4000_0000) };
|
|
|
|
let mut identifier: [u8; 12] = [0; 12];
|
|
identifier[0..4].copy_from_slice(&hypervisor_cpuid.ebx.to_le_bytes()[..]);
|
|
identifier[4..8].copy_from_slice(&hypervisor_cpuid.ecx.to_le_bytes()[..]);
|
|
identifier[8..12].copy_from_slice(&hypervisor_cpuid.edx.to_le_bytes()[..]);
|
|
|
|
info!(
|
|
"Running under nested virtualisation. Hypervisor string: {}",
|
|
String::from_utf8_lossy(&identifier)
|
|
);
|
|
}
|
|
|
|
info!(
|
|
"Generating guest CPUID for with physical address size: {}",
|
|
config.phys_bits
|
|
);
|
|
#[allow(unused_mut)]
|
|
let mut cpuid_patches = vec![
|
|
// Patch hypervisor bit
|
|
CpuidPatch {
|
|
function: 1,
|
|
index: 0,
|
|
flags_bit: None,
|
|
eax_bit: None,
|
|
ebx_bit: None,
|
|
ecx_bit: Some(HYPERVISOR_ECX_BIT),
|
|
edx_bit: None,
|
|
},
|
|
// Enable MTRR feature
|
|
CpuidPatch {
|
|
function: 1,
|
|
index: 0,
|
|
flags_bit: None,
|
|
eax_bit: None,
|
|
ebx_bit: None,
|
|
ecx_bit: None,
|
|
edx_bit: Some(MTRR_EDX_BIT),
|
|
},
|
|
];
|
|
|
|
#[cfg(feature = "kvm")]
|
|
if matches!(
|
|
hypervisor.hypervisor_type(),
|
|
hypervisor::HypervisorType::Kvm
|
|
) {
|
|
// Patch tsc deadline timer bit
|
|
cpuid_patches.push(CpuidPatch {
|
|
function: 1,
|
|
index: 0,
|
|
flags_bit: None,
|
|
eax_bit: None,
|
|
ebx_bit: None,
|
|
ecx_bit: Some(TSC_DEADLINE_TIMER_ECX_BIT),
|
|
edx_bit: None,
|
|
});
|
|
}
|
|
|
|
// Supported CPUID
|
|
let mut cpuid = hypervisor
|
|
.get_supported_cpuid()
|
|
.map_err(Error::CpuidGetSupported)?;
|
|
|
|
CpuidPatch::patch_cpuid(&mut cpuid, &cpuid_patches);
|
|
|
|
#[cfg(feature = "tdx")]
|
|
let tdx_capabilities = if config.tdx {
|
|
let caps = hypervisor
|
|
.tdx_capabilities()
|
|
.map_err(Error::TdxCapabilities)?;
|
|
info!("TDX capabilities {caps:#?}");
|
|
Some(caps)
|
|
} else {
|
|
None
|
|
};
|
|
|
|
// Update some existing CPUID
|
|
for entry in cpuid.as_mut_slice().iter_mut() {
|
|
match entry.function {
|
|
// Clear AMX related bits if the AMX feature is not enabled
|
|
0x7 => {
|
|
if !config.amx && entry.index == 0 {
|
|
entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8));
|
|
}
|
|
}
|
|
0xd =>
|
|
{
|
|
#[cfg(feature = "tdx")]
|
|
if let Some(caps) = &tdx_capabilities {
|
|
let xcr0_mask: u64 = 0x82ff;
|
|
let xss_mask: u64 = !xcr0_mask;
|
|
if entry.index == 0 {
|
|
entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32);
|
|
entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32);
|
|
entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32;
|
|
entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32;
|
|
} else if entry.index == 1 {
|
|
entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32);
|
|
entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32);
|
|
entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32;
|
|
entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32;
|
|
}
|
|
}
|
|
}
|
|
// Copy host L1 cache details if not populated by KVM
|
|
0x8000_0005 => {
|
|
if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
|
|
// SAFETY: cpuid called with valid leaves
|
|
if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 {
|
|
// SAFETY: cpuid called with valid leaves
|
|
let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) };
|
|
entry.eax = leaf.eax;
|
|
entry.ebx = leaf.ebx;
|
|
entry.ecx = leaf.ecx;
|
|
entry.edx = leaf.edx;
|
|
}
|
|
}
|
|
}
|
|
// Copy host L2 cache details if not populated by KVM
|
|
0x8000_0006 => {
|
|
if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
|
|
// SAFETY: cpuid called with valid leaves
|
|
if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 {
|
|
// SAFETY: cpuid called with valid leaves
|
|
let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) };
|
|
entry.eax = leaf.eax;
|
|
entry.ebx = leaf.ebx;
|
|
entry.ecx = leaf.ecx;
|
|
entry.edx = leaf.edx;
|
|
}
|
|
}
|
|
}
|
|
// Set CPU physical bits
|
|
0x8000_0008 => {
|
|
entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff);
|
|
}
|
|
0x4000_0001 => {
|
|
// Enable KVM_FEATURE_MSI_EXT_DEST_ID. This allows the guest to target
|
|
// device interrupts to cpus with APIC IDs > 254 without interrupt remapping.
|
|
entry.eax |= 1 << KVM_FEATURE_MSI_EXT_DEST_ID;
|
|
|
|
// These features are not supported by TDX
|
|
#[cfg(feature = "tdx")]
|
|
if config.tdx {
|
|
entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT)
|
|
| (1 << KVM_FEATURE_CLOCKSOURCE2_BIT)
|
|
| (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)
|
|
| (1 << KVM_FEATURE_ASYNC_PF_BIT)
|
|
| (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT)
|
|
| (1 << KVM_FEATURE_STEAL_TIME_BIT));
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
// Copy CPU identification string
|
|
for i in 0x8000_0002..=0x8000_0004 {
|
|
cpuid.retain(|c| c.function != i);
|
|
// SAFETY: call cpuid with valid leaves
|
|
let leaf = unsafe { std::arch::x86_64::__cpuid(i) };
|
|
cpuid.push(CpuIdEntry {
|
|
function: i,
|
|
eax: leaf.eax,
|
|
ebx: leaf.ebx,
|
|
ecx: leaf.ecx,
|
|
edx: leaf.edx,
|
|
..Default::default()
|
|
});
|
|
}
|
|
|
|
if config.kvm_hyperv {
|
|
// Remove conflicting entries
|
|
cpuid.retain(|c| c.function != 0x4000_0000);
|
|
cpuid.retain(|c| c.function != 0x4000_0001);
|
|
// See "Hypervisor Top Level Functional Specification" for details
|
|
// Compliance with "Hv#1" requires leaves up to 0x4000_000a
|
|
cpuid.push(CpuIdEntry {
|
|
function: 0x40000000,
|
|
eax: 0x4000000a, // Maximum cpuid leaf
|
|
ebx: 0x756e694c, // "Linu"
|
|
ecx: 0x564b2078, // "x KV"
|
|
edx: 0x7648204d, // "M Hv"
|
|
..Default::default()
|
|
});
|
|
cpuid.push(CpuIdEntry {
|
|
function: 0x40000001,
|
|
eax: 0x31237648, // "Hv#1"
|
|
..Default::default()
|
|
});
|
|
cpuid.push(CpuIdEntry {
|
|
function: 0x40000002,
|
|
eax: 0x3839, // "Build number"
|
|
ebx: 0xa0000, // "Version"
|
|
..Default::default()
|
|
});
|
|
cpuid.push(CpuIdEntry {
|
|
function: 0x4000_0003,
|
|
eax: (1 << 1) // AccessPartitionReferenceCounter
|
|
| (1 << 2) // AccessSynicRegs
|
|
| (1 << 3) // AccessSyntheticTimerRegs
|
|
| (1 << 9), // AccessPartitionReferenceTsc
|
|
edx: 1 << 3, // CPU dynamic partitioning
|
|
..Default::default()
|
|
});
|
|
cpuid.push(CpuIdEntry {
|
|
function: 0x4000_0004,
|
|
eax: 1 << 5, // Recommend relaxed timing
|
|
..Default::default()
|
|
});
|
|
for i in 0x4000_0005..=0x4000_000a {
|
|
cpuid.push(CpuIdEntry {
|
|
function: i,
|
|
..Default::default()
|
|
});
|
|
}
|
|
}
|
|
|
|
Ok(cpuid)
|
|
}
|
|
|
|
pub fn configure_vcpu(
|
|
vcpu: &dyn hypervisor::Vcpu,
|
|
id: u32,
|
|
boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
|
|
cpuid: Vec<CpuIdEntry>,
|
|
kvm_hyperv: bool,
|
|
cpu_vendor: CpuVendor,
|
|
topology: (u16, u16, u16, u16),
|
|
) -> super::Result<()> {
|
|
let x2apic_id = get_x2apic_id(id, Some(topology));
|
|
|
|
// Per vCPU CPUID changes; common are handled via generate_common_cpuid()
|
|
let mut cpuid = cpuid;
|
|
CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, x2apic_id);
|
|
CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, x2apic_id);
|
|
if matches!(cpu_vendor, CpuVendor::AMD) {
|
|
CpuidPatch::set_cpuid_reg(&mut cpuid, 0x8000_001e, Some(0), CpuidReg::EAX, x2apic_id);
|
|
}
|
|
|
|
// Set ApicId in cpuid for each vcpu - found in cpuid ebx when eax = 1
|
|
let mut apic_id_patched = false;
|
|
for entry in &mut cpuid {
|
|
if entry.function == 1 {
|
|
entry.ebx &= 0xffffff;
|
|
entry.ebx |= x2apic_id << 24;
|
|
apic_id_patched = true;
|
|
break;
|
|
}
|
|
}
|
|
assert!(apic_id_patched);
|
|
|
|
update_cpuid_topology(
|
|
&mut cpuid, topology.0, topology.1, topology.2, topology.3, cpu_vendor, id,
|
|
);
|
|
|
|
// The TSC frequency CPUID leaf should not be included when running with HyperV emulation
|
|
if !kvm_hyperv && let Some(tsc_khz) = vcpu.tsc_khz().map_err(Error::GetTscFrequency)? {
|
|
// Need to check that the TSC doesn't vary with dynamic frequency
|
|
// SAFETY: cpuid called with valid leaves
|
|
if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx & (1u32 << INVARIANT_TSC_EDX_BIT)
|
|
> 0
|
|
{
|
|
CpuidPatch::set_cpuid_reg(&mut cpuid, 0x4000_0000, None, CpuidReg::EAX, 0x4000_0010);
|
|
cpuid.retain(|c| c.function != 0x4000_0010);
|
|
cpuid.push(CpuIdEntry {
|
|
function: 0x4000_0010,
|
|
eax: tsc_khz,
|
|
ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's
|
|
* APIC_BUS_CYCLE_NS */
|
|
..Default::default()
|
|
});
|
|
}
|
|
}
|
|
|
|
for c in &cpuid {
|
|
debug!("{c}");
|
|
}
|
|
|
|
vcpu.set_cpuid2(&cpuid)
|
|
.map_err(|e| Error::SetSupportedCpusFailed(e.into()))?;
|
|
|
|
if kvm_hyperv {
|
|
vcpu.enable_hyperv_synic().unwrap();
|
|
}
|
|
|
|
regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?;
|
|
if let Some((kernel_entry_point, guest_memory)) = boot_setup {
|
|
regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?;
|
|
regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?;
|
|
|
|
// CPUs are required (by Intel sdm spec) to boot in x2apic mode if any
|
|
// of the apic IDs is larger than 255. Experimentally, the Linux kernel
|
|
// does not recognize the last vCPU if x2apic is not enabled when
|
|
// there are 256 vCPUs in a flat hierarchy (i.e. max x2apic ID is 255),
|
|
// so we need to enable x2apic in this case as well.
|
|
let enable_x2_apic_mode = get_max_x2apic_id(topology) > MAX_SUPPORTED_CPUS_LEGACY;
|
|
regs::setup_sregs(&guest_memory.memory(), vcpu, enable_x2_apic_mode)
|
|
.map_err(Error::SregsConfiguration)?;
|
|
}
|
|
interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Returns a Vec of the valid memory addresses.
|
|
///
|
|
/// These should be used to configure the GuestMemory structure for the platform.
|
|
/// For x86_64 all addresses are valid from the start of the kernel except a
|
|
/// carve out at the end of 32bit address space.
|
|
pub fn arch_memory_regions() -> Vec<(GuestAddress, usize, RegionType)> {
|
|
vec![
|
|
// 0 GiB ~ 3GiB: memory before the gap
|
|
(
|
|
GuestAddress(0),
|
|
layout::MEM_32BIT_RESERVED_START.raw_value() as usize,
|
|
RegionType::Ram,
|
|
),
|
|
// 4 GiB ~ inf: memory after the gap
|
|
(layout::RAM_64BIT_START, usize::MAX, RegionType::Ram),
|
|
// 3 GiB ~ 3712 MiB: 32-bit device memory hole
|
|
(
|
|
layout::MEM_32BIT_RESERVED_START,
|
|
layout::MEM_32BIT_DEVICES_SIZE as usize,
|
|
RegionType::SubRegion,
|
|
),
|
|
// 3712 MiB ~ 3968 MiB: 32-bit reserved memory hole
|
|
(
|
|
layout::MEM_32BIT_RESERVED_START.unchecked_add(layout::MEM_32BIT_DEVICES_SIZE),
|
|
(layout::MEM_32BIT_RESERVED_SIZE - layout::MEM_32BIT_DEVICES_SIZE) as usize,
|
|
RegionType::Reserved,
|
|
),
|
|
]
|
|
}
|
|
|
|
/// Configures the system and should be called once per vm before starting vcpu threads.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `guest_mem` - The memory to be used by the guest.
|
|
/// * `cmdline_addr` - Address in `guest_mem` where the kernel command line was loaded.
|
|
/// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator.
|
|
/// * `num_cpus` - Number of virtual CPUs the guest will have.
|
|
#[allow(clippy::too_many_arguments)]
|
|
pub fn configure_system(
|
|
guest_mem: &GuestMemoryMmap,
|
|
cmdline_addr: GuestAddress,
|
|
cmdline_size: usize,
|
|
initramfs: &Option<InitramfsConfig>,
|
|
_num_cpus: u32,
|
|
setup_header: Option<setup_header>,
|
|
rsdp_addr: Option<GuestAddress>,
|
|
serial_number: Option<&str>,
|
|
uuid: Option<&str>,
|
|
oem_strings: Option<&[&str]>,
|
|
topology: Option<(u16, u16, u16, u16)>,
|
|
) -> super::Result<()> {
|
|
// Write EBDA address to location where ACPICA expects to find it
|
|
guest_mem
|
|
.write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER)
|
|
.map_err(Error::EbdaSetup)?;
|
|
|
|
let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings)
|
|
.map_err(Error::SmbiosSetup)?;
|
|
|
|
// Place the MP table after the SMIOS table aligned to 16 bytes
|
|
let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size);
|
|
let offset = GuestAddress((offset.0 + 16) & !0xf);
|
|
mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?;
|
|
|
|
// Check that the RAM is not smaller than the RSDP start address
|
|
if let Some(rsdp_addr) = rsdp_addr
|
|
&& rsdp_addr.0 > guest_mem.last_addr().0
|
|
{
|
|
return Err(super::Error::RsdpPastRamEnd);
|
|
}
|
|
|
|
match setup_header {
|
|
Some(hdr) => configure_32bit_entry(
|
|
guest_mem,
|
|
cmdline_addr,
|
|
cmdline_size,
|
|
initramfs,
|
|
hdr,
|
|
rsdp_addr,
|
|
),
|
|
None => configure_pvh(guest_mem, cmdline_addr, initramfs, rsdp_addr),
|
|
}
|
|
}
|
|
|
|
/// A range of guest RAM as `(start address, end address)`.
type RamRange = (u64, u64);
|
|
|
|
/// Returns usable physical memory ranges for the guest
|
|
/// These should be used to create e820_RAM memory maps
|
|
pub fn generate_ram_ranges(guest_mem: &GuestMemoryMmap) -> super::Result<Vec<RamRange>> {
|
|
// Merge continuous memory regions into one region.
|
|
// Note: memory regions from "GuestMemory" are sorted and non-zero sized.
|
|
let ram_regions = {
|
|
let mut ram_regions = Vec::new();
|
|
let mut current_start = guest_mem
|
|
.iter()
|
|
.next()
|
|
.map(GuestMemoryRegion::start_addr)
|
|
.expect("GuestMemory must have one memory region at least")
|
|
.raw_value();
|
|
let mut current_end = current_start;
|
|
|
|
for (start, size) in guest_mem
|
|
.iter()
|
|
.map(|m| (m.start_addr().raw_value(), m.len()))
|
|
{
|
|
if current_end == start {
|
|
// This zone is continuous with the previous one.
|
|
current_end += size;
|
|
} else {
|
|
ram_regions.push((current_start, current_end));
|
|
|
|
current_start = start;
|
|
current_end = start + size;
|
|
}
|
|
}
|
|
|
|
ram_regions.push((current_start, current_end));
|
|
|
|
ram_regions
|
|
};
|
|
|
|
// Create the memory map entry for memory region before the gap
|
|
let mut ram_ranges = vec![];
|
|
|
|
// Generate the first usable physical memory range before the gap. The e820 map
|
|
// should only report memory above 1MiB.
|
|
let first_ram_range = {
|
|
let (first_region_start, first_region_end) =
|
|
ram_regions.first().ok_or(super::Error::MemmapTableSetup)?;
|
|
let high_ram_start = layout::HIGH_RAM_START.raw_value();
|
|
let mem_32bit_reserved_start = layout::MEM_32BIT_RESERVED_START.raw_value();
|
|
|
|
if !((first_region_start <= &high_ram_start)
|
|
&& (first_region_end > &high_ram_start)
|
|
&& (first_region_end <= &mem_32bit_reserved_start))
|
|
{
|
|
error!(
|
|
"Unexpected first memory region layout: (start: 0x{first_region_start:08x}, end: 0x{first_region_end:08x}).
|
|
high_ram_start: 0x{high_ram_start:08x}, mem_32bit_reserved_start: 0x{mem_32bit_reserved_start:08x}"
|
|
);
|
|
|
|
return Err(super::Error::MemmapTableSetup);
|
|
}
|
|
|
|
info!(
|
|
"first usable physical memory range, start: 0x{high_ram_start:08x}, end: 0x{first_region_end:08x}"
|
|
);
|
|
|
|
(high_ram_start, *first_region_end)
|
|
};
|
|
ram_ranges.push(first_ram_range);
|
|
|
|
// Generate additional usable physical memory range after the gap if any.
|
|
for ram_region in ram_regions.iter().skip(1) {
|
|
info!(
|
|
"found usable physical memory range, start: 0x{:08x}, end: 0x{:08x}",
|
|
ram_region.0, ram_region.1
|
|
);
|
|
|
|
ram_ranges.push(*ram_region);
|
|
}
|
|
|
|
Ok(ram_ranges)
|
|
}
|
|
|
|
fn configure_pvh(
|
|
guest_mem: &GuestMemoryMmap,
|
|
cmdline_addr: GuestAddress,
|
|
initramfs: &Option<InitramfsConfig>,
|
|
rsdp_addr: Option<GuestAddress>,
|
|
) -> super::Result<()> {
|
|
const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336ec578;
|
|
|
|
let mut start_info = hvm_start_info {
|
|
magic: XEN_HVM_START_MAGIC_VALUE,
|
|
version: 1, // pvh has version 1
|
|
nr_modules: 0,
|
|
cmdline_paddr: cmdline_addr.raw_value(),
|
|
memmap_paddr: layout::MEMMAP_START.raw_value(),
|
|
..Default::default()
|
|
};
|
|
|
|
if let Some(rsdp_addr) = rsdp_addr {
|
|
start_info.rsdp_paddr = rsdp_addr.0;
|
|
}
|
|
|
|
if let Some(initramfs_config) = initramfs {
|
|
// The initramfs has been written to guest memory already, here we just need to
|
|
// create the module structure that describes it.
|
|
let ramdisk_mod = hvm_modlist_entry {
|
|
paddr: initramfs_config.address.raw_value(),
|
|
size: initramfs_config.size as u64,
|
|
..Default::default()
|
|
};
|
|
|
|
start_info.nr_modules += 1;
|
|
start_info.modlist_paddr = layout::MODLIST_START.raw_value();
|
|
|
|
// Write the modlist struct to guest memory.
|
|
guest_mem
|
|
.write_obj(ramdisk_mod, layout::MODLIST_START)
|
|
.map_err(super::Error::ModlistSetup)?;
|
|
}
|
|
|
|
// Vector to hold the memory maps which needs to be written to guest memory
|
|
// at MEMMAP_START after all of the mappings are recorded.
|
|
let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();
|
|
|
|
// Create the memory map entries.
|
|
add_memmap_entry(&mut memmap, 0, layout::EBDA_START.raw_value(), E820_RAM);
|
|
|
|
// Get usable physical memory ranges
|
|
let ram_ranges = generate_ram_ranges(guest_mem)?;
|
|
|
|
// Create e820 memory map entries
|
|
for ram_range in ram_ranges {
|
|
info!(
|
|
"create_memmap_entry, start: 0x{:08x}, end: 0x{:08x}",
|
|
ram_range.0, ram_range.1
|
|
);
|
|
add_memmap_entry(
|
|
&mut memmap,
|
|
ram_range.0,
|
|
ram_range.1 - ram_range.0,
|
|
E820_RAM,
|
|
);
|
|
}
|
|
|
|
add_memmap_entry(
|
|
&mut memmap,
|
|
layout::PCI_MMCONFIG_START.0,
|
|
layout::PCI_MMCONFIG_SIZE,
|
|
E820_RESERVED,
|
|
);
|
|
|
|
start_info.memmap_entries = memmap.len() as u32;
|
|
|
|
// Copy the vector with the memmap table to the MEMMAP_START address
|
|
// which is already saved in the memmap_paddr field of hvm_start_info struct.
|
|
let mut memmap_start_addr = layout::MEMMAP_START;
|
|
|
|
guest_mem
|
|
.checked_offset(
|
|
memmap_start_addr,
|
|
mem::size_of::<hvm_memmap_table_entry>() * start_info.memmap_entries as usize,
|
|
)
|
|
.ok_or(super::Error::MemmapTablePastRamEnd)?;
|
|
|
|
// For every entry in the memmap vector, write it to guest memory.
|
|
for memmap_entry in memmap {
|
|
guest_mem
|
|
.write_obj(memmap_entry, memmap_start_addr)
|
|
.map_err(|_| super::Error::MemmapTableSetup)?;
|
|
memmap_start_addr =
|
|
memmap_start_addr.unchecked_add(mem::size_of::<hvm_memmap_table_entry>() as u64);
|
|
}
|
|
|
|
// The hvm_start_info struct itself must be stored at PVH_START_INFO
|
|
// address, and %rbx will be initialized to contain PVH_INFO_START prior to
|
|
// starting the guest, as required by the PVH ABI.
|
|
let start_info_addr = layout::PVH_INFO_START;
|
|
|
|
guest_mem
|
|
.checked_offset(start_info_addr, mem::size_of::<hvm_start_info>())
|
|
.ok_or(super::Error::StartInfoPastRamEnd)?;
|
|
|
|
// Write the start_info struct to guest memory.
|
|
guest_mem
|
|
.write_obj(start_info, start_info_addr)
|
|
.map_err(|_| super::Error::StartInfoSetup)?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn configure_32bit_entry(
|
|
guest_mem: &GuestMemoryMmap,
|
|
cmdline_addr: GuestAddress,
|
|
cmdline_size: usize,
|
|
initramfs: &Option<InitramfsConfig>,
|
|
setup_hdr: setup_header,
|
|
rsdp_addr: Option<GuestAddress>,
|
|
) -> super::Result<()> {
|
|
const KERNEL_LOADER_OTHER: u8 = 0xff;
|
|
|
|
// Use the provided setup header
|
|
let mut params = boot_params {
|
|
hdr: setup_hdr,
|
|
..Default::default()
|
|
};
|
|
|
|
// Common bootparams settings
|
|
if params.hdr.type_of_loader == 0 {
|
|
params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
|
|
}
|
|
params.hdr.cmd_line_ptr = cmdline_addr.raw_value() as u32;
|
|
params.hdr.cmdline_size = cmdline_size as u32;
|
|
|
|
if let Some(initramfs_config) = initramfs {
|
|
params.hdr.ramdisk_image = initramfs_config.address.raw_value() as u32;
|
|
params.hdr.ramdisk_size = initramfs_config.size as u32;
|
|
}
|
|
|
|
add_e820_entry(&mut params, 0, layout::EBDA_START.raw_value(), E820_RAM)?;
|
|
|
|
let mem_end = guest_mem.last_addr();
|
|
if mem_end < layout::MEM_32BIT_RESERVED_START {
|
|
add_e820_entry(
|
|
&mut params,
|
|
layout::HIGH_RAM_START.raw_value(),
|
|
mem_end.unchecked_offset_from(layout::HIGH_RAM_START) + 1,
|
|
E820_RAM,
|
|
)?;
|
|
} else {
|
|
add_e820_entry(
|
|
&mut params,
|
|
layout::HIGH_RAM_START.raw_value(),
|
|
layout::MEM_32BIT_RESERVED_START.unchecked_offset_from(layout::HIGH_RAM_START),
|
|
E820_RAM,
|
|
)?;
|
|
if mem_end > layout::RAM_64BIT_START {
|
|
add_e820_entry(
|
|
&mut params,
|
|
layout::RAM_64BIT_START.raw_value(),
|
|
mem_end.unchecked_offset_from(layout::RAM_64BIT_START) + 1,
|
|
E820_RAM,
|
|
)?;
|
|
}
|
|
}
|
|
|
|
add_e820_entry(
|
|
&mut params,
|
|
layout::PCI_MMCONFIG_START.0,
|
|
layout::PCI_MMCONFIG_SIZE,
|
|
E820_RESERVED,
|
|
)?;
|
|
|
|
if let Some(rsdp_addr) = rsdp_addr {
|
|
params.acpi_rsdp_addr = rsdp_addr.0;
|
|
}
|
|
|
|
let zero_page_addr = layout::ZERO_PAGE_START;
|
|
guest_mem
|
|
.checked_offset(zero_page_addr, mem::size_of::<boot_params>())
|
|
.ok_or(super::Error::ZeroPagePastRamEnd)?;
|
|
guest_mem
|
|
.write_obj(params, zero_page_addr)
|
|
.map_err(super::Error::ZeroPageSetup)?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Add an e820 region to the e820 map.
|
|
/// Returns Ok(()) if successful, or an error if there is no space left in the map.
|
|
fn add_e820_entry(
|
|
params: &mut boot_params,
|
|
addr: u64,
|
|
size: u64,
|
|
mem_type: u32,
|
|
) -> Result<(), Error> {
|
|
if params.e820_entries >= params.e820_table.len() as u8 {
|
|
return Err(Error::E820Configuration);
|
|
}
|
|
|
|
params.e820_table[params.e820_entries as usize].addr = addr;
|
|
params.e820_table[params.e820_entries as usize].size = size;
|
|
params.e820_table[params.e820_entries as usize].type_ = mem_type;
|
|
params.e820_entries += 1;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Appends a memory map table entry describing `size` bytes at `addr` of type `mem_type`
/// to `memmap`. The `reserved` field of the entry is always zero.
fn add_memmap_entry(memmap: &mut Vec<hvm_memmap_table_entry>, addr: u64, size: u64, mem_type: u32) {
    let entry = hvm_memmap_table_entry {
        addr,
        size,
        type_: mem_type,
        reserved: 0,
    };
    memmap.push(entry);
}
|
|
|
|
/// Returns the memory address where the initramfs could be loaded.
|
|
pub fn initramfs_load_addr(
|
|
guest_mem: &GuestMemoryMmap,
|
|
initramfs_size: usize,
|
|
) -> super::Result<u64> {
|
|
let first_region = guest_mem
|
|
.find_region(GuestAddress::new(0))
|
|
.ok_or(super::Error::InitramfsAddress)?;
|
|
// It's safe to cast to usize because the size of a region can't be greater than usize.
|
|
let lowmem_size = first_region.len() as usize;
|
|
|
|
if lowmem_size < initramfs_size {
|
|
return Err(super::Error::InitramfsAddress);
|
|
}
|
|
|
|
let aligned_addr: u64 = ((lowmem_size - initramfs_size) & !(crate::pagesize() - 1)) as u64;
|
|
Ok(aligned_addr)
|
|
}
|
|
|
|
pub fn get_host_cpu_phys_bits(hypervisor: &dyn hypervisor::Hypervisor) -> u8 {
|
|
// SAFETY: call cpuid with valid leaves
|
|
unsafe {
|
|
let leaf = x86_64::__cpuid(0x8000_0000);
|
|
|
|
// Detect and handle AMD SME (Secure Memory Encryption) properly.
|
|
// Some physical address bits may become reserved when the feature is enabled.
|
|
// See AMD64 Architecture Programmer's Manual Volume 2, Section 7.10.1
|
|
let reduced = if leaf.eax >= 0x8000_001f
|
|
&& matches!(hypervisor.get_cpu_vendor(), CpuVendor::AMD)
|
|
&& x86_64::__cpuid(0x8000_001f).eax & 0x1 != 0
|
|
{
|
|
(x86_64::__cpuid(0x8000_001f).ebx >> 6) & 0x3f
|
|
} else {
|
|
0
|
|
};
|
|
|
|
if leaf.eax >= 0x8000_0008 {
|
|
let leaf = x86_64::__cpuid(0x8000_0008);
|
|
((leaf.eax & 0xff) - reduced) as u8
|
|
} else {
|
|
36
|
|
}
|
|
}
|
|
}
|
|
|
|
fn update_cpuid_topology(
|
|
cpuid: &mut Vec<CpuIdEntry>,
|
|
threads_per_core: u16,
|
|
cores_per_die: u16,
|
|
dies_per_package: u16,
|
|
packages: u16,
|
|
cpu_vendor: CpuVendor,
|
|
id: u32,
|
|
) {
|
|
let x2apic_id = get_x2apic_id(
|
|
id,
|
|
Some((threads_per_core, cores_per_die, dies_per_package, packages)),
|
|
);
|
|
|
|
// Note: the topology defined here is per "package" (~NUMA node).
|
|
let thread_width = u16::BITS - (threads_per_core - 1).leading_zeros();
|
|
let core_width = u16::BITS - (cores_per_die - 1).leading_zeros() + thread_width;
|
|
let die_width = u16::BITS - (dies_per_package - 1).leading_zeros() + core_width;
|
|
|
|
// The very old way: a flat number of logical CPUs per package: CPUID.1H:EBX[23:16] bits.
|
|
let core_count = dies_per_package as u32 * cores_per_die as u32 * threads_per_core as u32;
|
|
let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0);
|
|
cpu_ebx &= !(0xff << 16);
|
|
cpu_ebx |= (core_count & 0xff) << 16;
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX, cpu_ebx);
|
|
|
|
let mut cpu_edx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX).unwrap_or(0);
|
|
cpu_edx |= 1 << 28;
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx);
|
|
|
|
// The legacy way: threads+cores per package.
|
|
// CPU Topology leaf 0xb
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0xb,
|
|
Some(0),
|
|
CpuidReg::EBX,
|
|
u32::from(threads_per_core),
|
|
);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::ECX, 1 << 8);
|
|
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EAX, die_width);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0xb,
|
|
Some(1),
|
|
CpuidReg::EBX,
|
|
u32::from(dies_per_package * cores_per_die * threads_per_core),
|
|
);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EDX, x2apic_id);
|
|
|
|
// The modern way: many-level hierarchy (but we here only support four levels).
|
|
// CPU Topology leaf 0x1f
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x1f,
|
|
Some(0),
|
|
CpuidReg::EBX,
|
|
u32::from(threads_per_core),
|
|
);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::ECX, 1 << 8);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EDX, x2apic_id);
|
|
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::EAX, core_width);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x1f,
|
|
Some(1),
|
|
CpuidReg::EBX,
|
|
u32::from(cores_per_die * threads_per_core),
|
|
);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::ECX, 2 << 8);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::EDX, x2apic_id);
|
|
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::EAX, die_width);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x1f,
|
|
Some(2),
|
|
CpuidReg::EBX,
|
|
u32::from(dies_per_package * cores_per_die * threads_per_core),
|
|
);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::ECX, 5 << 8);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::EDX, x2apic_id);
|
|
|
|
if matches!(cpu_vendor, CpuVendor::AMD) {
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x8000_001e,
|
|
Some(0),
|
|
CpuidReg::EBX,
|
|
((threads_per_core as u32 - 1) << 8) | (x2apic_id & 0xff),
|
|
);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x8000_001e,
|
|
Some(0),
|
|
CpuidReg::ECX,
|
|
((dies_per_package as u32 - 1) << 8) | (thread_width + die_width) & 0xff,
|
|
);
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x8000_001e, Some(0), CpuidReg::EDX, 0);
|
|
if cores_per_die * threads_per_core > 1 {
|
|
let ecx =
|
|
CpuidPatch::get_cpuid_reg(cpuid, 0x8000_0001, Some(0), CpuidReg::ECX).unwrap_or(0);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x8000_0001,
|
|
Some(0),
|
|
CpuidReg::ECX,
|
|
ecx | (1u32 << 1) | (1u32 << 22),
|
|
);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x0000_0001,
|
|
Some(0),
|
|
CpuidReg::EBX,
|
|
(x2apic_id << 24) | (8 << 8) | (((cores_per_die * threads_per_core) as u32) << 16),
|
|
);
|
|
let cpuid_patches = vec![
|
|
// Patch tsc deadline timer bit
|
|
CpuidPatch {
|
|
function: 1,
|
|
index: 0,
|
|
flags_bit: None,
|
|
eax_bit: None,
|
|
ebx_bit: None,
|
|
ecx_bit: None,
|
|
edx_bit: Some(28),
|
|
},
|
|
];
|
|
CpuidPatch::patch_cpuid(cpuid, &cpuid_patches);
|
|
CpuidPatch::set_cpuid_reg(
|
|
cpuid,
|
|
0x8000_0008,
|
|
Some(0),
|
|
CpuidReg::ECX,
|
|
((thread_width + core_width + die_width) << 12)
|
|
| ((cores_per_die * threads_per_core) - 1) as u32,
|
|
);
|
|
} else {
|
|
CpuidPatch::set_cpuid_reg(cpuid, 0x8000_0008, Some(0), CpuidReg::ECX, 0u32);
|
|
}
|
|
}
|
|
}
|
|
#[cfg(test)]
mod unit_tests {
    use linux_loader::loader::bootparam::boot_e820_entry;

    use super::*;

    /// The architecture reports four memory regions, starting at 0 and at 4 GiB.
    #[test]
    fn regions_base_addr() {
        let regions = arch_memory_regions();
        assert_eq!(regions.len(), 4);
        assert_eq!(regions[0].0, GuestAddress(0));
        assert_eq!(regions[1].0, GuestAddress(1 << 32));
    }

    /// Runs `configure_system` against differently sized guest memory layouts.
    #[test]
    fn test_system_configuration() {
        let vcpu_count = 4;

        // A 64 KiB guest combined with an RSDP address is expected to be rejected.
        let tiny_mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap();
        configure_system(
            &tiny_mem,
            GuestAddress(0),
            0,
            &None,
            1,
            None,
            Some(layout::RSDP_POINTER),
            None,
            None,
            None,
            None,
        )
        .unwrap_err();

        // Now assigning some memory that falls before the 32bit memory hole.
        let ram_before_hole: Vec<(GuestAddress, usize)> = arch_memory_regions()
            .iter()
            .filter_map(|region| {
                (region.2 == RegionType::Ram && region.1 != usize::MAX)
                    .then_some((region.0, region.1))
            })
            .collect();
        let mem_before_hole = GuestMemoryMmap::from_ranges(&ram_before_hole).unwrap();

        configure_system(
            &mem_before_hole,
            GuestAddress(0),
            0,
            &None,
            vcpu_count,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // Now assigning some memory that falls after the 32bit memory hole. The open-ended
        // region (size `usize::MAX`) is capped to 128 MiB.
        let ram_with_high_mem: Vec<(GuestAddress, usize)> = arch_memory_regions()
            .iter()
            .filter(|region| region.2 == RegionType::Ram)
            .map(|region| {
                let size = if region.1 == usize::MAX {
                    128 << 20
                } else {
                    region.1
                };
                (region.0, size)
            })
            .collect();
        let mem_with_high_mem = GuestMemoryMmap::from_ranges(&ram_with_high_mem).unwrap();

        // The very same configuration is applied twice in a row.
        for _ in 0..2 {
            configure_system(
                &mem_with_high_mem,
                GuestAddress(0),
                0,
                &None,
                vcpu_count,
                None,
                None,
                None,
                None,
                None,
                None,
            )
            .unwrap();
        }
    }

    /// Adding an entry fills the first slot; a counter beyond the table size is an error.
    #[test]
    fn test_add_e820_entry() {
        let template = boot_e820_entry {
            addr: 0x1,
            size: 4,
            type_: 1,
        };
        let e820_table = [template; 128];

        let expected_params = boot_params {
            e820_table,
            e820_entries: 1,
            ..Default::default()
        };

        let mut params = boot_params::default();
        add_e820_entry(&mut params, template.addr, template.size, template.type_).unwrap();
        assert_eq!(
            format!("{:?}", params.e820_table[0]),
            format!("{:?}", expected_params.e820_table[0])
        );
        assert_eq!(params.e820_entries, expected_params.e820_entries);

        // Exercise the scenario where the field storing the length of the e820 entry table
        // is bigger than the allocated memory.
        params.e820_entries = params.e820_table.len() as u8 + 1;
        add_e820_entry(&mut params, template.addr, template.size, template.type_).unwrap_err();
    }

    /// Entries are appended in call order with the given address, size and type.
    #[test]
    fn test_add_memmap_entry() {
        let expected_memmap = vec![
            hvm_memmap_table_entry {
                addr: 0x0,
                size: 0x1000,
                type_: E820_RAM,
                ..Default::default()
            },
            hvm_memmap_table_entry {
                addr: 0x10000,
                size: 0xa000,
                type_: E820_RESERVED,
                ..Default::default()
            },
        ];

        let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();
        add_memmap_entry(&mut memmap, 0, 0x1000, E820_RAM);
        add_memmap_entry(&mut memmap, 0x10000, 0xa000, E820_RESERVED);

        assert_eq!(format!("{memmap:?}"), format!("{expected_memmap:?}"));
    }

    /// Checks the mapping from vCPU index to x2APIC ID for a few topologies.
    #[test]
    fn test_get_x2apic_id() {
        // 2 threads per core, 3 cores per die, 1 die per package, 1 package.
        let topology = Some((2, 3, 1, 1));
        for (cpu_id, expected) in [(0, 0), (1, 1), (2, 2), (6, 8), (7, 9), (8, 10)] {
            assert_eq!(get_x2apic_id(cpu_id, topology), expected);
        }

        assert_eq!(get_x2apic_id(257, Some((1, 312, 1, 1))), 257);

        assert_eq!(get_max_x2apic_id((1, 256, 1, 1)), 255);
    }
}
|