diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index e4c626835..a4c0b88bf 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -57,6 +57,8 @@ const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10; #[cfg(feature = "tdx")] const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5; +const KVM_FEATURE_MSI_EXT_DEST_ID: u8 = 15; + pub const _NSIG: i32 = 65; #[derive(Debug, Copy, Clone)] @@ -745,6 +747,10 @@ pub fn generate_common_cpuid( entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); } 0x4000_0001 => { + // Enable KVM_FEATURE_MSI_EXT_DEST_ID. This allows the guest to target + // device interrupts to CPUs with APIC IDs > 254 without interrupt remapping. + entry.eax |= 1 << KVM_FEATURE_MSI_EXT_DEST_ID; + // These features are not supported by TDX #[cfg(feature = "tdx")] if config.tdx { @@ -903,7 +909,15 @@ pub fn configure_vcpu( if let Some((kernel_entry_point, guest_memory)) = boot_setup { regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?; - regs::setup_sregs(&guest_memory.memory(), vcpu).map_err(Error::SregsConfiguration)?; + + // CPUs are required (by the Intel SDM) to boot in x2APIC mode if any + // of the APIC IDs is larger than 255. Experimentally, the Linux kernel + // does not recognize the last vCPU if x2APIC is not enabled when + // there are 256 vCPUs in a flat hierarchy (i.e. max x2APIC ID is 255), + // so we need to enable x2APIC in this case as well. 
+ let enable_x2_apic_mode = get_max_x2apic_id(topology) >= 255; + regs::setup_sregs(&guest_memory.memory(), vcpu, enable_x2_apic_mode) + .map_err(Error::SregsConfiguration)?; } interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?; Ok(()) diff --git a/arch/src/x86_64/regs.rs b/arch/src/x86_64/regs.rs index 3826fdb6c..706dcd062 100644 --- a/arch/src/x86_64/regs.rs +++ b/arch/src/x86_64/regs.rs @@ -119,9 +119,13 @@ pub fn setup_regs(vcpu: &Arc, entry_point: EntryPoint) -> /// /// * `mem` - The memory that will be passed to the guest. /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. -pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &Arc) -> Result<()> { +pub fn setup_sregs( + mem: &GuestMemoryMmap, + vcpu: &Arc, + enable_x2_apic_mode: bool, +) -> Result<()> { let mut sregs: SpecialRegisters = vcpu.get_sregs().map_err(Error::GetStatusRegisters)?; - configure_segments_and_sregs(mem, &mut sregs)?; + configure_segments_and_sregs(mem, &mut sregs, enable_x2_apic_mode)?; vcpu.set_sregs(&sregs).map_err(Error::SetStatusRegisters) } @@ -148,6 +152,7 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<()> { pub fn configure_segments_and_sregs( mem: &GuestMemoryMmap, sregs: &mut SpecialRegisters, + enable_x2_apic_mode: bool, ) -> Result<()> { let gdt_table: [u64; BOOT_GDT_MAX] = { // Configure GDT entries as specified by PVH boot protocol @@ -183,6 +188,11 @@ pub fn configure_segments_and_sregs( sregs.cr0 = CR0_PE; sregs.cr4 = 0; + if enable_x2_apic_mode { + const X2APIC_ENABLE_BIT: u64 = 1 << 10; + sregs.apic_base |= X2APIC_ENABLE_BIT; + } + Ok(()) } @@ -204,7 +214,7 @@ mod tests { fn segments_and_sregs() { let mut sregs: SpecialRegisters = Default::default(); let gm = create_guest_mem(); - configure_segments_and_sregs(&gm, &mut sregs).unwrap(); + configure_segments_and_sregs(&gm, &mut sregs, false).unwrap(); assert_eq!(0x0, read_u64(&gm, BOOT_GDT_START)); assert_eq!( 0xcf9b000000ffff,