vmm: raise the (v)CPU limit on kvm/x86_64

Raise the max number of supported (v)CPUs on kvm x86_64 hosts
to 8192 (the max allowed value of CONFIG_NR_CPUS in the Linux kernel).

Other platforms keep their existing CPU limits pending further
development and testing.

The change has been tested on Intel and AMD hosts.

Signed-off-by: Barret Rhoden <brho@google.com>
Signed-off-by: Neel Natu <neelnatu@google.com>
Signed-off-by: Ofir Weisse <oweisse@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
This commit is contained in:
Peter Oskolkov 2025-09-05 20:36:25 +00:00 committed by Bo Chen
parent 57bc78da4f
commit 05d222f0eb
6 changed files with 97 additions and 45 deletions

View file

@ -27,6 +27,11 @@ use crate::vm_config::*;
const MAX_NUM_PCI_SEGMENTS: u16 = 96;
const MAX_IOMMU_ADDRESS_WIDTH_BITS: u8 = 64;
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
const MAX_SUPPORTED_CPUS: u32 = 8192;
#[cfg(not(all(feature = "kvm", target_arch = "x86_64")))]
const MAX_SUPPORTED_CPUS: u32 = 255;
/// Errors associated with VM configuration parameters.
#[derive(Debug, Error)]
pub enum Error {
@ -182,6 +187,9 @@ pub enum ValidationError {
/// Max is less than boot
#[error("Max CPUs lower than boot CPUs")]
CpusMaxLowerThanBoot,
/// Too many CPUs.
#[error("Too many CPUs: specified {0} but {MAX_SUPPORTED_CPUS} is the limit")]
TooManyCpus(u32 /* specified CPUs */),
/// Missing file value for debug-console
#[cfg(target_arch = "x86_64")]
#[error("Path missing when using file mode for debug console")]
@ -586,11 +594,11 @@ impl CpusConfig {
.add("features");
parser.parse(cpus).map_err(Error::ParseCpus)?;
let boot_vcpus: u8 = parser
let boot_vcpus: u32 = parser
.convert("boot")
.map_err(Error::ParseCpus)?
.unwrap_or(DEFAULT_VCPUS);
let max_vcpus: u8 = parser
let max_vcpus: u32 = parser
.convert("max")
.map_err(Error::ParseCpus)?
.unwrap_or(boot_vcpus);
@ -605,7 +613,7 @@ impl CpusConfig {
.map_err(Error::ParseCpus)?
.unwrap_or(DEFAULT_MAX_PHYS_BITS);
let affinity = parser
.convert::<Tuple<u8, Vec<usize>>>("affinity")
.convert::<Tuple<u32, Vec<usize>>>("affinity")
.map_err(Error::ParseCpus)?
.map(|v| {
v.0.iter()
@ -2147,7 +2155,7 @@ impl NumaConfig {
let cpus = parser
.convert::<IntegerList>("cpus")
.map_err(Error::ParseNuma)?
.map(|v| v.0.iter().map(|e| *e as u8).collect());
.map(|v| v.0.iter().map(|e| *e as u32).collect());
let distances = parser
.convert::<Tuple<u64, u64>>("distances")
.map_err(Error::ParseNuma)?
@ -2523,6 +2531,15 @@ impl VmConfig {
return Err(ValidationError::CpusMaxLowerThanBoot);
}
if self.cpus.max_vcpus > MAX_SUPPORTED_CPUS {
// Note: historically, Cloud Hypervisor did not support more than 255 (254 on x86_64)
// vCPUs: self.cpus.max_vcpus was of type u8, so 255 was the maximum;
// on x86_64, the legacy mptable/apic was limited to 254 CPUs.
//
// Now the limit is lifted on x86_64 targets. Other targets/archs: TBD.
return Err(ValidationError::TooManyCpus(self.cpus.max_vcpus));
}
if let Some(rate_limit_groups) = &self.rate_limit_groups {
for rate_limit_group in rate_limit_groups {
rate_limit_group.validate(self)?;
@ -2614,7 +2631,10 @@ impl VmConfig {
return Err(ValidationError::CpuTopologyDiesPerPackage);
}
let total = t.threads_per_core * t.cores_per_die * t.dies_per_package * t.packages;
let total: u32 = (t.threads_per_core as u32)
* (t.cores_per_die as u32)
* (t.dies_per_package as u32)
* (t.packages as u32);
if total != self.cpus.max_vcpus {
return Err(ValidationError::CpuTopologyCount);
}

View file

@ -196,8 +196,8 @@ pub enum Error {
#[error("Error setting up AMX")]
AmxEnable(#[source] anyhow::Error),
#[error("Maximum number of vCPUs exceeds host limit")]
MaximumVcpusExceeded,
#[error("Maximum number of vCPUs {0} exceeds host limit {1}")]
MaximumVcpusExceeded(u32, u32),
#[cfg(feature = "sev_snp")]
#[error("Failed to set sev control register")]
@ -698,12 +698,16 @@ impl CpuManager {
numa_nodes: &NumaNodes,
#[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
) -> Result<Arc<Mutex<CpuManager>>> {
if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
return Err(Error::MaximumVcpusExceeded);
if config.max_vcpus > hypervisor.get_max_vcpus() {
return Err(Error::MaximumVcpusExceeded(
config.max_vcpus,
hypervisor.get_max_vcpus(),
));
}
let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
let max_vcpus = usize::try_from(config.max_vcpus).unwrap();
let mut vcpu_states = Vec::with_capacity(max_vcpus);
vcpu_states.resize_with(max_vcpus, VcpuState::default);
let hypervisor_type = hypervisor.hypervisor_type();
#[cfg(target_arch = "x86_64")]
let cpu_vendor = hypervisor.get_cpu_vendor();
@ -755,7 +759,7 @@ impl CpuManager {
let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
cpu_affinity
.iter()
.map(|a| (a.vcpu as u32, a.host_cpus.clone()))
.map(|a| (a.vcpu, a.host_cpus.clone()))
.collect()
} else {
BTreeMap::new()
@ -781,7 +785,7 @@ impl CpuManager {
#[cfg(feature = "guest_debug")]
vm_debug_evt,
selected_cpu: 0,
vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
vcpus: Vec::with_capacity(max_vcpus),
seccomp_action,
vm_ops,
acpi_address: None,
@ -895,10 +899,10 @@ impl CpuManager {
},
|t| {
(
t.threads_per_core.into(),
t.cores_per_die.into(),
t.dies_per_package.into(),
t.packages.into(),
t.threads_per_core,
t.cores_per_die,
t.dies_per_package,
t.packages,
)
},
);
@ -934,7 +938,7 @@ impl CpuManager {
self.present_vcpus()
);
if desired_vcpus > self.config.max_vcpus as u32 {
if desired_vcpus > self.config.max_vcpus {
return Err(Error::DesiredVCpuCountExceedsMax);
}
@ -1245,7 +1249,7 @@ impl CpuManager {
inserting: bool,
paused: Option<bool>,
) -> Result<()> {
if desired_vcpus > self.config.max_vcpus as u32 {
if desired_vcpus > self.config.max_vcpus {
return Err(Error::DesiredVCpuCountExceedsMax);
}
@ -1418,11 +1422,11 @@ impl CpuManager {
}
pub fn boot_vcpus(&self) -> u32 {
self.config.boot_vcpus as u32
self.config.boot_vcpus
}
pub fn max_vcpus(&self) -> u32 {
self.config.max_vcpus as u32
self.config.max_vcpus
}
#[cfg(target_arch = "x86_64")]
@ -1456,10 +1460,10 @@ impl CpuManager {
pub fn get_vcpu_topology(&self) -> Option<(u16, u16, u16, u16)> {
self.config.topology.clone().map(|t| {
(
t.threads_per_core.into(),
t.cores_per_die.into(),
t.dies_per_package.into(),
t.packages.into(),
t.threads_per_core,
t.cores_per_die,
t.dies_per_package,
t.packages,
)
})
}
@ -1475,7 +1479,7 @@ impl CpuManager {
{
madt.write(36, arch::layout::APIC_START.0);
for cpu in 0..self.config.max_vcpus as u32 {
for cpu in 0..self.config.max_vcpus {
let x2apic_id = get_x2apic_id(cpu, self.get_vcpu_topology());
let lapic = LocalX2Apic {
@ -1483,7 +1487,7 @@ impl CpuManager {
length: 16,
processor_id: cpu,
apic_id: x2apic_id,
flags: if cpu < self.config.boot_vcpus as u32 {
flags: if cpu < self.config.boot_vcpus {
1 << MADT_CPU_ENABLE_FLAG
} else {
0
@ -1535,8 +1539,8 @@ impl CpuManager {
r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
length: 80,
reserved0: 0,
cpu_interface_number: cpu as u32,
uid: cpu as u32,
cpu_interface_number: cpu,
uid: cpu,
flags: 1,
parking_version: 0,
performance_interrupt: 0,
@ -2274,7 +2278,7 @@ impl Aml for CpuManager {
let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
// Bundle methods together under a common object
let methods = CpuMethods {
max_vcpus: self.config.max_vcpus as u32,
max_vcpus: self.config.max_vcpus,
dynamic: self.dynamic,
};
let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
@ -2282,7 +2286,7 @@ impl Aml for CpuManager {
#[cfg(target_arch = "x86_64")]
let topology = self.get_vcpu_topology();
let mut cpu_devices = Vec::new();
for cpu_id in 0..(self.config.max_vcpus as u32) {
for cpu_id in 0..self.config.max_vcpus {
let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
let cpu_device = Cpu {
cpu_id,

View file

@ -1694,7 +1694,7 @@ impl DeviceManager {
) -> DeviceManagerResult<Arc<Mutex<dyn InterruptController>>> {
let interrupt_controller: Arc<Mutex<gic::Gic>> = Arc::new(Mutex::new(
gic::Gic::new(
self.config.lock().unwrap().cpus.boot_vcpus as u32,
self.config.lock().unwrap().cpus.boot_vcpus,
Arc::clone(&self.msi_interrupt_manager),
self.address_manager.vm.clone(),
)

View file

@ -27,6 +27,8 @@ use anyhow::anyhow;
#[cfg(feature = "dbus_api")]
use api::dbus::{DBusApiOptions, DBusApiShutdownChannels};
use api::http::HttpApiHandle;
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY;
use console_devices::{pre_create_console_devices, ConsoleInfo};
use landlock::LandlockError;
use libc::{tcsetattr, termios, EFD_NONBLOCK, SIGINT, SIGTERM, TCSANOW};
@ -888,6 +890,11 @@ impl Vmm {
))
})?;
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
if config.lock().unwrap().max_apic_id() > MAX_SUPPORTED_CPUS_LEGACY {
vm.enable_x2apic_api().unwrap();
}
let phys_bits =
vm::physical_bits(&self.hypervisor, config.lock().unwrap().cpus.max_phys_bits);
@ -1822,7 +1829,7 @@ impl RequestHandler for Vmm {
} else {
let mut config = self.vm_config.as_ref().unwrap().lock().unwrap();
if let Some(desired_vcpus) = desired_vcpus {
config.cpus.boot_vcpus = desired_vcpus.try_into().unwrap();
config.cpus.boot_vcpus = desired_vcpus;
}
if let Some(desired_ram) = desired_ram {
config.memory.size = desired_ram;

View file

@ -29,6 +29,8 @@ use anyhow::anyhow;
use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use arch::PciSpaceInfo;
use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes};
@ -944,7 +946,7 @@ impl Vm {
}
if let Some(cpus) = &config.cpus {
node.cpus.extend(cpus.iter().map(|cpu| *cpu as u32));
node.cpus.extend(cpus);
}
if let Some(pci_segments) = &config.pci_segments {
@ -1022,6 +1024,11 @@ impl Vm {
vm_config.lock().unwrap().memory.total_size(),
)?;
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
if vm_config.lock().unwrap().max_apic_id() > MAX_SUPPORTED_CPUS_LEGACY {
vm.enable_x2apic_api().unwrap();
}
let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
let memory_manager = if let Some(snapshot) =
@ -1655,7 +1662,7 @@ impl Vm {
.notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
.map_err(Error::DeviceManager)?;
}
self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus.try_into().unwrap();
self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
}
if let Some(desired_memory) = desired_memory {
@ -2709,7 +2716,7 @@ impl Vm {
&mut self,
destination_url: &str,
) -> std::result::Result<DumpState, GuestDebuggableError> {
let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus;
let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
let mut elf_phdr_num = 1;
let elf_sh_info = 0;

View file

@ -26,7 +26,7 @@ pub(crate) trait ApplyLandlock {
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct CpuAffinity {
pub vcpu: u8,
pub vcpu: u32,
pub host_cpus: Vec<usize>,
}
@ -39,10 +39,10 @@ pub struct CpuFeatures {
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct CpuTopology {
pub threads_per_core: u8,
pub cores_per_die: u8,
pub dies_per_package: u8,
pub packages: u8,
pub threads_per_core: u16,
pub cores_per_die: u16,
pub dies_per_package: u16,
pub packages: u16,
}
// When booting with PVH boot the maximum physical addressable size
@ -56,8 +56,8 @@ pub fn default_cpuconfig_max_phys_bits() -> u8 {
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct CpusConfig {
pub boot_vcpus: u8,
pub max_vcpus: u8,
pub boot_vcpus: u32,
pub max_vcpus: u32,
#[serde(default)]
pub topology: Option<CpuTopology>,
#[serde(default)]
@ -70,7 +70,7 @@ pub struct CpusConfig {
pub features: CpuFeatures,
}
pub const DEFAULT_VCPUS: u8 = 1;
pub const DEFAULT_VCPUS: u32 = 1;
impl Default for CpusConfig {
fn default() -> Self {
@ -684,7 +684,7 @@ pub struct NumaConfig {
#[serde(default)]
pub guest_numa_id: u32,
#[serde(default)]
pub cpus: Option<Vec<u8>>,
pub cpus: Option<Vec<u32>>,
#[serde(default)]
pub distances: Option<Vec<NumaDistance>>,
#[serde(default)]
@ -1035,4 +1035,18 @@ impl VmConfig {
Ok(())
}
/// Returns the largest x2APIC id value this VM configuration can reach,
/// used to decide whether the KVM x2APIC API must be enabled (i.e. when
/// the value exceeds `MAX_SUPPORTED_CPUS_LEGACY`).
///
/// With an explicit CPU topology configured, the value is derived from the
/// topology tuple (threads_per_core, cores_per_die, dies_per_package,
/// packages) via `arch::x86_64::get_max_x2apic_id`, since topology-aware
/// APIC id packing can produce ids larger than the plain vCPU count.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
pub(crate) fn max_apic_id(&self) -> u32 {
if let Some(topology) = &self.cpus.topology {
arch::x86_64::get_max_x2apic_id((
topology.threads_per_core,
topology.cores_per_die,
topology.dies_per_package,
topology.packages,
))
} else {
// No topology: fall back to the configured vCPU count.
// NOTE(review): APIC ids are 0-based, so the maximum id with
// max_vcpus CPUs is arguably max_vcpus - 1; returning max_vcpus is
// conservative for the `> MAX_SUPPORTED_CPUS_LEGACY` check — confirm
// against get_max_x2apic_id's convention.
self.cpus.max_vcpus
}
}
}