diff --git a/arch/src/aarch64/fdt.rs b/arch/src/aarch64/fdt.rs
index 238ad0b32..aefecf8b2 100644
--- a/arch/src/aarch64/fdt.rs
+++ b/arch/src/aarch64/fdt.rs
@@ -218,7 +218,7 @@ pub fn create_fdt<T: DeviceInfoForFdt + Clone + Debug, S: ::std::hash::BuildHash
     guest_mem: &GuestMemoryMmap,
     cmdline: &str,
     vcpu_mpidr: Vec<u64>,
-    vcpu_topology: Option<(u8, u8, u8)>,
+    vcpu_topology: Option<(u16, u16, u16, u16)>,
     device_info: &HashMap<(DeviceType, String), T, S>,
     gic_device: &Arc<Mutex<dyn Vgic>>,
     initrd: &Option<InitramfsConfig>,
@@ -280,7 +280,7 @@ pub fn write_fdt_to_memory(fdt_final: Vec<u8>, guest_mem: &GuestMemoryMmap) -> R
 fn create_cpu_nodes(
     fdt: &mut FdtWriter,
     vcpu_mpidr: &[u64],
-    vcpu_topology: Option<(u8, u8, u8)>,
+    vcpu_topology: Option<(u16, u16, u16, u16)>,
     numa_nodes: &NumaNodes,
 ) -> FdtWriterResult<()> {
     // See https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/arm/cpus.yaml.
@@ -289,8 +289,11 @@ fn create_cpu_nodes(
     fdt.property_u32("#size-cells", 0x0)?;
 
     let num_cpus = vcpu_mpidr.len();
-    let (threads_per_core, cores_per_package, packages) = vcpu_topology.unwrap_or((1, 1, 1));
-    let max_cpus: u32 = (threads_per_core * cores_per_package * packages).into();
+    let (threads_per_core, cores_per_die, dies_per_package, packages) =
+        vcpu_topology.unwrap_or((1, 1, 1, 1));
+    let cores_per_package = cores_per_die * dies_per_package;
+    let max_cpus: u32 =
+        threads_per_core as u32 * cores_per_die as u32 * dies_per_package as u32 * packages as u32;
 
     // Add cache info.
     // L1 Data Cache Info.
@@ -462,7 +465,8 @@ fn create_cpu_nodes(
     }
 
     if let Some(topology) = vcpu_topology {
-        let (threads_per_core, cores_per_package, packages) = topology;
+        let (threads_per_core, cores_per_die, dies_per_package, packages) = topology;
+        let cores_per_package = cores_per_die * dies_per_package;
         let cpu_map_node = fdt.begin_node("cpu-map")?;
 
         // Create device tree nodes with regard of above mapping.
diff --git a/arch/src/aarch64/mod.rs b/arch/src/aarch64/mod.rs
index c80742914..f7a6c3653 100644
--- a/arch/src/aarch64/mod.rs
+++ b/arch/src/aarch64/mod.rs
@@ -126,7 +126,7 @@ pub fn configure_system<T: DeviceInfoForFdt + Clone + Debug, S: ::std::hash::Bui
     guest_mem: &GuestMemoryMmap,
     cmdline: &str,
     vcpu_mpidr: Vec<u64>,
-    vcpu_topology: Option<(u8, u8, u8)>,
+    vcpu_topology: Option<(u16, u16, u16, u16)>,
     device_info: &HashMap<(DeviceType, String), T, S>,
     initrd: &Option<super::InitramfsConfig>,
     pci_space_info: &[PciSpaceInfo],
diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs
index 35ec4b925..6dcac04f1 100644
--- a/arch/src/x86_64/mod.rs
+++ b/arch/src/x86_64/mod.rs
@@ -209,11 +209,11 @@ pub enum Error {
     E820Configuration,
 }
 
-pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 {
+pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u16, u16, u16, u16)>) -> u32 {
     if let Some(t) = topology {
-        let thread_mask_width = u8::BITS - (t.0 - 1).leading_zeros();
-        let core_mask_width = u8::BITS - (t.1 - 1).leading_zeros();
-        let die_mask_width = u8::BITS - (t.2 - 1).leading_zeros();
+        let thread_mask_width = u16::BITS - (t.0 - 1).leading_zeros();
+        let core_mask_width = u16::BITS - (t.1 - 1).leading_zeros();
+        let die_mask_width = u16::BITS - (t.2 - 1).leading_zeros();
 
         let thread_id = cpu_id % (t.0 as u32);
         let core_id = cpu_id / (t.0 as u32) % (t.1 as u32);
@@ -229,6 +229,13 @@ pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 {
     cpu_id
 }
 
+/// Returns the x2APIC ID computed for the highest vCPU index that `topology` describes.
+pub fn get_max_x2apic_id(topology: (u16, u16, u16, u16)) -> u32 {
+    let (threads, cores, dies, packages) = topology;
+    let vcpus = u32::from(threads) * u32::from(cores) * u32::from(dies) * u32::from(packages);
+    get_x2apic_id(vcpus - 1, Some(topology))
+}
+
 #[derive(Copy, Clone, Debug)]
 pub enum CpuidReg {
     EAX,
@@ -825,7 +832,7 @@ pub fn configure_vcpu(
     cpuid: Vec<CpuIdEntry>,
     kvm_hyperv: bool,
     cpu_vendor: CpuVendor,
-    topology: Option<(u8, u8, u8)>,
+    topology: Option<(u16, u16, u16, u16)>,
 ) -> super::Result<()> {
     let x2apic_id = get_x2apic_id(id, topology);
 
@@ -850,7 +857,7 @@ pub fn configure_vcpu(
     assert!(apic_id_patched);
 
     if let Some(t) = topology {
-        update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, cpu_vendor, id);
+        update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, t.3, cpu_vendor, id);
     }
 
     // The TSC frequency CPUID leaf should not be included when running with HyperV emulation
@@ -953,7 +960,7 @@ pub fn configure_system(
     serial_number: Option<&str>,
     uuid: Option<&str>,
     oem_strings: Option<&[&str]>,
-    topology: Option<(u8, u8, u8)>,
+    topology: Option<(u16, u16, u16, u16)>,
 ) -> super::Result<()> {
     // Write EBDA address to location where ACPICA expects to find it
     guest_mem
@@ -1361,21 +1368,24 @@ pub fn get_host_cpu_phys_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>) -> u
 
 fn update_cpuid_topology(
     cpuid: &mut Vec<CpuIdEntry>,
-    threads_per_core: u8,
-    cores_per_die: u8,
-    dies_per_package: u8,
+    threads_per_core: u16,
+    cores_per_die: u16,
+    dies_per_package: u16,
+    packages: u16,
     cpu_vendor: CpuVendor,
     id: u32,
 ) {
     let x2apic_id = get_x2apic_id(
         id,
-        Some((threads_per_core, cores_per_die, dies_per_package)),
+        Some((threads_per_core, cores_per_die, dies_per_package, packages)),
     );
 
-    let thread_width = 8 - (threads_per_core - 1).leading_zeros();
-    let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width;
-    let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width;
+    // Note: the topology defined here is per "package" (~NUMA node).
+    let thread_width = u16::BITS - (threads_per_core - 1).leading_zeros();
+    let core_width = u16::BITS - (cores_per_die - 1).leading_zeros() + thread_width;
+    let die_width = u16::BITS - (dies_per_package - 1).leading_zeros() + core_width;
 
+    // The very old way: a flat count of logical CPUs per package, in CPUID.1H:EBX[23:16].
     let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0);
     cpu_ebx |= ((dies_per_package as u32) * (cores_per_die as u32) * (threads_per_core as u32))
         & (0xff << 16);
@@ -1385,6 +1395,7 @@ fn update_cpuid_topology(
     cpu_edx |= 1 << 28;
     CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx);
 
+    // The legacy way: threads+cores per package.
     // CPU Topology leaf 0xb
     CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width);
     CpuidPatch::set_cpuid_reg(
@@ -1407,6 +1418,7 @@ fn update_cpuid_topology(
     CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8);
     CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EDX, x2apic_id);
 
+    // The modern way: many-level hierarchy (but here we only support four levels).
     // CPU Topology leaf 0x1f
     CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width);
     CpuidPatch::set_cpuid_reg(
@@ -1721,22 +1733,27 @@ mod tests {
 
     #[test]
     fn test_get_x2apic_id() {
-        let x2apic_id = get_x2apic_id(0, Some((2, 3, 1)));
+        let x2apic_id = get_x2apic_id(0, Some((2, 3, 1, 1)));
         assert_eq!(x2apic_id, 0);
 
-        let x2apic_id = get_x2apic_id(1, Some((2, 3, 1)));
+        let x2apic_id = get_x2apic_id(1, Some((2, 3, 1, 1)));
         assert_eq!(x2apic_id, 1);
 
-        let x2apic_id = get_x2apic_id(2, Some((2, 3, 1)));
+        let x2apic_id = get_x2apic_id(2, Some((2, 3, 1, 1)));
         assert_eq!(x2apic_id, 2);
 
-        let x2apic_id = get_x2apic_id(6, Some((2, 3, 1)));
+        let x2apic_id = get_x2apic_id(6, Some((2, 3, 1, 1)));
         assert_eq!(x2apic_id, 8);
 
-        let x2apic_id = get_x2apic_id(7, Some((2, 3, 1)));
+        let x2apic_id = get_x2apic_id(7, Some((2, 3, 1, 1)));
         assert_eq!(x2apic_id, 9);
 
-        let x2apic_id = get_x2apic_id(8, Some((2, 3, 1)));
+        let x2apic_id = get_x2apic_id(8, Some((2, 3, 1, 1)));
         assert_eq!(x2apic_id, 10);
+
+        let x2apic_id = get_x2apic_id(257, Some((1, 312, 1, 1)));
+        assert_eq!(x2apic_id, 257);
+
+        assert_eq!(255, get_max_x2apic_id((1, 256, 1, 1)));
     }
 }
diff --git a/arch/src/x86_64/mptable.rs b/arch/src/x86_64/mptable.rs
index 42667df9c..d709a0043 100644
--- a/arch/src/x86_64/mptable.rs
+++ b/arch/src/x86_64/mptable.rs
@@ -136,7 +136,7 @@ pub fn setup_mptable(
     offset: GuestAddress,
     mem: &GuestMemoryMmap,
     num_cpus: u32,
-    topology: Option<(u8, u8, u8)>,
+    topology: Option<(u16, u16, u16, u16)>,
 ) -> Result<()> {
     if num_cpus > 0 {
         let cpu_id_max = num_cpus - 1;
diff --git a/vmm/src/acpi.rs b/vmm/src/acpi.rs
index b9e809e4a..a2299acd8 100644
--- a/vmm/src/acpi.rs
+++ b/vmm/src/acpi.rs
@@ -285,7 +285,7 @@ fn create_tpm2_table() -> Sdt {
 
 fn create_srat_table(
     numa_nodes: &NumaNodes,
-    #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
+    #[cfg(target_arch = "x86_64")] topology: Option<(u16, u16, u16, u16)>,
 ) -> Sdt {
     let mut srat = Sdt::new(*b"SRAT", 36, 3, *b"CLOUDH", *b"CHSRAT  ", 1);
     // SRAT reserved 12 bytes
diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs
index e5813562b..7b16b3988 100644
--- a/vmm/src/cpu.rs
+++ b/vmm/src/cpu.rs
@@ -390,7 +390,7 @@ impl Vcpu {
         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
-        #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
+        #[cfg(target_arch = "x86_64")] topology: Option<(u16, u16, u16, u16)>,
     ) -> Result<()> {
         #[cfg(target_arch = "aarch64")]
         {
@@ -884,8 +884,22 @@ impl CpuManager {
 
         #[cfg(target_arch = "x86_64")]
         let topology = self.config.topology.clone().map_or_else(
-            || Some((1, self.boot_vcpus().try_into().unwrap(), 1)),
-            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
+            || {
+                Some((
+                    1_u16,
+                    u16::try_from(self.boot_vcpus()).unwrap(),
+                    1_u16,
+                    1_u16,
+                ))
+            },
+            |t| {
+                Some((
+                    t.threads_per_core.into(),
+                    t.cores_per_die.into(),
+                    t.dies_per_package.into(),
+                    t.packages.into(),
+                ))
+            },
         );
         #[cfg(target_arch = "x86_64")]
         vcpu.configure(
@@ -1427,11 +1441,15 @@ impl CpuManager {
             .collect()
     }
 
-    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
-        self.config
-            .topology
-            .clone()
-            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
+    pub fn get_vcpu_topology(&self) -> Option<(u16, u16, u16, u16)> {
+        // No configured topology yields `None`; otherwise widen each field to u16.
+        let t = self.config.topology.clone()?;
+        Some((
+            u16::from(t.threads_per_core),
+            u16::from(t.cores_per_die),
+            u16::from(t.dies_per_package),
+            u16::from(t.packages),
+        ))
     }
 
     #[cfg(not(target_arch = "riscv64"))]
@@ -1574,9 +1592,10 @@ impl CpuManager {
         // If topology is not specified, the default setting is:
         // 1 package, multiple cores, 1 thread per core
         // This is also the behavior when PPTT is missing.
-        let (threads_per_core, cores_per_package, packages) =
-            self.get_vcpu_topology()
-                .unwrap_or((1, self.max_vcpus().try_into().unwrap(), 1));
+        let (threads_per_core, cores_per_die, dies_per_package, packages) = self
+            .get_vcpu_topology()
+            .unwrap_or((1, u16::try_from(self.max_vcpus()).unwrap(), 1, 1));
+        let cores_per_package = cores_per_die * dies_per_package;
 
         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
 
@@ -1931,7 +1950,7 @@ struct Cpu {
     proximity_domain: u32,
     dynamic: bool,
     #[cfg(target_arch = "x86_64")]
-    topology: Option<(u8, u8, u8)>,
+    topology: Option<(u16, u16, u16, u16)>,
 }
 
 #[cfg(target_arch = "x86_64")]
diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs
index 40579b9cc..7e8851194 100644
--- a/vmm/src/vm.rs
+++ b/vmm/src/vm.rs
@@ -3582,7 +3582,7 @@ mod tests {
             &mem,
             "console=tty0",
             vec![0],
-            Some((0, 0, 0)),
+            Some((0, 0, 0, 0)),
             &dev_info,
             &gic,
             &None,