vmsilo/modules/scripts.nix
Davíð Steinn Geirsson 0a07f7f14e Switch ephemeral overlay from qcow2 to raw sparse image
qcow2 causes O_DIRECT failures on ext4 due to crosvm doing unaligned
access when parsing the qcow2 header. Since we don't use any qcow2
features (the disk is created fresh and deleted on stop), a raw sparse
file via truncate works just as well and also removes the qemu package
dependency from the VM service.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 00:49:44 +00:00

599 lines
20 KiB
Nix

# Script generation for vmsilo NixOS module
# VM launcher scripts and user-facing scripts (vm-run, vm-start, vm-stop, vm-shell)
{
config,
pkgs,
lib,
...
}:
let
cfg = config.programs.vmsilo;
helpers = import ./lib/helpers.nix { inherit lib; };
inherit (helpers)
formatKVArgs
formatPositionalKVArgs
parseCIDR
prefixToNetmask
idxToIfIndex
ifIndexToPciAddr
normalizeBdf
isBdf
generateMac
sortedInterfaceList
makeTapName
assignVmIds
mkEffectiveSharedDirs
;
vms = assignVmIds cfg.nixosVms;
# NOTE: getEffectiveInterfaces is intentionally duplicated in networking.nix and services.nix.
# It cannot live in helpers.nix (which has no config access) and the three modules
# don't share a common let-binding scope. Keep the copies in sync.
getEffectiveInterfaces =
vm: vm.network.interfaces // (cfg._internal.netvmInjections.${vm.name}.interfaces or { });
# Get effective generated guest config: vm._generatedGuestConfig + netvm-injected guestConfig.
getEffectiveGuestConfig =
vm: vm._generatedGuestConfig ++ (cfg._internal.netvmInjections.${vm.name}.guestConfig or [ ]);
# User UID/GID and runtime directory for explicit paths (system services need these)
userUid = config.users.users.${cfg.user}.uid;
userGid = config.users.groups.${config.users.users.${cfg.user}.group}.gid;
userRuntimeDir = "/run/user/${toString userUid}";
# Build effective sound config from sound options
mkSoundConfig =
sound:
if sound.playback || sound.capture then
{
backend = "pulse";
capture = sound.capture;
pulse_socket_path = "${userRuntimeDir}/pulse/native";
pulse_cookie_path = "/home/${cfg.user}/.config/pulse/cookie";
}
else
null;
# Get effective MAC for an interface (uses user-specified interface name)
getEffectiveIfaceMac =
vm: ifName: iface:
if iface.macAddress != null then iface.macAddress else generateMac vm.name ifName;
# Build rootfs for a VM
buildRootfs =
vm:
let
# Enable pipewire in guest when sound is enabled
soundConfig = lib.optionalAttrs (vm.sound.playback || vm.sound.capture) {
services.pipewire = {
enable = lib.mkDefault true;
pulse.enable = lib.mkDefault true;
# Pipewire volume defaults to 40%, likely because the ALSA device has no mixer controls.
wireplumber.extraConfig."50-default-volume"."wireplumber.settings" = {
"device.routes.default-sink-volume" = 1.0;
};
extraConfig.pipewire."50-allow-sample-rates"."context.properties" = {
# Allow everything to avoid resampling. Let the host pipewire resample if needed.
"default.clock.allowed-rates" = [
44100
48000
88200
96000
176400
192000
352800
384000
];
};
};
# Enable realtime scheduling for pipewire
security.rtkit.enable = true;
};
in
pkgs.callPackage ../rootfs-nixos {
inherit (cfg._internal) wayland-proxy-virtwl sommelier;
inherit soundConfig;
vmsilo-tray = cfg._internal.vmsilo-tray;
trayLogLevel = cfg.vmsilo-tray.logLevel;
waylandProxy = vm.waylandProxy;
guestPrograms = vm.guestPrograms;
guestConfig =
(if lib.isList vm.guestConfig then vm.guestConfig else [ vm.guestConfig ])
++ getEffectiveGuestConfig vm;
inherit (vm) copyChannel;
};
# Default block device options applied to all disks
defaultBlockOpts = {
packed-queue = true; # More efficient, better cache locality than split virtqueues
direct = true; # Bypass host page cache, guest handles its own page cache
};
# Format a disk configuration as --block argument (applies defaults)
formatBlockArg =
disk: "--block ${formatPositionalKVArgs [ "path" ] "," "=" (defaultBlockOpts // disk)}";
# Normalize all isolated devices
normalizedIsolatedDevices = map normalizeBdf cfg.isolatedPciDevices;
# Generate VM launcher script
mkVmScript =
vm:
let
# Only build rootfs if we need it (no custom root/kernel/initramfs)
needsBuiltRootfs = vm.rootDisk == null || vm.kernel == null || vm.initramfs == null;
rootfs = if needsBuiltRootfs then buildRootfs vm else null;
# Determine root disk config: use user's as-is, or built rootfs with rootDiskReadonly
rootDiskConfig =
if vm.rootDisk != null then
vm.rootDisk
else
{
path = "${rootfs}/nixos.erofs";
ro = vm.rootDiskReadonly;
};
kernelPath = if vm.kernel != null then vm.kernel else "${rootfs}/bzImage";
initramfsPath = if vm.initramfs != null then vm.initramfs else "${rootfs}/initrd";
additionalDisksArgs = lib.concatMapStringsSep " " formatBlockArg vm.additionalDisks;
# Ephemeral overlay disk (raw mode only)
ephemeralDiskPath = "/var/lib/vmsilo/${vm.name}-ephemeral.raw";
ephemeralDiskId = "ephemeral";
ephemeralDiskConfig = {
path = ephemeralDiskPath;
ro = false;
id = ephemeralDiskId;
};
ephemeralDiskArg = lib.optionalString (vm.rootOverlay.type == "raw") (
formatBlockArg ephemeralDiskConfig
);
# Kernel param for overlay type
rootOverlayKernelParam =
if vm.rootOverlay.type == "raw" then
''-p "vmsilo.rootOverlay=raw,${ephemeralDiskId}"''
else
''-p "vmsilo.rootOverlay=tmpfs"'';
# Shared home directory
sharedHomeEnabled = vm.sharedHome != false;
# Effective shared directories (user config + implicit sharedHome entry)
effectiveSharedDirs = mkEffectiveSharedDirs {
inherit (vm) sharedDirectories sharedHome;
vmName = vm.name;
inherit userUid userGid;
};
# virtiofsd vhost-user socket args for crosvm
virtiofsDirArgs = lib.concatMapStringsSep " " (
tag: "--vhost-user type=fs,socket=/run/vmsilo/${vm.name}-virtiofsd-${tag}.socket"
) (builtins.attrNames effectiveSharedDirs);
extraKernelParams = lib.concatMapStringsSep " " (p: "-p \"${p}\"") vm.kernelParams;
# GPU config: false = disabled, true = default (wayland+opengl), attrset = custom features
gpuConfig =
if vm.gpu == false then
null
else if vm.gpu == true then
{
wayland = true;
opengl = true;
vulkan = false;
}
else
vm.gpu;
# Build context-types string from enabled GPU features
gpuContextTypes = lib.optionalString (gpuConfig != null) (
lib.concatStringsSep ":" (
lib.filter (x: x != null) [
(if gpuConfig.wayland then "cross-domain" else null)
(if gpuConfig.opengl then "virgl2" else null)
(if gpuConfig.vulkan then "venus" else null)
]
)
);
# Sound config from playback/capture booleans
effectiveSound = mkSoundConfig vm.sound;
# Convert BDF to sysfs path
bdfToSysfs = bdf: "/sys/bus/pci/devices/${normalizeBdf bdf}";
# PCI devices for this VM (extract path from attrset, normalize BDF)
vmPciDevicePaths = map (
dev: if isBdf dev.path then normalizeBdf dev.path else dev.path
) vm.pciDevices;
# Format --vfio arguments with optional kv pairs
vfioArgs = lib.concatMapStringsSep " " (
dev:
let
sysfsPath = if isBdf dev.path then bdfToSysfs dev.path else dev.path;
remaining = lib.filterAttrs (k: v: k != "path" && v != null) dev;
kvPart = formatKVArgs "," remaining;
in
if kvPart == "" then "--vfio ${sysfsPath}" else "--vfio ${sysfsPath},${kvPart}"
) vm.pciDevices;
# vhost-user arguments
vhostUserArgs = lib.concatMapStringsSep " " (
vu: "--vhost-user ${formatKVArgs "," vu}"
) vm.vhostUser;
# Network interface crosvm arguments
# Sorted alphabetically by interface name for deterministic PCI slot assignment
networkArgs = lib.concatStringsSep " \\\n " (
lib.imap0 (
idx: entry:
let
ifName = entry.name;
iface = entry.value;
ifIndex = idxToIfIndex idx;
pciAddr = ifIndexToPciAddr ifIndex;
mac = getEffectiveIfaceMac vm ifName iface;
tapName = if iface.tap.name != null then iface.tap.name else makeTapName vm.name vm.id ifIndex;
in
"--net tap-name=${tapName},mac=${mac},pci-address=${pciAddr}"
) (sortedInterfaceList (getEffectiveInterfaces vm))
);
# Kernel params for network configuration (uses user-specified interface names)
networkKernelParams = lib.concatLists (
map (
entry:
let
ifName = entry.name;
iface = entry.value;
in
if iface.dhcp then
[ ''-p "ip=:::::${ifName}:dhcp"'' ]
else
# Static IPv4 addresses
(map (
addr:
let
parsed = parseCIDR addr;
in
''-p "ip=${parsed.ip}:::${prefixToNetmask parsed.prefix}::${ifName}:none"''
) iface.addresses)
# Static IPv6 addresses
++ (map (addr: ''-p "ip=[${addr}]:::::${ifName}:none"'') iface.v6Addresses)
# IPv4 routes
++ (lib.mapAttrsToList (dest: r: ''-p "rd.route=${dest}:${r.via}:${ifName}"'') iface.routes)
# IPv6 routes
++ (lib.mapAttrsToList (dest: r: ''-p "rd.route=[${dest}]:[${r.via}]:${ifName}"'') iface.v6Routes)
) (sortedInterfaceList (getEffectiveInterfaces vm))
);
# Kernel params for interface naming (vmsilo.ifname=<name>,<mac>)
interfaceNameKernelParams = map (
entry:
let
ifName = entry.name;
mac = getEffectiveIfaceMac vm ifName entry.value;
in
''-p "vmsilo.ifname=${ifName},${mac}"''
) (sortedInterfaceList (getEffectiveInterfaces vm));
# Nameserver params
nameserverParams = map (ns: ''-p "nameserver=${ns}"'') vm.network.nameservers;
# All network kernel params
allNetworkKernelParams = interfaceNameKernelParams ++ networkKernelParams ++ nameserverParams;
# Crosvm configuration (per-VM overrides global)
effectiveLogLevel = if vm.crosvm.logLevel != null then vm.crosvm.logLevel else cfg.crosvm.logLevel;
allExtraArgs = cfg.crosvm.extraArgs ++ vm.crosvm.extraArgs;
allExtraRunArgs = cfg.crosvm.extraRunArgs ++ vm.crosvm.extraRunArgs;
in
pkgs.writeShellScript "vmsilo-start-${vm.name}" ''
#!/bin/sh
set -e
${lib.optionalString (vm.pciDevices != [ ]) ''
# IOMMU group validation
check_iommu_group() {
local dev="$1"
local group_path="/sys/bus/pci/devices/$dev/iommu_group/devices"
if [ ! -d "$group_path" ]; then
echo "Error: IOMMU not enabled or device $dev not found" >&2
echo "Ensure IOMMU is enabled (intel_iommu=on or amd_iommu=on)" >&2
exit 1
fi
for peer in "$group_path"/*; do
peer_bdf=$(basename "$peer")
[ "$peer_bdf" = "$dev" ] && continue
# Check if peer is in our passthrough list
case "$peer_bdf" in
${lib.concatStringsSep "|" vmPciDevicePaths})
# Peer is being passed to this VM, OK
;;
*)
# Check if peer is unbound (no driver)
if [ -L "/sys/bus/pci/devices/$peer_bdf/driver" ]; then
peer_driver=$(basename "$(readlink "/sys/bus/pci/devices/$peer_bdf/driver")")
if [ "$peer_driver" != "vfio-pci" ]; then
# Check if peer is a PCI bridge (class 0x0604xx) - safe to leave bound
peer_class=$(cat "/sys/bus/pci/devices/$peer_bdf/class" 2>/dev/null || echo "")
case "$peer_class" in
0x0604*)
# PCI-to-PCI bridge, safe to leave bound to pcieport driver
;;
*)
echo "Error: Device $dev shares IOMMU group with $peer_bdf (bound to $peer_driver)" >&2
echo "All devices in an IOMMU group must be passed to the same VM or unbound" >&2
exit 1
;;
esac
fi
fi
;;
esac
done
}
# Check all PCI devices
for dev in ${lib.concatStringsSep " " vmPciDevicePaths}; do
check_iommu_group "$dev"
done
''}
# Clean up stale socket
rm -f /run/vmsilo/${vm.name}-crosvm-control.socket
exec ${cfg._internal.crosvm}/bin/crosvm \
--log-level=${effectiveLogLevel} \
--no-syslog \
--no-timestamps \
${lib.escapeShellArgs allExtraArgs} \
run \
--name ${vm.name} \
-m ${toString vm.memory} \
--hugepages \
--balloon-page-reporting \
--boost-uclamp \
--disable-virtio-intx \
--no-i8042 \
--no-rtc \
--s2idle \
${
if cfg.schedulerIsolation == "full" then
"--core-scheduling=true"
else if cfg.schedulerIsolation == "vm" then
"--per-vm-core-scheduling"
else
"--core-scheduling=false"
} \
--initrd=${initramfsPath} \
--serial=hardware=virtio-console,type=unix-stream,path=/run/vmsilo/${vm.name}-console-backend.socket,console,input-unix-stream,stream-non-blocking \
${formatBlockArg rootDiskConfig} \
${additionalDisksArgs} \
${ephemeralDiskArg} \
${lib.optionalString (rootfs != null) ''-p "init=${rootfs.config.system.build.toplevel}/init"''} \
-p "vmsilo.hostname=${vm.name}" \
${lib.concatStringsSep " \\\n " allNetworkKernelParams} \
${lib.optionalString vm.autoShutdown.enable ''
-p "autoShutdown.enable=1" \
-p "autoShutdown.after=${toString vm.autoShutdown.after}" \
''} \
${rootOverlayKernelParam} \
${lib.optionalString sharedHomeEnabled ''-p "systemd.mount-extra=home:/home/user:virtiofs:"''} \
${extraKernelParams} \
${virtiofsDirArgs} \
--cid ${toString vm.id} \
--cpus ${toString vm.cpus} \
${lib.optionalString (gpuConfig != null) "--gpu=context-types=${gpuContextTypes}"} \
${
lib.optionalString (
gpuConfig != null && gpuConfig.vulkan
) "--gpu-render-server=path=${pkgs.virglrenderer}/libexec/virgl_render_server"
} \
${lib.optionalString (effectiveSound != null) "--virtio-snd=${formatKVArgs "," effectiveSound}"} \
-s /run/vmsilo/${vm.name}-crosvm-control.socket \
${
lib.optionalString (gpuConfig != null)
"--wayland-security-context wayland_socket=${userRuntimeDir}/wayland-0,app_id=vmsilo:${vm.name}:${vm.color}"
} \
${vfioArgs} \
${networkArgs} \
${vhostUserArgs} \
${lib.escapeShellArgs allExtraRunArgs} \
${kernelPath}
'';
# Generate proxy script for a VM
mkProxyScript =
vm:
pkgs.writeShellScript "vmsilo-proxy-${vm.name}" ''
CID=${toString vm.id}
VSOCK_PORT=5000
TIMEOUT=30
# Wait for vsock to become available
ELAPSED=0
while [ $ELAPSED -lt $TIMEOUT ]; do
if ${pkgs.socat}/bin/socat -u OPEN:/dev/null VSOCK-CONNECT:$CID:$VSOCK_PORT 2>/dev/null; then
break
fi
sleep 0.5
ELAPSED=$((ELAPSED + 1))
done
if [ $ELAPSED -ge $TIMEOUT ]; then
echo "Timeout waiting for VM ${vm.name} to start" >&2
exit 1
fi
# Forward stdin/stdout to vsock
exec ${pkgs.socat}/bin/socat - VSOCK-CONNECT:$CID:$VSOCK_PORT
'';
# Generate shell case statement for VM dispatch
mkVmCase = makeCase: ''
case "$VM_NAME" in
${lib.concatMapStringsSep "\n " makeCase vms}
*)
echo "Unknown VM: $VM_NAME" >&2
echo "Available VMs: ${lib.concatMapStringsSep ", " (vm: vm.name) vms}" >&2
exit 1
;;
esac
'';
# vm-run: Run command in VM (socket-activated)
vmRunScript = pkgs.writeShellScript "vm-run" ''
if [ $# -lt 2 ]; then
echo "Usage: vm-run <vm-name> <command> [args...]" >&2
exit 1
fi
VM_NAME="$1"
shift
SOCKET="/run/vmsilo/$VM_NAME-command.socket"
if [ ! -S "$SOCKET" ]; then
echo "Unknown VM or socket not active: $VM_NAME" >&2
echo "Available VMs: ${lib.concatMapStringsSep ", " (vm: vm.name) vms}" >&2
exit 1
fi
# Send command via socket (triggers activation if needed)
echo "$@" | ${pkgs.socat}/bin/socat - UNIX-CONNECT:"$SOCKET"
'';
# vm-start-debug: Start VM directly (bypasses socket activation, requires root)
vmStartDebugScript = pkgs.writeShellScript "vm-start-debug" ''
if [ $# -ne 1 ]; then
echo "Usage: vm-start-debug <vm-name>" >&2
echo "Note: Requires root privileges (use sudo)" >&2
exit 1
fi
if [ "$(id -u)" -ne 0 ]; then
echo "Error: vm-start-debug requires root privileges" >&2
echo "Run: sudo vm-start-debug $1" >&2
exit 1
fi
VM_NAME="$1"
${mkVmCase (vm: "${vm.name}) exec ${mkVmScript vm} ;;")}
'';
# vm-start: Start VM via systemd (uses polkit for authorization)
vmStartScript = pkgs.writeShellScript "vm-start" ''
if [ $# -ne 1 ]; then
echo "Usage: vm-start <vm-name>" >&2
exit 1
fi
VM_NAME="$1"
${mkVmCase (vm: "${vm.name}) systemctl start vmsilo-${vm.name}-vm.service ;;")}
'';
# vm-stop: Stop VM via systemd (uses polkit for authorization)
vmStopScript = pkgs.writeShellScript "vm-stop" ''
if [ $# -ne 1 ]; then
echo "Usage: vm-stop <vm-name>" >&2
exit 1
fi
VM_NAME="$1"
${mkVmCase (vm: "${vm.name}) systemctl stop vmsilo-${vm.name}-vm.service ;;")}
'';
# vm-shell: Connect to VM (serial console by default, SSH with --ssh)
vmShellScript = pkgs.writeShellScript "vm-shell" ''
usage() {
echo "Usage: vm-shell [--ssh [--root]] <vm-name>" >&2
echo "" >&2
echo "Options:" >&2
echo " --ssh Use SSH over vsock (requires SSH keys configured)" >&2
echo " --root Connect as root (only with --ssh)" >&2
echo "" >&2
echo "Without --ssh, connects to serial console." >&2
echo "Escape character is CTRL+]" >&2
exit 1
}
USE_SSH=0
USE_ROOT=0
while [ $# -gt 0 ]; do
case "$1" in
--ssh)
USE_SSH=1
shift
;;
--root)
USE_ROOT=1
shift
;;
-*)
usage
;;
*)
break
;;
esac
done
if [ $# -ne 1 ]; then
usage
fi
VM_NAME="$1"
if [ $USE_ROOT -eq 1 ] && [ $USE_SSH -eq 0 ]; then
echo "Error: --root requires --ssh" >&2
exit 1
fi
if [ $USE_SSH -eq 1 ]; then
if [ $USE_ROOT -eq 1 ]; then
USER_NAME="root"
else
USER_NAME="user"
fi
${mkVmCase (vm: "${vm.name}) exec ${pkgs.openssh}/bin/ssh $USER_NAME@vsock/${toString vm.id} ;;")}
else
CONSOLE="/run/vmsilo/$VM_NAME-console"
if [ ! -e "$CONSOLE" ]; then
echo "Console not found: $CONSOLE" >&2
echo "Is the VM running? Use: vm-start $VM_NAME" >&2
exit 1
fi
echo "Escape character is CTRL+]"
exec ${pkgs.socat}/bin/socat -,raw,echo=0,escape=0x1d "$CONSOLE"
fi
'';
in
{
config = lib.mkIf cfg.enable {
# Set internal options for other modules to consume
programs.vmsilo._internal = {
vmScripts = lib.listToAttrs (map (vm: lib.nameValuePair vm.name (mkVmScript vm)) vms);
proxyScripts = lib.listToAttrs (map (vm: lib.nameValuePair vm.name (mkProxyScript vm)) vms);
userScripts = {
vm-run = vmRunScript;
vm-start = vmStartScript;
vm-start-debug = vmStartDebugScript;
vm-stop = vmStopScript;
vm-shell = vmShellScript;
};
};
};
}