diff --git a/CLAUDE.md b/CLAUDE.md index f1299f1..fcf6d98 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -114,9 +114,10 @@ See README.md for full usage details and options. - **Session bind**: GPU-enabled VMs (default) are tied to the desktop session via per-VM systemd user services bound to `graphical-session.target`. For `autoStart` GPU VMs, the session-bind service also starts the VM on login. Non-GPU `autoStart` VMs start at `multi-user.target` (boot). - **Automatic DNS**: All VMs have `systemd-resolved` enabled by default (guest rootfs). Netvm VMs get `unbound` as a recursive resolver via `guestConfig` injection. Downstream VMs get nameserver kernel params pointing at their netvm's IP via `netvmInjections.nameservers`. VMs with `netvm = "host"` or no netvm need manual DNS config. - **GPU device backend**: `vmsilo--gpu` service runs the GPU device backend sandboxed; selectable via `gpu.backend` between `vhost-device-gpu` (default, vhost-device-gpu in rutabaga mode) and `crosvm` (crosvm device gpu). Both crosvm and cloud-hypervisor VMMs attach via vhost-user. `vmsilo--wayland-seccontext` must start first. GPU is enabled when any capability (`wayland`, `opengl`, `vulkan`) is true; `wayland` defaults true. Set `gpu.wayland = false` to disable. -- **Per-VM runtime dirs**: all sockets under `/run/vmsilo//` subdirectories (not flat). +- **Per-VM runtime dirs**: all sockets under `/run/vmsilo//` subdirectories (not flat). virtiofs instances get per-instance dirs at `/run/vmsilo//virtiofs-/`. - **USB passthrough**: usbip-over-vsock on port 5002. Guest runs `usbip-rs client listen`, host runs one `usbip-rs host connect` per device as `vmsilo--usb@.service`. Works with both crosvm and cloud-hypervisor. - **CH sandboxing**: CH VMs use NixOS confinement (chroot), PrivateUsers=identity, PrivateNetwork, PrivatePIDs, PrivateIPC, empty CapabilityBoundingSet. TAP FDs passed via `vmsilo-tap-open` + `ch-remote add-net`. 
All privileged operations in ExecStartPre=+/ExecStartPost=+/ExecStopPost=+. Gated by `cloud-hypervisor.disableSandbox`. +- **virtiofsd sandboxing**: virtiofsd has built-in sandboxing (`--sandbox=namespace`): creates mount/PID/network namespaces, does pivot_root, drops capabilities, and applies its own seccomp filter. The systemd unit adds non-overlapping hardening: IPC/UTS namespace isolation, seccomp-based protections (clock/modules/logs/personality), `SystemCallArchitectures=native`, `MemoryDenyWriteExecute`, and `LimitNOFILE=1048576`. No `CapabilityBoundingSet` or `NoNewPrivileges` — virtiofsd manages its own capabilities via capng and both interfere with that. Per-instance runtime dirs at `/run/vmsilo/<name>/virtiofs-<tag>/`. Gated by `virtiofs.disableSandbox`, which also passes `--sandbox none` to virtiofsd (`LimitNOFILE` is set regardless); seccomp controlled independently by `virtiofs.seccompPolicy`. ### Gotchas diff --git a/README.md b/README.md index aafe7ec..08c7342 100644 --- a/README.md +++ b/README.md @@ -194,6 +194,8 @@ There are a lot of configuration options but you don't really need to touch most | `sound.logLevel` | string | `"info"` | RUST_LOG level for the sound device service | | `sound.seccompPolicy` | `"enforcing"` or `"log"` | `"enforcing"` | Seccomp policy for sound device service. `"enforcing"` blocks unlisted syscalls; `"log"` only logs them. | | `sharedDirectories` | attrsOf submodule | `{}` | Shared directories via virtiofsd (keys are fs tags, see below) | +| `virtiofs.seccompPolicy` | `"enforcing"` or `"log"` | `"enforcing"` | Seccomp policy for virtiofsd instances. `"enforcing"` blocks unlisted syscalls; `"log"` only logs them. | +| `virtiofs.disableSandbox` | bool | `false` | Disable non-seccomp sandboxing for virtiofsd instances. Useful for debugging. 
| | `pciDevices` | list of attrsets | `[]` | PCI devices to passthrough (path + optional kv pairs) | | `usbDevices` | list of attrsets | `[]` | USB devices to passthrough (vendorId, productId, optional serial) | | `guestPrograms` | list of packages | `[]` | VM-specific packages | diff --git a/docs/superpowers/specs/2026-03-25-virtiofsd-sandboxing-design.md b/docs/superpowers/specs/2026-03-25-virtiofsd-sandboxing-design.md new file mode 100644 index 0000000..a3dfe16 --- /dev/null +++ b/docs/superpowers/specs/2026-03-25-virtiofsd-sandboxing-design.md @@ -0,0 +1,94 @@ +# virtiofsd Sandboxing Design + +## Overview + +Add sandboxing to virtiofsd services by combining virtiofsd's built-in sandbox (Linux namespaces, `pivot_root`, capability dropping, seccomp filter) with non-overlapping systemd-level hardening. Each virtiofsd instance is confined to only its exported directory and its own runtime directory. + +## New Options + +### `virtiofs.seccompPolicy` + +- Type: enum `"enforcing"` | `"log"` +- Default: `"enforcing"` +- Controls virtiofsd's built-in seccomp filter via the `--seccomp` CLI flag: + - `"enforcing"` → omit flag (virtiofsd defaults to `--seccomp kill`) + - `"log"` → `--seccomp log` (log violations without killing) +- Independent of `disableSandbox` — always applies. + +### `virtiofs.disableSandbox` + +- Type: bool +- Default: `false` +- When `false`: virtiofsd's built-in namespace sandbox plus the systemd-level hardening described below are applied to all virtiofsd instances for this VM. +- When `true`: all systemd-level sandboxing is skipped and virtiofsd is started with `--sandbox none`, disabling its built-in namespace sandbox as well. +- Description: "Disable non-seccomp sandboxing for virtiofsd instances. Seccomp is controlled separately by virtiofs.seccompPolicy." + +## Per-Instance Runtime Directories + +**Current**: All virtiofsd instances share `/run/vmsilo/<name>/virtiofs/`. + +**New**: Each instance gets its own directory: `/run/vmsilo/<name>/virtiofs-<tag>/`. 
+ +### Affected locations + +- **Prep service** (`services.nix`): Replace single `install -d -m 0755 /run/vmsilo/${vm.name}/virtiofs` with one `install` per share tag: `install -d -m 0755 /run/vmsilo/${vm.name}/virtiofs-${tag}`. +- **Socket path**: Changes from `/run/vmsilo/<name>/virtiofs/<tag>.socket` to `/run/vmsilo/<name>/virtiofs-<tag>/<tag>.socket`. +- **ExecStopPost cleanup**: Update path to `rm -f /run/vmsilo/${vm.name}/virtiofs-${tag}/${tag}.socket`. +- **crosvm vhost-user args** (`scripts.nix`): Update socket path in `--vhost-user` args. +- **cloud-hypervisor JSON config** (`vm-config.nix`): Update socket path in `fs` entries. +- **vhostUserSockets list**: Update the map to use new per-instance path. +- **`mkVirtiofsdCmd`** (`services.nix`): Update `--socket-path` argument to new per-instance path. Add `--seccomp` flag based on `seccompPolicy`, and `--sandbox none` when `disableSandbox` is set. + +## Sandboxing Configuration + +virtiofsd has built-in sandboxing (`--sandbox=namespace`): it creates mount/PID/network namespaces, does `pivot_root` into the shared directory, drops capabilities to a minimal set, and applies its own seccomp filter. The systemd unit adds only non-overlapping hardening, gated by `lib.optionalAttrs (!vm.virtiofs.disableSandbox)`. + +virtiofsd runs as root (no `User` directive). It manages its own capability dropping after sandbox setup. 
+ +### virtiofsd's built-in sandbox + +- **Mount namespace** (`CLONE_NEWNS`) + `pivot_root` into shared directory +- **PID namespace** (`CLONE_NEWPID`) +- **Network namespace** (`CLONE_NEWNET`) +- **Capability dropping** to: `CAP_CHOWN`, `CAP_DAC_OVERRIDE`, `CAP_FOWNER`, `CAP_FSETID`, `CAP_SETGID`, `CAP_SETUID`, `CAP_MKNOD`, `CAP_SETFCAP`, optionally `CAP_DAC_READ_SEARCH` +- **Seccomp filter** (allowlist of ~100 syscalls) +- **`RLIMIT_NOFILE`** raised to 1,000,000 + +### Systemd-level hardening (non-overlapping) + +| Directive | Value | Purpose | +|---|---|---| +| `LimitNOFILE` | `1048576` | Allow virtiofsd to set its desired fd limit | +| `PrivateIPC` | `true` | IPC namespace — virtiofsd doesn't isolate IPC | +| `ProtectHostname` | `true` | UTS namespace — virtiofsd doesn't isolate UTS | +| `ProtectClock` | `true` | Block clock modification syscalls | +| `ProtectKernelModules` | `true` | Block module load/unload | +| `ProtectKernelLogs` | `true` | Block kernel log access | +| `LockPersonality` | `true` | Prevent personality() changes | +| `SystemCallArchitectures` | `"native"` | Block non-native ABI | +| `MemoryDenyWriteExecute` | `true` | Block W+X memory mappings | +| `NoNewPrivileges` | not set | Omitted — interferes with virtiofsd's own capability management (see below) | +| `CapabilityBoundingSet` | not set | Omitted — interferes with virtiofsd's own capability management (see below) | + +`CapabilityBoundingSet` and `NoNewPrivileges` are deliberately left unset: virtiofsd manages its own capabilities via capng — it needs `CAP_SYS_ADMIN` for `unshare`/`mount`/`pivot_root` during sandbox setup and afterwards drops to the operational capabilities listed above — and both directives interfere with that. 
+ +Directives NOT applied (virtiofsd handles these internally or they are undone by `pivot_root`): +- No confinement/chroot — virtiofsd does `pivot_root` +- No `PrivateUsers`, `PrivateNetwork`, `PrivatePIDs` — virtiofsd creates these namespaces +- No `PrivateTmp`, `PrivateDevices`, `ProtectKernelTunables`, `ProtectControlGroups` — mount-based, undone by `pivot_root` +- No `RestrictNamespaces` — would block virtiofsd's `unshare()` +- No `BindPaths`/`BindReadOnlyPaths` — undone by `pivot_root` + +### sharedHome ExecStartPre + +The "home" tag's `createSharedHome` script needs host filesystem access to create the shared home directory from a template. Prefix with `+` so it runs outside the sandbox: `ExecStartPre = [ "+${createSharedHome}" ]`. + +## Seccomp + +Handled entirely by virtiofsd's built-in seccomp filter (defined in `seccomp.rs`), not by systemd's `SystemCallFilter`. + +The `virtiofs.seccompPolicy` option controls the `--seccomp <action>` argument on the virtiofsd command line: +- `"enforcing"` → omit flag (virtiofsd defaults to `--seccomp kill`) +- `"log"` → `--seccomp log` + +`disableSandbox` has no effect on seccomp — it is always controlled independently via `seccompPolicy`. 
diff --git a/modules/lib/vm-config.nix b/modules/lib/vm-config.nix index 8528b69..412885a 100644 --- a/modules/lib/vm-config.nix +++ b/modules/lib/vm-config.nix @@ -148,7 +148,7 @@ let vhostUserSockets = lib.optional gpuEnabled "/run/vmsilo/${vm.name}/gpu/gpu.socket" ++ lib.optional soundEnabled "/run/vmsilo/${vm.name}/sound/sound.socket" - ++ map (tag: "/run/vmsilo/${vm.name}/virtiofs/${tag}.socket") ( + ++ map (tag: "/run/vmsilo/${vm.name}/virtiofs-${tag}/${tag}.socket") ( builtins.attrNames effectiveSharedDirs ); @@ -392,7 +392,7 @@ let chFsEntries = map (tag: { tag = tag; - socket = "/run/vmsilo/${vm.name}/virtiofs/${tag}.socket"; + socket = "/run/vmsilo/${vm.name}/virtiofs-${tag}/${tag}.socket"; }) (builtins.attrNames effectiveSharedDirs); chDeviceEntries = map (dev: { diff --git a/modules/options.nix b/modules/options.nix index 9f216b7..1e59e67 100644 --- a/modules/options.nix +++ b/modules/options.nix @@ -501,6 +501,22 @@ let }; }; + virtiofs = { + seccompPolicy = lib.mkOption { + type = lib.types.enum [ + "enforcing" + "log" + ]; + default = "enforcing"; + description = "Seccomp policy for virtiofsd instances. Controls virtiofsd's built-in --seccomp flag. 'enforcing' kills on violation; 'log' only logs."; + }; + disableSandbox = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Disable non-seccomp sandboxing for virtiofsd instances. 
Seccomp is controlled separately by virtiofs.seccompPolicy."; + }; + }; + sharedDirectories = lib.mkOption { type = lib.types.attrsOf ( lib.types.submodule { diff --git a/modules/scripts.nix b/modules/scripts.nix index cc8a959..16c8ecb 100644 --- a/modules/scripts.nix +++ b/modules/scripts.nix @@ -52,7 +52,7 @@ let # virtiofsd vhost-user socket args virtiofsDirArgs = lib.concatMapStringsSep " " ( - tag: "--vhost-user type=fs,socket=/run/vmsilo/${vm.name}/virtiofs/${tag}.socket" + tag: "--vhost-user type=fs,socket=/run/vmsilo/${vm.name}/virtiofs-${tag}/${tag}.socket" ) (builtins.attrNames c.effectiveSharedDirs); # Kernel params wrapped with -p for crosvm CLI diff --git a/modules/services.nix b/modules/services.nix index 95d5033..abd7b52 100644 --- a/modules/services.nix +++ b/modules/services.nix @@ -83,8 +83,19 @@ let /run/vmsilo/${vm.name}/gpu \ /run/vmsilo/${vm.name}/gpu/shader-cache \ /run/vmsilo/${vm.name}/sound - ${pkgs.coreutils}/bin/install -d -m 0755 \ - /run/vmsilo/${vm.name}/virtiofs + ${lib.concatMapStringsSep "\n" + (tag: '' + ${pkgs.coreutils}/bin/install -d -m 0755 \ + /run/vmsilo/${vm.name}/virtiofs-${tag} + '') + ( + builtins.attrNames (mkEffectiveSharedDirs { + inherit (vm) sharedDirectories sharedHome; + vmName = vm.name; + inherit userUid userGid; + }) + ) + } ''; }; } @@ -308,7 +319,7 @@ let "${pkgs.virtiofsd}/bin/virtiofsd" "--shared-dir ${d.path}" "--tag ${tag}" - "--socket-path /run/vmsilo/${vm.name}/virtiofs/${tag}.socket" + "--socket-path /run/vmsilo/${vm.name}/virtiofs-${tag}/${tag}.socket" "--thread-pool-size ${toString d.threadPoolSize}" "--inode-file-handles=${d.inodeFileHandles}" "--cache ${d.cache}" @@ -327,6 +338,8 @@ let ++ lib.optional (d.gidMap != null) "--gid-map ${d.gidMap}" ++ lib.optional (d.translateUid != null) "--translate-uid ${d.translateUid}" ++ lib.optional (d.translateGid != null) "--translate-gid ${d.translateGid}" + ++ lib.optional (vm.virtiofs.seccompPolicy == "log") "--seccomp log" + ++ lib.optional 
vm.virtiofs.disableSandbox "--sandbox none" ); in lib.mapAttrsToList ( @@ -337,15 +350,32 @@ let before = [ "vmsilo-${vm.name}-vm.service" ]; requiredBy = [ "vmsilo-${vm.name}-vm.service" ]; bindsTo = [ "vmsilo-${vm.name}-vm.service" ]; + # virtiofsd has built-in sandboxing (--sandbox=namespace): creates mount/PID/network + # namespaces, does pivot_root into the shared directory, drops capabilities to a + # minimal set, and applies its own seccomp filter (--seccomp). We only add hardening + # that doesn't overlap: IPC/UTS namespace isolation and seccomp-based protections + # (clock/modules/logs/personality). No CapabilityBoundingSet or NoNewPrivileges — + # virtiofsd manages its own capabilities via capng and both interfere with that. serviceConfig = { Type = "simple"; ExecStart = mkVirtiofsdCmd tag dirConfig; ExecStopPost = pkgs.writeShellScript "cleanup-virtiofsd-${vm.name}-${tag}" '' - rm -f /run/vmsilo/${vm.name}/virtiofs/${tag}.socket + rm -f /run/vmsilo/${vm.name}/virtiofs-${tag}/${tag}.socket ''; + LimitNOFILE = "1048576"; } // lib.optionalAttrs (tag == "home" && sharedHomeEnabled) { - ExecStartPre = [ "${createSharedHome}" ]; + ExecStartPre = [ "+${createSharedHome}" ]; + } + // lib.optionalAttrs (!vm.virtiofs.disableSandbox) { + PrivateIPC = true; + ProtectHostname = true; + ProtectClock = true; + ProtectKernelModules = true; + ProtectKernelLogs = true; + LockPersonality = true; + SystemCallArchitectures = "native"; + MemoryDenyWriteExecute = true; }; } ) effectiveSharedDirs @@ -782,7 +812,6 @@ in "d /run/vmsilo/${vm.name} 0775 root ${config.users.users.${cfg.user}.group} -" "d /run/vmsilo/${vm.name}/gpu 0755 ${cfg.user} ${config.users.users.${cfg.user}.group} -" "d /run/vmsilo/${vm.name}/sound 0755 ${cfg.user} ${config.users.users.${cfg.user}.group} -" - "d /run/vmsilo/${vm.name}/virtiofs 0755 root root -" "d /var/lib/vmsilo/${vm.name} 0755 root root -" ]) allVms ++ lib.optionals anySharedHome [