fix: add missing syscalls to GPU device seccomp allowlist

The allowlist was derived from crosvm's gpu_common + gpu_device seccomp
policies, but those are applied after process startup. systemd applies
the filter before exec, so process lifecycle (execve, wait4, arch_prctl,
set_tid_address), process identity (getpgrp, getppid, getresuid,
getresgid), capability management (capget, capset), and socket server
(bind, listen, accept4, socketpair) syscalls are also needed.

Also create a shader cache directory at /run/vmsilo/<name>/gpu/shader-cache
and set __GL_SHADER_DISK_CACHE_PATH so the GPU device backend doesn't fail
trying to create /home for shader cache in the sandboxed mount namespace.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Davíð Steinn Geirsson 2026-03-18 18:54:11 +00:00
parent e9495f043d
commit 21e0a68023
2 changed files with 34 additions and 16 deletions

View file

@ -164,24 +164,26 @@ RestrictNamespaces = yes
**Seccomp allowlist** (from crosvm `gpu_common.policy` + `gpu_device.policy`):
```
access brk clock_gettime clock_nanosleep clone clone3 close connect
dup dup2 epoll_create1 epoll_ctl epoll_pwait epoll_wait eventfd2
exit exit_group fallocate fcntl flock fstat fstatfs ftruncate futex
getcwd getdents getdents64 getegid geteuid getgid getpid getrandom
getsockopt gettid gettimeofday getuid inotify_add_watch inotify_init1
inotify_rm_watch io_uring_enter io_uring_register io_uring_setup ioctl
kcmp kill lseek lstat madvise membarrier memfd_create mkdir mknodat
mmap mprotect mremap munmap nanosleep newfstatat open openat pipe2
poll ppoll prctl pread64 prlimit64 read readlink readlinkat readv
recvfrom recvmsg rename restart_syscall rseq rt_sigaction rt_sigprocmask
rt_sigreturn sched_get_priority_max sched_get_priority_min
sched_getaffinity sched_setaffinity sched_setscheduler sched_yield
sendmmsg sendmsg sendto set_robust_list setpriority setsockopt shutdown
sigaltstack socket stat statfs statx sysinfo tgkill uname unlink
unlinkat userfaultfd write writev
accept4 access arch_prctl bind brk capget capset clock_gettime
clock_nanosleep clone clone3 close connect dup dup2 epoll_create1
epoll_ctl epoll_pwait epoll_wait eventfd2 execve exit exit_group
fallocate fcntl flock fstat fstatfs ftruncate futex getcwd getdents
getdents64 getegid geteuid getgid getpgrp getpid getppid getrandom
getresgid getresuid getsockopt gettid gettimeofday getuid
inotify_add_watch inotify_init1 inotify_rm_watch io_uring_enter
io_uring_register io_uring_setup ioctl kcmp kill listen lseek lstat
madvise membarrier memfd_create mkdir mknodat mmap mprotect mremap
munmap nanosleep newfstatat open openat pipe2 poll ppoll prctl pread64
prlimit64 read readlink readlinkat readv recvfrom recvmsg rename
restart_syscall rseq rt_sigaction rt_sigprocmask rt_sigreturn
sched_get_priority_max sched_get_priority_min sched_getaffinity
sched_setaffinity sched_setscheduler sched_yield sendmmsg sendmsg
sendto set_robust_list set_tid_address setpriority setsockopt shutdown
sigaltstack socket socketpair stat statfs statx sysinfo tgkill uname
unlink unlinkat userfaultfd wait4 write writev
```
Note: systemd cannot do argument-level filtering like crosvm's seccomp policies (e.g., restricting `socket()` to AF_UNIX, `clone()` to CLONE_THREAD, `mmap` to specific PROT flags). The syscall set is identical but without argument restrictions.
Note: This allowlist is a superset of crosvm's `gpu_common.policy` + `gpu_device.policy` — it additionally includes process lifecycle syscalls (`execve`, `wait4`, `arch_prctl`, `set_tid_address`), process identity queries (`getpgrp`, `getppid`, `getresuid`, `getresgid`), capability management (`capget`, `capset`), and socket server syscalls (`bind`, `listen`, `accept4`, `socketpair`) that crosvm's own seccomp policy doesn't need (crosvm applies its policy after startup; systemd applies the filter before exec). systemd also cannot do argument-level filtering like crosvm's seccomp policies (e.g., restricting `socket()` to AF_UNIX, `clone()` to CLONE_THREAD).
**Service dependencies:**
- `After=vmsilo-<vmname>-wayland-seccontext.service`

View file

@ -58,8 +58,13 @@ let
vm.gpu;
gpuSyscallAllowlist = [
"accept4"
"access"
"arch_prctl"
"bind"
"brk"
"capget"
"capset"
"clock_gettime"
"clock_nanosleep"
"clone"
@ -73,6 +78,7 @@ let
"epoll_pwait"
"epoll_wait"
"eventfd2"
"execve"
"exit"
"exit_group"
"fallocate"
@ -88,8 +94,12 @@ let
"getegid"
"geteuid"
"getgid"
"getpgrp"
"getpid"
"getppid"
"getrandom"
"getresgid"
"getresuid"
"getsockopt"
"gettid"
"gettimeofday"
@ -103,6 +113,7 @@ let
"ioctl"
"kcmp"
"kill"
"listen"
"lseek"
"lstat"
"madvise"
@ -146,11 +157,13 @@ let
"sendmsg"
"sendto"
"set_robust_list"
"set_tid_address"
"setpriority"
"setsockopt"
"shutdown"
"sigaltstack"
"socket"
"socketpair"
"stat"
"statfs"
"statx"
@ -160,6 +173,7 @@ let
"unlink"
"unlinkat"
"userfaultfd"
"wait4"
"write"
"writev"
];
@ -252,6 +266,7 @@ in
${pkgs.coreutils}/bin/install -d -m 0755 -o ${toString userUid} -g ${toString userGid} \
/run/vmsilo/${vm.name} \
/run/vmsilo/${vm.name}/gpu \
/run/vmsilo/${vm.name}/gpu/shader-cache \
/run/vmsilo/${vm.name}/sound
${pkgs.coreutils}/bin/install -d -m 0755 \
/run/vmsilo/${vm.name}/virtiofs
@ -571,6 +586,7 @@ in
environment = {
LD_LIBRARY_PATH = "${pkgs.vulkan-loader}/lib";
__GL_SHADER_DISK_CACHE_PATH = "/run/vmsilo/${vm.name}/gpu/shader-cache";
};
serviceConfig = {