vmsilo/vm-switch/src/child/process.rs
Davíð Steinn Geirsson 6941d2fe4c feat(vm-switch): add process isolation with namespace sandbox and seccomp
Replace the thread-based vhost-user backend architecture with a
fork-based process model where each VM gets its own child process.
This enables strong isolation between VMs handling untrusted network
traffic, with multiple layers of defense in depth.

Process model:
- Main process watches config directory and orchestrates child lifecycle
- One child process forked per VM, running as vhost-user net backend
- Children communicate via SOCK_SEQPACKET control channel with SCM_RIGHTS
- Automatic child restart on crash/disconnect, with peer notification
- Ping/pong heartbeat monitoring for worker health (1s interval, 100ms timeout)
- SIGCHLD handling integrated into tokio event loop
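
As a rough sketch of that spawn path (assuming a recent nix crate, 0.27+,
where socketpair and pipe return owned fds; spawn_worker and run_worker are
illustrative stand-ins, not the real orchestrator or child entry points):

    use std::os::fd::OwnedFd;

    use nix::sys::socket::{socketpair, AddressFamily, SockFlag, SockType};
    use nix::unistd::{fork, ForkResult};

    /// Hypothetical stand-in for the real child entry point
    /// (run_child_process in this file).
    fn run_worker(control: OwnedFd) -> ! {
        let _ = control;
        std::process::exit(0)
    }

    fn spawn_worker() -> nix::Result<()> {
        // SOCK_SEQPACKET preserves message boundaries, so each control
        // message arrives whole, and SCM_RIGHTS ancillary data (memfds,
        // eventfds) rides alongside the message that describes it.
        let (main_end, child_end) = socketpair(
            AddressFamily::Unix,
            SockType::SeqPacket,
            None,
            SockFlag::empty(),
        )?;
        match unsafe { fork() }? {
            ForkResult::Parent { child } => {
                drop(child_end);
                // Main keeps main_end for ping/pong and buffer exchange,
                // and the pid for SIGCHLD-driven restart bookkeeping.
                let _ = (main_end, child);
                Ok(())
            }
            ForkResult::Child => {
                drop(main_end);
                run_worker(child_end)
            }
        }
    }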

Inter-process packet forwarding:
- Lock-free SPSC ring buffers in shared memory (memfd + mmap)
- 64-slot rings (~598KB each) with atomic head/tail, no locks in datapath
- Eventfd signaling for empty-to-non-empty transitions
- Main orchestrates buffer exchange: GetBuffer -> BufferReady -> PutBuffer
- Zero-copy path: producers write directly into consumer's shared memory
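
The ring discipline, as an in-process sketch (illustrative, not the actual
ring module: the real slots live in a memfd mapping shared across processes,
and FRAME below is a guessed per-slot size):

    use std::cell::UnsafeCell;
    use std::sync::atomic::{AtomicUsize, Ordering};

    const SLOTS: usize = 64;   // power of two, so masking replaces modulo
    const FRAME: usize = 9216; // illustrative per-slot frame capacity

    struct SpscRing {
        head: AtomicUsize, // next slot to read; advanced only by the consumer
        tail: AtomicUsize, // next slot to write; advanced only by the producer
        slots: [UnsafeCell<[u8; FRAME]>; SLOTS],
    }

    // Sound only under the SPSC contract: exactly one producer and one
    // consumer, and each index is written by a single side.
    unsafe impl Sync for SpscRing {}

    impl SpscRing {
        /// Producer side. None = ring full; Some(was_empty) tells the
        /// caller whether to signal the consumer's eventfd.
        fn try_push(&self, frame: &[u8]) -> Option<bool> {
            debug_assert!(frame.len() <= FRAME);
            let tail = self.tail.load(Ordering::Relaxed);
            let head = self.head.load(Ordering::Acquire);
            if tail.wrapping_sub(head) == SLOTS {
                return None; // full
            }
            let was_empty = tail == head;
            // The slot is producer-exclusive until the Release store
            // below publishes it to the consumer.
            unsafe {
                (*self.slots[tail & (SLOTS - 1)].get())[..frame.len()]
                    .copy_from_slice(frame);
            }
            self.tail.store(tail.wrapping_add(1), Ordering::Release);
            Some(was_empty)
        }

        /// Consumer side (frame-length handling elided; real slots would
        /// carry a length header).
        fn try_pop(&self, out: &mut [u8; FRAME]) -> bool {
            let head = self.head.load(Ordering::Relaxed);
            let tail = self.tail.load(Ordering::Acquire);
            if head == tail {
                return false; // empty
            }
            unsafe { out.copy_from_slice(&*self.slots[head & (SLOTS - 1)].get()) };
            self.head.store(head.wrapping_add(1), Ordering::Release);
            true
        }
    }

The Some(was_empty) result is what drives the eventfd: the producer signals
only on the empty-to-non-empty transition, so the consumer is woken exactly
when new work appears rather than on every frame.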

Namespace sandbox (applied before tokio starts, while the process is still single-threaded):
- User namespace: unprivileged outside, UID 0 inside
- PID namespace: main is PID 1, children invisible to host
- Mount namespace: minimal tmpfs root with /config, /dev, /proc, /tmp
- IPC namespace: isolated System V IPC
- Network namespace: empty, communication only via inherited FDs
- Controllable via --no-sandbox flag
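
Entering the sandbox is roughly the following (a sketch using the nix crate;
enter_sandbox is a hypothetical name, and the real code goes on to build the
tmpfs root and its mounts before anything else runs):

    use std::fs;

    use nix::sched::{unshare, CloneFlags};
    use nix::unistd::{getgid, getuid};

    fn enter_sandbox() -> Result<(), Box<dyn std::error::Error>> {
        let (uid, gid) = (getuid(), getgid());
        // Must run while single-threaded: unshare of the user and mount
        // namespaces fails in a multi-threaded process, hence "before
        // tokio" above.
        unshare(
            CloneFlags::CLONE_NEWUSER
                | CloneFlags::CLONE_NEWPID
                | CloneFlags::CLONE_NEWNS
                | CloneFlags::CLONE_NEWIPC
                | CloneFlags::CLONE_NEWNET,
        )?;
        // Map UID/GID 0 inside the namespace to the unprivileged IDs
        // outside; setgroups must be denied before writing gid_map.
        fs::write("/proc/self/setgroups", "deny")?;
        fs::write("/proc/self/gid_map", format!("0 {} 1", gid))?;
        fs::write("/proc/self/uid_map", format!("0 {} 1", uid))?;
        // CLONE_NEWPID only takes effect for children: the next fork()
        // produces PID 1 of the new namespace.
        Ok(())
    }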

Seccomp BPF filtering (two-tier whitelist):
- Main filter: allows fork, socket creation, inotify, openat
- Child filter: strict subset - no fork, no socket, no file open
- Child filter applied after vhost setup, before event loop
- Modes: kill (default), trap (SIGSYS debug), log, disabled
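
In the same spirit, a two-tier-style allowlist can be expressed with the
rust-vmm seccompiler crate plus libc for syscall numbers (illustrative only:
the commit's crate::seccomp module and its exact syscall lists may be built
differently):

    use std::collections::BTreeMap;
    use std::convert::TryInto;

    use seccompiler::{apply_filter, BpfProgram, SeccompAction, SeccompFilter, TargetArch};

    fn apply_child_allowlist(kill: bool) -> Result<(), Box<dyn std::error::Error>> {
        // Guessed subset of a child-tier filter; note what is absent:
        // fork/clone, socket, openat.
        let rules: BTreeMap<i64, Vec<seccompiler::SeccompRule>> = [
            libc::SYS_read,
            libc::SYS_write,
            libc::SYS_ppoll,
            libc::SYS_recvmsg,
            libc::SYS_sendmsg,
            libc::SYS_futex,
            libc::SYS_exit_group,
        ]
        .into_iter()
        .map(|nr| (nr, vec![])) // an empty rule list means "always allow"
        .collect();

        let arch: TargetArch = std::env::consts::ARCH
            .try_into()
            .map_err(|_| "unsupported arch")?;
        let filter = SeccompFilter::new(
            rules,
            // The mismatch action selects the mode: kill vs trap (SIGSYS).
            if kill { SeccompAction::KillProcess } else { SeccompAction::Trap },
            SeccompAction::Allow, // action for allowed syscalls
            arch,
        )?;
        let program: BpfProgram = filter.try_into()?;
        apply_filter(&program)?; // installs the BPF program on this thread
        Ok(())
    }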

Also adds vm-switch service dependencies to VM units in the NixOS
module so VMs wait for their network switch before starting.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 20:19:26 +00:00

//! Child process main loop.

use std::os::fd::{AsRawFd, OwnedFd, RawFd};
use std::os::unix::net::UnixListener;
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::thread;

use nix::unistd::pipe;
use tracing::{debug, error, info, warn};
use vhost_user_backend::VhostUserDaemon;
use vm_memory::{GuestMemoryAtomic, GuestMemoryMmap};

use crate::control::{ChildToMain, ControlChannel, ControlError, MainToChild};
use crate::mac::Mac;
use crate::ring::{Consumer, Producer};
use crate::seccomp::{apply_child_seccomp, SeccompMode};

use super::forwarder::PacketForwarder;
use super::poll::{poll_events, PollResult};
use super::vhost::ChildVhostBackend;

/// Run the child process.
///
/// This is the entry point after fork(). Does not return.
pub fn run_child_process(
    vm_name: &str,
    mac: Mac,
    control_fd: OwnedFd,
    socket_path: &Path,
    seccomp_mode: SeccompMode,
) -> ! {
    // Set process name for log prefix before any logging
    crate::args::set_process_name(format!("worker-{}", vm_name));
    info!(vm = %vm_name, mac = %mac, socket = ?socket_path, "child starting");

    // Reconstruct control channel from owned fd
    let control = ControlChannel::from_fd(control_fd);

    // Send Ready to main
    let msg = ChildToMain::Ready;
    if let Err(e) = control.send(&msg) {
        error!(vm = %vm_name, error = %e, "failed to send Ready");
        std::process::exit(1)
    }
    debug!("control: worker-{} -> main Ready", vm_name);

    // Create packet forwarder
    let forwarder = Arc::new(Mutex::new(PacketForwarder::new(mac)));

    // Create vhost backend
    let backend = ChildVhostBackend::new(vm_name.to_string(), mac);

    // Set TX callback
    let fwd = Arc::clone(&forwarder);
    backend.set_tx_callback(Box::new(move |frame| {
        fwd.lock().unwrap().forward_tx(frame);
    }));

    // Create vhost socket
    if socket_path.exists() {
        let _ = std::fs::remove_file(socket_path);
    }
    let listener = match UnixListener::bind(socket_path) {
        Ok(l) => l,
        Err(e) => {
            error!(vm = %vm_name, error = %e, "failed to bind socket");
            std::process::exit(1)
        }
    };
    let _ = listener.set_nonblocking(true);

    // Start vhost daemon thread
    let mem = GuestMemoryAtomic::new(GuestMemoryMmap::<()>::new());
    let mut daemon = match VhostUserDaemon::new(vm_name.to_string(), backend.clone(), mem) {
        Ok(d) => d,
        Err(e) => {
            error!(vm = %vm_name, error = %e, "failed to create daemon");
            std::process::exit(1)
        }
    };

    // Create pipe to detect daemon thread exit. The write end is moved into
    // the daemon thread; when the thread exits for any reason, the write end
    // is dropped, causing POLLHUP on the read end.
    let (pipe_rd, pipe_wr) = match pipe() {
        Ok((rd, wr)) => (rd, wr),
        Err(e) => {
            error!(vm = %vm_name, error = %e, "failed to create pipe");
            std::process::exit(1)
        }
    };

    let vhost_listener = vhost::vhost_user::Listener::from(listener);
    let name = vm_name.to_string();
    thread::spawn(move || {
        let _pipe_wr = pipe_wr; // dropped on thread exit → POLLHUP on read end
        let mut l = vhost_listener;
        if let Err(e) = daemon.start(&mut l) {
            warn!(vm = %name, error = %e, "daemon start failed");
            return;
        }
        if let Err(e) = daemon.wait() {
            debug!(vm = %name, error = %e, "daemon wait returned error");
        }
    });

    // Apply seccomp filter now that setup is complete
    // (socket created, thread spawned, signals configured)
    if let Err(e) = apply_child_seccomp(seccomp_mode) {
        error!(vm = %vm_name, error = %e, "failed to apply seccomp");
        std::process::exit(1);
    }
    if seccomp_mode != SeccompMode::Disabled {
        debug!(vm = %vm_name, mode = ?seccomp_mode, "seccomp filter applied");
    }

    // Main event loop
    let daemon_exit_fd = pipe_rd.as_raw_fd();
    match event_loop(vm_name, control, forwarder, backend, daemon_exit_fd) {
        Ok(()) => {
            info!(vm = %vm_name, "exiting normally");
            std::process::exit(0)
        }
        Err(e) => {
            error!(vm = %vm_name, error = %e, "exiting with error");
            std::process::exit(1)
        }
    }
}

fn event_loop(
    vm_name: &str,
    control: ControlChannel,
    forwarder: Arc<Mutex<PacketForwarder>>,
    backend: Arc<ChildVhostBackend>,
    daemon_exit_fd: RawFd,
) -> Result<(), ControlError> {
    let control_fd = control.as_raw_fd();
    loop {
        let ingress_fds = forwarder.lock().unwrap().ingress_eventfds();
        match poll_events(control_fd, Some(daemon_exit_fd), &ingress_fds, 100) {
            PollResult::TxKick => {
                // Daemon thread exited (pipe write end closed → POLLHUP)
                info!(vm = %vm_name, "vhost daemon exited, shutting down");
                return Ok(());
            }
            PollResult::Control => {
                let (msg, fds) = match control.recv_with_fds_typed() {
                    Ok(r) => r,
                    Err(ControlError::Closed) => {
                        debug!(vm = %vm_name, "control closed");
                        return Ok(());
                    }
                    Err(e) => return Err(e),
                };
                match msg {
                    MainToChild::GetBuffer { peer_name, peer_mac } => {
                        debug!(
                            "control: main -> worker-{} GetBuffer({}, {})",
                            vm_name, peer_name, Mac::from_bytes(peer_mac)
                        );
                        // Create ingress buffer (we are Consumer)
                        match Consumer::new() {
                            Ok(consumer) => {
                                let response_fds = [
                                    consumer.memfd().as_raw_fd(),
                                    consumer.eventfd().as_raw_fd(),
                                ];
                                let response = ChildToMain::BufferReady {
                                    peer_name: peer_name.clone(),
                                };
                                if let Err(e) = control.send_with_fds_typed(&response, &response_fds) {
                                    warn!(vm = %vm_name, error = %e, "failed to send BufferReady");
                                } else {
                                    debug!(
                                        "control: worker-{} -> main BufferReady({})",
                                        vm_name, peer_name
                                    );
                                    forwarder.lock().unwrap().add_ingress(peer_name, peer_mac, consumer);
                                }
                            }
                            Err(e) => warn!(vm = %vm_name, error = %e, "failed to create ingress buffer"),
                        }
                    }
                    MainToChild::PutBuffer { peer_name, peer_mac, broadcast } => {
                        debug!(
                            "control: main -> worker-{} PutBuffer({}, {}, broadcast={})",
                            vm_name, peer_name, Mac::from_bytes(peer_mac), broadcast
                        );
                        if fds.len() == 2 {
                            let mut fds = fds.into_iter();
                            match Producer::from_fds(fds.next().unwrap(), fds.next().unwrap()) {
                                Ok(producer) => {
                                    forwarder.lock().unwrap().add_egress(
                                        peer_name,
                                        peer_mac,
                                        producer,
                                        broadcast,
                                    );
                                }
                                Err(e) => warn!(vm = %vm_name, error = %e, "failed to map egress buffer"),
                            }
                        } else {
                            warn!(vm = %vm_name, "PutBuffer with wrong number of FDs: {}", fds.len());
                        }
                    }
                    MainToChild::RemovePeer { peer_name } => {
                        debug!("control: main -> worker-{} RemovePeer({})", vm_name, peer_name);
                        forwarder.lock().unwrap().remove_peer(&peer_name);
                    }
                    MainToChild::Ping => {
                        control.send(&ChildToMain::Pong)?;
                    }
                }
            }
            PollResult::Ingress(_) | PollResult::Timeout => {
                let frames = forwarder.lock().unwrap().poll_ingress();
                for frame in frames {
                    if !backend.inject_rx_frame(&frame) {
                        debug!(vm = %vm_name, "RX inject failed (queue full)");
                    }
                }
            }
            PollResult::Error(e) => {
                warn!(vm = %vm_name, error = ?e, "poll error");
            }
        }
    }
}