Replace the thread-based vhost-user backend architecture with a fork-based process model where each VM gets its own child process. This enables strong isolation between VMs handling untrusted network traffic, with multiple layers of defense in depth.

Process model:
- Main process watches config directory and orchestrates child lifecycle
- One child process forked per VM, running as vhost-user net backend
- Children communicate via SOCK_SEQPACKET control channel with SCM_RIGHTS
- Automatic child restart on crash/disconnect, with peer notification
- Ping/pong heartbeat monitoring for worker health (1s interval, 100ms timeout)
- SIGCHLD handling integrated into tokio event loop

Inter-process packet forwarding (sketched below):
- Lock-free SPSC ring buffers in shared memory (memfd + mmap)
- 64-slot rings (~598KB each) with atomic head/tail, no locks in datapath
- Eventfd signaling for empty-to-non-empty transitions
- Main orchestrates buffer exchange: GetBuffer -> BufferReady -> PutBuffer
- Zero-copy path: producers write directly into consumer's shared memory

Namespace sandbox (applied before tokio, single-threaded):
- User namespace: unprivileged outside, UID 0 inside
- PID namespace: main is PID 1, children invisible to host
- Mount namespace: minimal tmpfs root with /config, /dev, /proc, /tmp
- IPC namespace: isolated System V IPC
- Network namespace: empty, communication only via inherited FDs
- Controllable via --no-sandbox flag

Seccomp BPF filtering (two-tier whitelist):
- Main filter: allows fork, socket creation, inotify, openat
- Child filter: strict subset - no fork, no socket, no file open
- Child filter applied after vhost setup, before event loop
- Modes: kill (default), trap (SIGSYS debug), log, disabled

Also adds vm-switch service dependencies to VM units in the NixOS module so VMs wait for their network switch before starting.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
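The forwarding bullets above compress a lot, so here is a minimal, illustrative sketch of the SPSC ring they describe. The slot capacity, field names, and push/pop signatures are assumptions for illustration only; the real crate::ring::Producer/Consumer types place this layout in a memfd-backed shared mapping and pair it with an eventfd that the producer signals whenever push reports an empty-to-non-empty transition.

    use std::cell::UnsafeCell;
    use std::sync::atomic::{AtomicUsize, Ordering};

    const SLOTS: usize = 64;        // 64-slot ring, as described above
    const FRAME_CAP: usize = 9216;  // hypothetical per-slot frame capacity

    struct Slot {
        len: UnsafeCell<usize>,
        data: UnsafeCell<[u8; FRAME_CAP]>,
    }

    /// Single-producer/single-consumer ring: the producer only writes `head`,
    /// the consumer only writes `tail`, so the datapath needs no locks.
    struct Ring {
        head: AtomicUsize, // next slot the producer will fill (monotonic)
        tail: AtomicUsize, // next slot the consumer will drain (monotonic)
        slots: [Slot; SLOTS],
    }

    unsafe impl Sync for Ring {}

    impl Ring {
        /// Producer side: copy one frame in. Returns Some(true) when the ring
        /// went from empty to non-empty (i.e. the consumer's eventfd should be
        /// signalled), Some(false) on a plain push, None when the frame is dropped.
        fn push(&self, frame: &[u8]) -> Option<bool> {
            let head = self.head.load(Ordering::Relaxed);
            let tail = self.tail.load(Ordering::Acquire);
            if head.wrapping_sub(tail) == SLOTS || frame.len() > FRAME_CAP {
                return None; // ring full or frame too large
            }
            let slot = &self.slots[head % SLOTS];
            // Safety: the consumer never reads this slot until `head` is
            // published with Release below.
            unsafe {
                (*slot.data.get())[..frame.len()].copy_from_slice(frame);
                *slot.len.get() = frame.len();
            }
            self.head.store(head.wrapping_add(1), Ordering::Release);
            Some(head == tail)
        }

        /// Consumer side: drain one frame if present.
        fn pop(&self, out: &mut Vec<u8>) -> bool {
            let tail = self.tail.load(Ordering::Relaxed);
            let head = self.head.load(Ordering::Acquire);
            if head == tail {
                return false; // empty
            }
            let slot = &self.slots[tail % SLOTS];
            // Safety: the producer never reuses this slot until `tail` is
            // advanced with Release below.
            unsafe {
                let len = *slot.len.get();
                out.extend_from_slice(&(*slot.data.get())[..len]);
            }
            self.tail.store(tail.wrapping_add(1), Ordering::Release);
            true
        }
    }

The Acquire/Release pairing on head and tail is what lets the two processes agree on slot ownership without any lock.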
239 lines
9 KiB
Rust
//! Child process main loop.

use std::os::fd::{AsRawFd, OwnedFd, RawFd};
use std::os::unix::net::UnixListener;
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::thread;

use nix::unistd::pipe;
use tracing::{debug, error, info, warn};
use vhost_user_backend::VhostUserDaemon;
use vm_memory::{GuestMemoryAtomic, GuestMemoryMmap};

use crate::control::{ChildToMain, ControlChannel, ControlError, MainToChild};
use crate::mac::Mac;
use crate::ring::{Consumer, Producer};
use crate::seccomp::{apply_child_seccomp, SeccompMode};

use super::forwarder::PacketForwarder;
use super::poll::{poll_events, PollResult};
use super::vhost::ChildVhostBackend;

/// Run the child process.
///
/// This is the entry point after fork(). Does not return.
pub fn run_child_process(
    vm_name: &str,
    mac: Mac,
    control_fd: OwnedFd,
    socket_path: &Path,
    seccomp_mode: SeccompMode,
) -> ! {
    // Set process name for log prefix before any logging
    crate::args::set_process_name(format!("worker-{}", vm_name));

    info!(vm = %vm_name, mac = %mac, socket = ?socket_path, "child starting");

    // Reconstruct control channel from owned fd
    let control = ControlChannel::from_fd(control_fd);

    // Send Ready to main
    let msg = ChildToMain::Ready;
    if let Err(e) = control.send(&msg) {
        error!(vm = %vm_name, error = %e, "failed to send Ready");
        std::process::exit(1)
    }
    debug!("control: worker-{} -> main Ready", vm_name);

    // Create packet forwarder
    let forwarder = Arc::new(Mutex::new(PacketForwarder::new(mac)));

    // Create vhost backend
    let backend = ChildVhostBackend::new(vm_name.to_string(), mac);

    // Set TX callback
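    // Frames the guest transmits are handed to the forwarder, which picks the
    // destination peer by MAC (or fans out broadcasts) and writes the frame
    // into that peer's shared-memory ring via the egress Producer.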
    let fwd = Arc::clone(&forwarder);
    backend.set_tx_callback(Box::new(move |frame| {
        fwd.lock().unwrap().forward_tx(frame);
    }));

    // Create vhost socket
    if socket_path.exists() {
        let _ = std::fs::remove_file(socket_path);
    }
    let listener = match UnixListener::bind(socket_path) {
        Ok(l) => l,
        Err(e) => {
            error!(vm = %vm_name, error = %e, "failed to bind socket");
            std::process::exit(1)
        }
    };
    let _ = listener.set_nonblocking(true);

    // Start vhost daemon thread
    let mem = GuestMemoryAtomic::new(GuestMemoryMmap::<()>::new());
    let mut daemon = match VhostUserDaemon::new(vm_name.to_string(), backend.clone(), mem) {
        Ok(d) => d,
        Err(e) => {
            error!(vm = %vm_name, error = %e, "failed to create daemon");
            std::process::exit(1)
        }
    };

    // Create pipe to detect daemon thread exit. The write end is moved into
    // the daemon thread; when the thread exits for any reason, the write end
    // is dropped, causing POLLHUP on the read end.
    let (pipe_rd, pipe_wr) = match pipe() {
        Ok((rd, wr)) => (rd, wr),
        Err(e) => {
            error!(vm = %vm_name, error = %e, "failed to create pipe");
            std::process::exit(1)
        }
    };

    let vhost_listener = vhost::vhost_user::Listener::from(listener);
    let name = vm_name.to_string();
    thread::spawn(move || {
        let _pipe_wr = pipe_wr; // dropped on thread exit → POLLHUP on read end
        let mut l = vhost_listener;
        if let Err(e) = daemon.start(&mut l) {
            warn!(vm = %name, error = %e, "daemon start failed");
            return;
        }
        if let Err(e) = daemon.wait() {
            debug!(vm = %name, error = %e, "daemon wait returned error");
        }
    });

    // Apply seccomp filter now that setup is complete
    // (socket created, thread spawned, signals configured)
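    // The child filter is a strict subset of the main filter: no fork, no
    // socket creation, no file open, so anything needing those syscalls must
    // already be done by this point.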
    if let Err(e) = apply_child_seccomp(seccomp_mode) {
        error!(vm = %vm_name, error = %e, "failed to apply seccomp");
        std::process::exit(1);
    }
    if seccomp_mode != SeccompMode::Disabled {
        debug!(vm = %vm_name, mode = ?seccomp_mode, "seccomp filter applied");
    }

    // Main event loop
    let daemon_exit_fd = pipe_rd.as_raw_fd();
    match event_loop(vm_name, control, forwarder, backend, daemon_exit_fd) {
        Ok(()) => {
            info!(vm = %vm_name, "exiting normally");
            std::process::exit(0)
        }
        Err(e) => {
            error!(vm = %vm_name, error = %e, "exiting with error");
            std::process::exit(1)
        }
    }
}

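/// Child event loop: multiplexes the control channel, the daemon-exit pipe,
/// and the per-peer ingress eventfds, waking at least every 100ms to drain
/// the ingress rings.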
fn event_loop(
    vm_name: &str,
    control: ControlChannel,
    forwarder: Arc<Mutex<PacketForwarder>>,
    backend: Arc<ChildVhostBackend>,
    daemon_exit_fd: RawFd,
) -> Result<(), ControlError> {
    let control_fd = control.as_raw_fd();

    loop {
        let ingress_fds = forwarder.lock().unwrap().ingress_eventfds();

        match poll_events(control_fd, Some(daemon_exit_fd), &ingress_fds, 100) {
            PollResult::TxKick => {
                // Daemon thread exited (pipe write end closed → POLLHUP)
                info!(vm = %vm_name, "vhost daemon exited, shutting down");
                return Ok(());
            }
            PollResult::Control => {
                let (msg, fds) = match control.recv_with_fds_typed() {
                    Ok(r) => r,
                    Err(ControlError::Closed) => {
                        debug!(vm = %vm_name, "control closed");
                        return Ok(());
                    }
                    Err(e) => return Err(e),
                };

                match msg {
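                    // Main asks us to allocate the ring a peer will send into.
                    // We create it as the Consumer and hand the memfd + eventfd
                    // back so main can relay them to that peer in a PutBuffer.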
                    MainToChild::GetBuffer { peer_name, peer_mac } => {
                        debug!(
                            "control: main -> worker-{} GetBuffer({}, {})",
                            vm_name, peer_name, Mac::from_bytes(peer_mac)
                        );

                        // Create ingress buffer (we are Consumer)
                        match Consumer::new() {
                            Ok(consumer) => {
                                let response_fds = [
                                    consumer.memfd().as_raw_fd(),
                                    consumer.eventfd().as_raw_fd(),
                                ];
                                let response = ChildToMain::BufferReady {
                                    peer_name: peer_name.clone(),
                                };
                                if let Err(e) = control.send_with_fds_typed(&response, &response_fds) {
                                    warn!(vm = %vm_name, error = %e, "failed to send BufferReady");
                                } else {
                                    debug!(
                                        "control: worker-{} -> main BufferReady({})",
                                        vm_name, peer_name
                                    );
                                    forwarder.lock().unwrap().add_ingress(peer_name, peer_mac, consumer);
                                }
                            }
                            Err(e) => warn!(vm = %vm_name, error = %e, "failed to create ingress buffer"),
                        }
                    }

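                    // Main relays a peer's ring as two FDs (memfd + eventfd,
                    // attached via SCM_RIGHTS); we map it as the Producer side
                    // and use it for egress towards that peer.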
                    MainToChild::PutBuffer { peer_name, peer_mac, broadcast } => {
                        debug!(
                            "control: main -> worker-{} PutBuffer({}, {}, broadcast={})",
                            vm_name, peer_name, Mac::from_bytes(peer_mac), broadcast
                        );

                        if fds.len() == 2 {
                            let mut fds = fds.into_iter();
                            match Producer::from_fds(fds.next().unwrap(), fds.next().unwrap()) {
                                Ok(producer) => {
                                    forwarder.lock().unwrap().add_egress(
                                        peer_name,
                                        peer_mac,
                                        producer,
                                        broadcast,
                                    );
                                }
                                Err(e) => warn!(vm = %vm_name, error = %e, "failed to map egress buffer"),
                            }
                        } else {
                            warn!(vm = %vm_name, "PutBuffer with wrong number of FDs: {}", fds.len());
                        }
                    }

                    MainToChild::RemovePeer { peer_name } => {
                        debug!("control: main -> worker-{} RemovePeer({})", vm_name, peer_name);
                        forwarder.lock().unwrap().remove_peer(&peer_name);
                    }

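                    // Heartbeat from main (sent every second); reply promptly so
                    // the worker is not flagged unhealthy (100ms pong timeout).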
                    MainToChild::Ping => {
                        control.send(&ChildToMain::Pong)?;
                    }
                }
            }
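            // A peer signalled its ingress eventfd, or the 100ms poll timeout
            // expired: drain the ingress rings and inject frames into the
            // guest's RX queue.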
            PollResult::Ingress(_) | PollResult::Timeout => {
                let frames = forwarder.lock().unwrap().poll_ingress();
                for frame in frames {
                    if !backend.inject_rx_frame(&frame) {
                        debug!(vm = %vm_name, "RX inject failed (queue full)");
                    }
                }
            }
            PollResult::Error(e) => {
                warn!(vm = %vm_name, error = ?e, "poll error");
            }
        }
    }
}