Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 76 additions & 34 deletions src/guest/init/src/exec_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,52 +67,88 @@ const EXEC_CONTROL_FLUSH: &[u8] = b"flush";
/// match the host's `EXEC_FLUSH_ACK` in `runtime/src/grpc/exec.rs`.
const EXEC_FLUSH_ACK: &[u8] = b"flush-ack";

/// Run the exec server, listening on vsock port 4089.
/// A bound, listening exec-server socket — produced by [`bind_exec_server`] and
/// consumed by [`serve_exec_server`].
///
/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`.
/// On non-Linux platforms, this is a no-op (development stub).
pub fn run_exec_server() -> Result<(), Box<dyn std::error::Error>> {
info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT);
/// Splitting bind from serve lets guest-init bind the exec vsock port EARLY on
/// the main thread (pure socket/bind/listen syscalls — no thread spawn, so the
/// later single-threaded container `fork()` stays fork-safe) while the accept
/// loop runs afterwards in its own thread. Binding early fills the listen
/// backlog from the start of boot, so a host connect QUEUES instead of being
/// refused while the slower boot steps (network, container spawn) finish — this
/// removes the "Connection refused" / heartbeat race of issue #3. On non-Linux
/// this is an inert placeholder so callers stay platform-agnostic.
#[cfg(target_os = "linux")]
pub struct ExecListener(std::os::fd::OwnedFd);
#[cfg(not(target_os = "linux"))]
pub struct ExecListener;

/// Bind + listen the exec vsock socket (port 4089). Pure socket syscalls, safe
/// to call on the main thread before the container fork.
pub fn bind_exec_server() -> Result<ExecListener, Box<dyn std::error::Error>> {
#[cfg(target_os = "linux")]
{
run_vsock_server()?;
use nix::sys::socket::{
bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
};
use std::os::fd::AsRawFd;

let sock_fd = socket(
AddressFamily::Vsock,
SockType::Stream,
SockFlag::empty(),
None,
)?;

// Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on
// macOS — and so the forked container never inherits the listening socket.
unsafe {
libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
}

let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT);
bind(sock_fd.as_raw_fd(), &addr)?;
listen(&sock_fd, Backlog::new(4)?)?;

info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT);
Ok(ExecListener(sock_fd))
}

#[cfg(not(target_os = "linux"))]
{
info!("Exec server not available on non-Linux platform (development mode)");
Ok(ExecListener)
}

Ok(())
}

/// Linux vsock server implementation.
#[cfg(target_os = "linux")]
fn run_vsock_server() -> Result<(), Box<dyn std::error::Error>> {
use nix::sys::socket::{
accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
};
use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
use tracing::error;

let sock_fd = socket(
AddressFamily::Vsock,
SockType::Stream,
SockFlag::empty(),
None,
)?;
/// Run the exec accept loop on an already-bound listener. Intended to run on its
/// own thread for the VM's lifetime; never returns under normal operation.
pub fn serve_exec_server(listener: ExecListener) -> Result<(), Box<dyn std::error::Error>> {
#[cfg(target_os = "linux")]
{
run_accept_loop(listener.0)
}

// Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS
unsafe {
libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
#[cfg(not(target_os = "linux"))]
{
let _ = listener;
Ok(())
}
}

let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT);
bind(sock_fd.as_raw_fd(), &addr)?;
listen(&sock_fd, Backlog::new(4)?)?;
/// Bind then serve in one call. Kept for callers that don't need the early-bind
/// split (e.g. tests); guest-init's boot path uses `bind_*` + `serve_*` directly.
pub fn run_exec_server() -> Result<(), Box<dyn std::error::Error>> {
info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT);
serve_exec_server(bind_exec_server()?)
}

info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT);
/// The exec server accept loop.
#[cfg(target_os = "linux")]
fn run_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box<dyn std::error::Error>> {
use nix::sys::socket::accept;
use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
use tracing::error;

loop {
match accept(sock_fd.as_raw_fd()) {
Expand Down Expand Up @@ -715,8 +751,12 @@ fn execute_command(
Err(output) => return output,
};

let mut child = match command.spawn() {
Ok(child) => child,
// Spawn under the reaper registry: the pid is marked MANAGED before the PID 1
// supervision loop can see it, so the loop leaves this child for us to reap
// (and read its real exit code) instead of stealing it. The guard unregisters
// the pid when this function returns (all paths).
let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) {
Ok(pair) => pair,
Err(e) => {
return ExecOutput {
stdout: vec![],
Expand Down Expand Up @@ -1007,8 +1047,10 @@ fn execute_command_streaming(
}
};

let mut child = match command.spawn() {
Ok(child) => child,
// Spawn under the reaper registry (see one-shot path) so PID 1 leaves this
// streaming child for us to reap; the guard unregisters on return.
let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) {
Ok(pair) => pair,
Err(e) => {
let output = ExecOutput {
stdout: vec![],
Expand Down
1 change: 1 addition & 0 deletions src/guest/init/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub mod namespace;
pub mod network;
pub mod port_forward;
pub mod pty_server;
pub mod reaper;
pub mod user;

pub use namespace::{spawn_isolated, NamespaceConfig, NamespaceError};
Expand Down
132 changes: 101 additions & 31 deletions src/guest/init/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,11 @@ impl ExecConfig {
/// - BOX_EXEC_ENV_*: container environment variables
/// - BOX_EXEC_WORKDIR: working directory (defaults to "/")
fn from_env() -> Self {
let executable =
std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/sbin/init".to_string());
// The runtime always sets BOX_EXEC_EXEC when guest-init is PID 1
// (runtime/src/vm/spec.rs), so this default is only a defensive fallback.
// Use /bin/sh — universal across distros — never /sbin/init, which does
// not exist on Alpine and was the original cause of issue #3.
let executable = std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/bin/sh".to_string());

// Parse args from individual env vars (BOX_EXEC_ARGC + BOX_EXEC_ARG_0..N)
let args: Vec<String> = match std::env::var("BOX_EXEC_ARGC")
Expand Down Expand Up @@ -268,6 +271,18 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
// Step 2.5: Mount tmpfs volumes
mount_tmpfs_volumes()?;

// Step 2.6: Bind the exec (vsock 4089) and PTY (vsock 4090) listening sockets
// NOW, before the slower network bring-up and container spawn below. These are
// pure socket/bind/listen syscalls on this (still single-threaded) main thread,
// so the later container fork stays fork-safe; the accept loops are spawned as
// threads only after the fork (Step 8). Binding this early fills the listen
// backlog from the start of boot, so a host connect QUEUES instead of being
// refused while network setup and the container spawn finish — closing the
// exec/PTY startup race of issue #3. CLOEXEC on the fds keeps the forked
// container from inheriting the listeners.
let exec_listener = exec_server::bind_exec_server()?;
let pty_listener = pty_server::bind_pty_server()?;

// Step 3: Configure guest network (if passt mode is active).
// Network setup may write /etc/resolv.conf — must run before read-only remount.
network::configure_guest_network()?;
Expand Down Expand Up @@ -362,9 +377,11 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {

expose_container_env_to_exec(&exec_config);

// Step 8: Start exec server in background thread
std::thread::spawn(|| {
if let Err(e) = exec_server::run_exec_server() {
// Step 8: Start the exec server accept loop on the socket bound in Step 2.6.
// (set_container_pid above ran first, so a host signal-main frame still finds
// the PID once the loop is serving.)
std::thread::spawn(move || {
if let Err(e) = exec_server::serve_exec_server(exec_listener) {
error!("Exec server failed: {}", e);
}
});
Expand All @@ -376,9 +393,9 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
}
});

// Step 8.5: Start PTY server in background thread
std::thread::spawn(|| {
if let Err(e) = pty_server::run_pty_server() {
// Step 8.5: Start the PTY server accept loop on the socket bound in Step 2.6.
std::thread::spawn(move || {
if let Err(e) = pty_server::serve_pty_server(pty_listener) {
error!("PTY server failed: {}", e);
}
});
Expand Down Expand Up @@ -969,55 +986,105 @@ fn remount_rootfs_readonly() -> Result<(), Box<dyn std::error::Error>> {
Ok(())
}

/// Wait for the main container process.
/// Supervise children as PID 1: propagate the container's exit, and reap orphans.
///
/// Exec and PTY requests run in other guest-init threads and wait for their
/// own child processes. The main supervision loop must not call waitpid(-1),
/// otherwise it can reap those children before the request handler observes
/// their exit status.
/// Exec and PTY request handlers reap their OWN children (each `waitpid`s a
/// specific pid) to read the real exit status, so this loop must not steal them
/// with a blind `waitpid(-1)`. It peeks exited children non-destructively with
/// `waitid(WNOWAIT)` and, via the [`reaper`](a3s_box_guest_init::reaper)
/// registry, reaps only the container (→ VM lifecycle / exit code) and UNMANAGED
/// children — reparented grandchildren and the sidecar — leaving handler-managed
/// children for their handler. This propagates the container exit code AND fixes
/// the zombie leak (orphans were previously never reaped until shutdown).
#[cfg(target_os = "linux")]
fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box<dyn std::error::Error>> {
use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
use a3s_box_guest_init::reaper;
use nix::sys::wait::{waitid, waitpid, Id, WaitPidFlag, WaitStatus};

/// Maximum time to wait for children after forwarding SIGTERM (5 seconds).
const CHILD_SHUTDOWN_TIMEOUT_MS: u64 = 5000;

info!("Waiting for container process {}", container_pid);
info!(
"Supervising children as PID 1; container PID {}",
container_pid
);

loop {
// Check if shutdown was requested via SIGTERM
if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) {
info!("SIGTERM received, initiating graceful shutdown");
graceful_shutdown(CHILD_SHUTDOWN_TIMEOUT_MS);
return Ok(());
}

// Drain currently-exited children. `WNOWAIT` peeks without reaping, so a
// handler-managed child stays reapable by its handler; we break on it and
// revisit next tick (the handler clears it within its own poll interval).
loop {
let (pid, code, signaled) = match waitid(
Id::All,
WaitPidFlag::WEXITED | WaitPidFlag::WNOWAIT | WaitPidFlag::WNOHANG,
) {
Ok(WaitStatus::Exited(pid, status)) => (pid, status, false),
Ok(WaitStatus::Signaled(pid, signal, _)) => (pid, 128 + signal as i32, true),
// No exited child right now: stop draining and poll again later.
Ok(_) => break,
// No children at all (container already gone): nothing to supervise.
Err(nix::errno::Errno::ECHILD) => return Ok(()),
// Transient error: retry on the next tick.
Err(_) => break,
};

if pid == container_pid {
// The container drives the VM lifecycle: reap it and exit with its
// status so the host (and detached `run -d wait`) sees the real code.
let _ = waitpid(pid, None);
if signaled {
error!("Container process {} terminated (exit code {})", pid, code);
} else {
info!("Container process {} exited with status {}", pid, code);
}
persist_exit_code(code);
process::exit(code);
} else if reaper::is_managed(pid.as_raw()) {
// Owned by an exec/PTY handler, which reaps it for the real status.
// Stop draining; it clears shortly and we revisit on the next tick.
break;
} else {
// Orphan (reparented grandchild) or the sidecar: reap it here so it
// does not linger as a zombie. Keep draining for more.
let _ = waitpid(pid, Some(WaitPidFlag::WNOHANG));
}
}

std::thread::sleep(std::time::Duration::from_millis(100));
}
}

/// Non-Linux development stub: just wait for the container process to exit.
#[cfg(not(target_os = "linux"))]
fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box<dyn std::error::Error>> {
use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};

loop {
if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) {
return Ok(());
}
match waitpid(container_pid, Some(WaitPidFlag::WNOHANG)) {
Ok(WaitStatus::Exited(pid, status)) => {
info!("Container process {} exited with status {}", pid, status);
Ok(WaitStatus::Exited(_, status)) => {
persist_exit_code(status);
process::exit(status);
}
Ok(WaitStatus::Signaled(pid, signal, _)) => {
error!("Container process {} killed by signal {:?}", pid, signal);
Ok(WaitStatus::Signaled(_, signal, _)) => {
persist_exit_code(128 + signal as i32);
process::exit(128 + signal as i32);
}
Ok(WaitStatus::StillAlive) => {
std::thread::sleep(std::time::Duration::from_millis(100));
}
Ok(_) => {
// Other status, continue waiting
}
Err(nix::errno::Errno::ECHILD) => {
info!("Container process {} is no longer a child", container_pid);
break;
}
Err(e) => {
return Err(format!("waitpid failed: {}", e).into());
}
Ok(_) => {}
Err(_) => break,
}
}

Ok(())
}

Expand All @@ -1036,6 +1103,9 @@ fn persist_exit_code(code: i32) {
}

/// Perform graceful shutdown: forward SIGTERM to children, wait, then force-kill.
/// Only the Linux supervision loop drives this (the non-Linux dev stub exits the
/// process directly), so it is gated to avoid a dead-code warning on macOS.
#[cfg(target_os = "linux")]
fn graceful_shutdown(timeout_ms: u64) {
// Step 1: Send SIGTERM to all processes (except ourselves, PID 1)
#[cfg(target_os = "linux")]
Expand Down
Loading
Loading