From c44e4a49e7782d7fd45d8930746b44eb3029dc8d Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 10:47:46 +0800 Subject: [PATCH 1/2] refactor(init): early-bind vsock servers + event-driven readiness (issue #3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructures the exec/PTY readiness path so boot waits for a real readiness EVENT bounded by VM liveness, instead of guessing a fixed timeout — replacing the interim 10s→30s band-aid. P1 — bind early, serve late. Split exec_server/pty_server into bind_*()->Listener (pure socket/bind/listen syscalls) and serve_*(listener) (the accept loop). run_init now binds both vsock listeners on the main thread right after the filesystem mounts (Step 2.6), BEFORE the slow network bring-up and the container fork, then spawns the accept loops after the fork (Step 8). Binding adds no thread, so the single-threaded-at-fork invariant that keeps spawn_isolated safe is preserved. The listen backlog is filled from boot, so a host connect QUEUES instead of being refused — this removes the `run -it` PTY "Connection refused". CLOEXEC keeps the forked container from inheriting the listeners. P2 — event-driven, liveness-bounded readiness. Early binding makes the host `connect` succeed immediately, so heartbeat()'s (timeout-less) read would block until the guest's accept loop runs. wait_for_exec_ready is rewritten to bound each connect+heartbeat attempt (tokio timeout), return at once when the VM exits (has_exited, zombie-aware — fast-exit containers never stall), and treat a large absolute cap purely as a backstop against a wedged-but-alive guest. A healthy guest passes the heartbeat the moment its accept loop runs, however late in a slow cold boot — so the false "heartbeat failed" warning is gone without a fixed budget to outrun. Also folds in the issue-#3 cleanups: dead `/sbin/init` BOX_EXEC_EXEC default → `/bin/sh`, and the stale resolve_oci_entrypoint doc comment. Deferred: an explicit guest→host "ready" beacon on a new vsock port was considered but NOT wired — port_forward uses add_vsock_port(listen=true) with a guest connect-out, which contradicts the assumed listen=false direction for guest→host, and that is only verifiable on KVM. The liveness-bounded heartbeat achieves the same correctness without guessing cross-process vsock semantics. Supersedes the interim 30s fix (PR #14). --- src/guest/init/src/exec_server.rs | 96 ++++++++++++++++++--------- src/guest/init/src/main.rs | 33 ++++++--- src/guest/init/src/pty_server.rs | 90 ++++++++++++++++--------- src/runtime/src/vm/ready.rs | 107 ++++++++++++++---------------- src/runtime/src/vm/spec.rs | 3 +- 5 files changed, 201 insertions(+), 128 deletions(-) diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs index a593ef7a..eb7da21a 100644 --- a/src/guest/init/src/exec_server.rs +++ b/src/guest/init/src/exec_server.rs @@ -67,52 +67,88 @@ const EXEC_CONTROL_FLUSH: &[u8] = b"flush"; /// match the host's `EXEC_FLUSH_ACK` in `runtime/src/grpc/exec.rs`. const EXEC_FLUSH_ACK: &[u8] = b"flush-ack"; -/// Run the exec server, listening on vsock port 4089. +/// A bound, listening exec-server socket — produced by [`bind_exec_server`] and +/// consumed by [`serve_exec_server`]. /// -/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`. -/// On non-Linux platforms, this is a no-op (development stub). -pub fn run_exec_server() -> Result<(), Box> { - info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT); +/// Splitting bind from serve lets guest-init bind the exec vsock port EARLY on +/// the main thread (pure socket/bind/listen syscalls — no thread spawn, so the +/// later single-threaded container `fork()` stays fork-safe) while the accept +/// loop runs afterwards in its own thread. Binding early fills the listen +/// backlog from the start of boot, so a host connect QUEUES instead of being +/// refused while the slower boot steps (network, container spawn) finish — this +/// removes the "Connection refused" / heartbeat race of issue #3. On non-Linux +/// this is an inert placeholder so callers stay platform-agnostic. +#[cfg(target_os = "linux")] +pub struct ExecListener(std::os::fd::OwnedFd); +#[cfg(not(target_os = "linux"))] +pub struct ExecListener; +/// Bind + listen the exec vsock socket (port 4089). Pure socket syscalls, safe +/// to call on the main thread before the container fork. +pub fn bind_exec_server() -> Result> { #[cfg(target_os = "linux")] { - run_vsock_server()?; + use nix::sys::socket::{ + bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, + }; + use std::os::fd::AsRawFd; + + let sock_fd = socket( + AddressFamily::Vsock, + SockType::Stream, + SockFlag::empty(), + None, + )?; + + // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on + // macOS — and so the forked container never inherits the listening socket. + unsafe { + libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + } + + let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT); + bind(sock_fd.as_raw_fd(), &addr)?; + listen(&sock_fd, Backlog::new(4)?)?; + + info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT); + Ok(ExecListener(sock_fd)) } #[cfg(not(target_os = "linux"))] { info!("Exec server not available on non-Linux platform (development mode)"); + Ok(ExecListener) } - - Ok(()) } -/// Linux vsock server implementation. -#[cfg(target_os = "linux")] -fn run_vsock_server() -> Result<(), Box> { - use nix::sys::socket::{ - accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, - }; - use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; - use tracing::error; - - let sock_fd = socket( - AddressFamily::Vsock, - SockType::Stream, - SockFlag::empty(), - None, - )?; +/// Run the exec accept loop on an already-bound listener. Intended to run on its +/// own thread for the VM's lifetime; never returns under normal operation. +pub fn serve_exec_server(listener: ExecListener) -> Result<(), Box> { + #[cfg(target_os = "linux")] + { + run_accept_loop(listener.0) + } - // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS - unsafe { - libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + #[cfg(not(target_os = "linux"))] + { + let _ = listener; + Ok(()) } +} - let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT); - bind(sock_fd.as_raw_fd(), &addr)?; - listen(&sock_fd, Backlog::new(4)?)?; +/// Bind then serve in one call. Kept for callers that don't need the early-bind +/// split (e.g. tests); guest-init's boot path uses `bind_*` + `serve_*` directly. +pub fn run_exec_server() -> Result<(), Box> { + info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT); + serve_exec_server(bind_exec_server()?) +} - info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT); +/// The exec server accept loop. +#[cfg(target_os = "linux")] +fn run_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box> { + use nix::sys::socket::accept; + use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; + use tracing::error; loop { match accept(sock_fd.as_raw_fd()) { diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs index b765163c..9fb85523 100644 --- a/src/guest/init/src/main.rs +++ b/src/guest/init/src/main.rs @@ -43,8 +43,11 @@ impl ExecConfig { /// - BOX_EXEC_ENV_*: container environment variables /// - BOX_EXEC_WORKDIR: working directory (defaults to "/") fn from_env() -> Self { - let executable = - std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/sbin/init".to_string()); + // The runtime always sets BOX_EXEC_EXEC when guest-init is PID 1 + // (runtime/src/vm/spec.rs), so this default is only a defensive fallback. + // Use /bin/sh — universal across distros — never /sbin/init, which does + // not exist on Alpine and was the original cause of issue #3. + let executable = std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/bin/sh".to_string()); // Parse args from individual env vars (BOX_EXEC_ARGC + BOX_EXEC_ARG_0..N) let args: Vec = match std::env::var("BOX_EXEC_ARGC") @@ -268,6 +271,18 @@ fn run_init() -> Result<(), Box> { // Step 2.5: Mount tmpfs volumes mount_tmpfs_volumes()?; + // Step 2.6: Bind the exec (vsock 4089) and PTY (vsock 4090) listening sockets + // NOW, before the slower network bring-up and container spawn below. These are + // pure socket/bind/listen syscalls on this (still single-threaded) main thread, + // so the later container fork stays fork-safe; the accept loops are spawned as + // threads only after the fork (Step 8). Binding this early fills the listen + // backlog from the start of boot, so a host connect QUEUES instead of being + // refused while network setup and the container spawn finish — closing the + // exec/PTY startup race of issue #3. CLOEXEC on the fds keeps the forked + // container from inheriting the listeners. + let exec_listener = exec_server::bind_exec_server()?; + let pty_listener = pty_server::bind_pty_server()?; + // Step 3: Configure guest network (if passt mode is active). // Network setup may write /etc/resolv.conf — must run before read-only remount. network::configure_guest_network()?; @@ -362,9 +377,11 @@ fn run_init() -> Result<(), Box> { expose_container_env_to_exec(&exec_config); - // Step 8: Start exec server in background thread - std::thread::spawn(|| { - if let Err(e) = exec_server::run_exec_server() { + // Step 8: Start the exec server accept loop on the socket bound in Step 2.6. + // (set_container_pid above ran first, so a host signal-main frame still finds + // the PID once the loop is serving.) + std::thread::spawn(move || { + if let Err(e) = exec_server::serve_exec_server(exec_listener) { error!("Exec server failed: {}", e); } }); @@ -376,9 +393,9 @@ fn run_init() -> Result<(), Box> { } }); - // Step 8.5: Start PTY server in background thread - std::thread::spawn(|| { - if let Err(e) = pty_server::run_pty_server() { + // Step 8.5: Start the PTY server accept loop on the socket bound in Step 2.6. + std::thread::spawn(move || { + if let Err(e) = pty_server::serve_pty_server(pty_listener) { error!("PTY server failed: {}", e); } }); diff --git a/src/guest/init/src/pty_server.rs b/src/guest/init/src/pty_server.rs index c50b41f0..c4572570 100644 --- a/src/guest/init/src/pty_server.rs +++ b/src/guest/init/src/pty_server.rs @@ -15,51 +15,81 @@ use tracing::{error, warn}; #[cfg(target_os = "linux")] use crate::user::parse_process_user; -/// Run the PTY server, listening on vsock port 4090. -/// -/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`. -/// On non-Linux platforms, this is a no-op (development stub). -pub fn run_pty_server() -> Result<(), Box> { - info!("Starting PTY server on vsock port {}", PTY_VSOCK_PORT); +/// A bound, listening PTY-server socket — produced by [`bind_pty_server`] and +/// consumed by [`serve_pty_server`]. Same early-bind rationale as +/// [`crate::exec_server::ExecListener`]: bind on the main thread before the +/// container fork (fork-safe, fills the listen backlog), accept later in a +/// thread. On non-Linux this is an inert placeholder. +#[cfg(target_os = "linux")] +pub struct PtyListener(std::os::fd::OwnedFd); +#[cfg(not(target_os = "linux"))] +pub struct PtyListener; +/// Bind + listen the PTY vsock socket (port 4090). Pure socket syscalls, safe to +/// call on the main thread before the container fork. +pub fn bind_pty_server() -> Result> { #[cfg(target_os = "linux")] { - run_vsock_pty_server()?; + use nix::sys::socket::{ + bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, + }; + use std::os::fd::AsRawFd; + + let sock_fd = socket( + AddressFamily::Vsock, + SockType::Stream, + SockFlag::empty(), + None, + )?; + + // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on + // macOS — and so the forked container never inherits the listening socket. + unsafe { + libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + } + + let addr = VsockAddr::new(libc::VMADDR_CID_ANY, PTY_VSOCK_PORT); + bind(sock_fd.as_raw_fd(), &addr)?; + listen(&sock_fd, Backlog::new(4)?)?; + + info!("PTY server listening on vsock port {}", PTY_VSOCK_PORT); + Ok(PtyListener(sock_fd)) } #[cfg(not(target_os = "linux"))] { info!("PTY server not available on non-Linux platform (development mode)"); + Ok(PtyListener) } - - Ok(()) } -/// Linux vsock PTY server implementation. -#[cfg(target_os = "linux")] -fn run_vsock_pty_server() -> Result<(), Box> { - use nix::sys::socket::{ - accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr, - }; - use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; - - let sock_fd = socket( - AddressFamily::Vsock, - SockType::Stream, - SockFlag::empty(), - None, - )?; +/// Run the PTY accept loop on an already-bound listener. Intended to run on its +/// own thread for the VM's lifetime; never returns under normal operation. +pub fn serve_pty_server(listener: PtyListener) -> Result<(), Box> { + #[cfg(target_os = "linux")] + { + run_pty_accept_loop(listener.0) + } - // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS - unsafe { - libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC); + #[cfg(not(target_os = "linux"))] + { + let _ = listener; + Ok(()) } +} - let addr = VsockAddr::new(libc::VMADDR_CID_ANY, PTY_VSOCK_PORT); - bind(sock_fd.as_raw_fd(), &addr)?; - listen(&sock_fd, Backlog::new(4)?)?; +/// Bind then serve in one call. Kept for callers that don't need the early-bind +/// split; guest-init's boot path uses `bind_*` + `serve_*` directly. +pub fn run_pty_server() -> Result<(), Box> { + info!("Starting PTY server on vsock port {}", PTY_VSOCK_PORT); + serve_pty_server(bind_pty_server()?) +} - info!("PTY server listening on vsock port {}", PTY_VSOCK_PORT); +/// The PTY server accept loop. +#[cfg(target_os = "linux")] +fn run_pty_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box> { + use nix::sys::socket::accept; + use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; loop { match accept(sock_fd.as_raw_fd()) { diff --git a/src/runtime/src/vm/ready.rs b/src/runtime/src/vm/ready.rs index 7cc48ed5..8bac1871 100644 --- a/src/runtime/src/vm/ready.rs +++ b/src/runtime/src/vm/ready.rs @@ -30,91 +30,80 @@ impl VmManager { Ok(()) } - /// Wait for the exec server socket to become ready. + /// Wait for the exec server to become ready (a Frame Heartbeat round-trip). /// - /// Polls for the socket file to appear, then verifies the exec server - /// is healthy via a Frame Heartbeat round-trip. This is best-effort: - /// if the exec socket never appears (e.g., older guest init without - /// exec server), the VM still boots successfully. + /// Waits for the readiness EVENT — a successful heartbeat — bounded by VM + /// liveness, instead of guessing a fixed timeout. guest-init binds the exec + /// socket early (before the slow network bring-up and container spawn), so the + /// host connect succeeds immediately and the heartbeat passes the moment the + /// guest's accept loop runs — however late in a slow cold boot. Each attempt + /// is individually time-bounded (the early-bound socket makes a host `connect` + /// succeed and then block on read until the guest accepts), the loop returns + /// at once if the VM has exited (a fast-exiting container never stalls), and a + /// large absolute cap is only a last-resort backstop against a wedged-but-alive + /// guest — not the expected wait. Best-effort: exec/attach also connect on + /// demand, so even a timed-out probe does not mean exec is unavailable. #[cfg(unix)] pub(crate) async fn wait_for_exec_ready( &mut self, exec_socket_path: &std::path::Path, ) -> Result<()> { - const MAX_WAIT_MS: u64 = 10000; - const POLL_INTERVAL_MS: u64 = 200; + use tokio::time::Duration; + + // Per-attempt cap on one connect + heartbeat round-trip. guest-init binds + // the exec socket early, so the host `connect` succeeds as soon as the VM + // boots and `heartbeat()`'s read then blocks until the guest's accept loop + // runs; bounding each attempt keeps the loop checking VM liveness instead + // of hanging in that read. + const ATTEMPT_TIMEOUT: Duration = Duration::from_millis(500); + const POLL_INTERVAL: Duration = Duration::from_millis(200); + // Last-resort backstop against a wedged-but-alive guest that binds but + // never accepts. A healthy guest passes the heartbeat the instant its + // accept loop runs (however late), and an exited VM returns immediately + // below — so this cap is not the expected wait. + const MAX_WAIT_MS: u64 = 120_000; tracing::debug!( socket_path = %exec_socket_path.display(), - "Waiting for exec server socket" + "Waiting for exec server readiness" ); let start = std::time::Instant::now(); - // Phase 1: Wait for socket file to appear loop { - if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 { - tracing::warn!("Exec socket did not appear, exec will not be available"); - return Ok(()); - } - - if exec_socket_path.exists() { - tracing::debug!("Exec socket file detected"); - break; - } - - // Check if VM is still running (has_exited treats a zombie shim as - // exited; is_running's kill(pid,0) would not). + // Return at once if the VM has already exited (zombie-aware: has_exited + // treats a zombie shim as exited, unlike is_running's kill(pid,0)). A + // fast-exiting container never stalls here. if let Some(ref handler) = *self.handler.read().await { if handler.has_exited() { - tracing::warn!("VM exited before exec socket appeared"); + tracing::debug!("VM exited before exec server became ready"); return Ok(()); } } - tokio::time::sleep(tokio::time::Duration::from_millis(POLL_INTERVAL_MS)).await; - } - - // Phase 2: Connect and verify with Heartbeat health check - while start.elapsed().as_millis() < MAX_WAIT_MS as u128 { - // Stop waiting if the VM has already exited: the exec socket can - // appear during guest init and then vanish when a fast-exiting - // container shuts the VM down. The shim becomes a zombie the moment - // the VM halts, so use has_exited (zombie-aware) rather than - // is_running — without this, a container that exits before its first - // heartbeat stalls the whole boot for MAX_WAIT_MS (~10s), which hit - // every short-lived `run` that lost the heartbeat race and every - // monitor restart of a fast-exiting container. - if let Some(ref handler) = *self.handler.read().await { - if handler.has_exited() { - tracing::debug!("VM exited before exec server became ready"); + // One bounded connect + heartbeat attempt. A timeout (early-bound + // socket, guest not yet accepting) or any error just means "retry". + if let Ok(Ok(client)) = + tokio::time::timeout(ATTEMPT_TIMEOUT, ExecClient::connect(exec_socket_path)).await + { + if let Ok(Ok(true)) = + tokio::time::timeout(ATTEMPT_TIMEOUT, client.heartbeat()).await + { + tracing::debug!("Exec server heartbeat passed"); + self.exec_client = Some(client); return Ok(()); } } - match ExecClient::connect(exec_socket_path).await { - Ok(client) => match client.heartbeat().await { - Ok(true) => { - tracing::debug!("Exec server heartbeat passed"); - self.exec_client = Some(client); - return Ok(()); - } - Ok(false) => { - tracing::debug!("Exec server heartbeat failed, retrying"); - } - Err(e) => { - tracing::debug!(error = %e, "Exec heartbeat error, retrying"); - } - }, - Err(e) => { - tracing::debug!(error = %e, "Exec connect failed, retrying"); - } + if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 { + tracing::warn!( + timeout_ms = MAX_WAIT_MS, + "Exec server did not become ready within the safety cap; exec/attach connect on demand and may still succeed once the guest finishes starting" + ); + return Ok(()); } - tokio::time::sleep(tokio::time::Duration::from_millis(POLL_INTERVAL_MS)).await; + tokio::time::sleep(POLL_INTERVAL).await; } - - tracing::warn!("Exec socket appeared but heartbeat failed, exec will not be available"); - Ok(()) } } diff --git a/src/runtime/src/vm/spec.rs b/src/runtime/src/vm/spec.rs index 7b66429f..f38bca88 100644 --- a/src/runtime/src/vm/spec.rs +++ b/src/runtime/src/vm/spec.rs @@ -342,7 +342,8 @@ impl VmManager { /// - If `entrypoint_override` is set, it replaces the OCI ENTRYPOINT /// - If ENTRYPOINT is set: executable = ENTRYPOINT[0], args = ENTRYPOINT[1:] + CMD /// - If only CMD is set: executable = CMD[0], args = CMD[1:] - /// - If neither: fall back to `/sbin/init` + /// - If neither: fall back to `/bin/sh` (universal across distros; `/sbin/init` + /// does not exist on Alpine, which was the original cause of issue #3) /// - If `cmd_override` is non-empty, it replaces the OCI CMD /// /// Paths are used as-is since the OCI image is always extracted at rootfs root. From f1db2cf9a05dcef7f75b771a333384ec2b8b8ca9 Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Thu, 11 Jun 2026 12:09:32 +0800 Subject: [PATCH 2/2] =?UTF-8?q?fix(init):=20real=20PID1=20reaper=20?= =?UTF-8?q?=E2=80=94=20reap=20orphans=20without=20stealing=20exec/PTY=20ex?= =?UTF-8?q?it=20codes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit guest-init runs as PID 1 but only waited on the container pid, so reparented grandchildren and the sidecar were never reaped and accumulated as zombies for the VM's lifetime. The earlier code couldn't just waitpid(-1): that races with the exec/PTY handlers, which waitpid their own children to read the real exit code — a stolen child makes the handler see ECHILD and report a bogus exit 0 (exec_server.rs). That tension is exactly why a prior fix narrowed the loop to waitpid(container_pid), trading the zombie leak for correct exec codes. Resolve both with a small reaper registry: - New `reaper` module: handlers mark their child pid MANAGED across the spawn (the lock is held across fork, closing the spawn/register race for fast-exiting commands like `exec -- false`); an RAII guard unregisters on every return path. - The supervision loop now peeks exited children non-destructively with `waitid(WNOWAIT)` and routes: the container -> reap + propagate exit code (VM lifecycle, unchanged); MANAGED children -> left for their handler to reap (real exit codes preserved); everything else (orphans + sidecar) -> reaped here. - exec one-shot + streaming spawns and the PTY fork register their children; their existing waitpid/try_wait paths are unchanged. Fixes the zombie leak and makes the long-standing "reaped by the zombie-reaper loop" comments true again, with no regression to exec/PTY or container exit codes. Unit-tested (reaper registry); needs KVM verification of exec exit codes + orphan reaping. Builds on P1+P2 (issue #3). --- src/guest/init/src/exec_server.rs | 14 +++- src/guest/init/src/lib.rs | 1 + src/guest/init/src/main.rs | 99 +++++++++++++++++++------ src/guest/init/src/pty_server.rs | 5 ++ src/guest/init/src/reaper.rs | 119 ++++++++++++++++++++++++++++++ 5 files changed, 211 insertions(+), 27 deletions(-) create mode 100644 src/guest/init/src/reaper.rs diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs index eb7da21a..5e9f4c3b 100644 --- a/src/guest/init/src/exec_server.rs +++ b/src/guest/init/src/exec_server.rs @@ -751,8 +751,12 @@ fn execute_command( Err(output) => return output, }; - let mut child = match command.spawn() { - Ok(child) => child, + // Spawn under the reaper registry: the pid is marked MANAGED before the PID 1 + // supervision loop can see it, so the loop leaves this child for us to reap + // (and read its real exit code) instead of stealing it. The guard unregisters + // the pid when this function returns (all paths). + let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) { + Ok(pair) => pair, Err(e) => { return ExecOutput { stdout: vec![], @@ -1043,8 +1047,10 @@ fn execute_command_streaming( } }; - let mut child = match command.spawn() { - Ok(child) => child, + // Spawn under the reaper registry (see one-shot path) so PID 1 leaves this + // streaming child for us to reap; the guard unregisters on return. + let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) { + Ok(pair) => pair, Err(e) => { let output = ExecOutput { stdout: vec![], diff --git a/src/guest/init/src/lib.rs b/src/guest/init/src/lib.rs index 57b05efc..f6a6f348 100644 --- a/src/guest/init/src/lib.rs +++ b/src/guest/init/src/lib.rs @@ -14,6 +14,7 @@ pub mod namespace; pub mod network; pub mod port_forward; pub mod pty_server; +pub mod reaper; pub mod user; pub use namespace::{spawn_isolated, NamespaceConfig, NamespaceError}; diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs index 9fb85523..72fb1fe7 100644 --- a/src/guest/init/src/main.rs +++ b/src/guest/init/src/main.rs @@ -986,55 +986,105 @@ fn remount_rootfs_readonly() -> Result<(), Box> { Ok(()) } -/// Wait for the main container process. +/// Supervise children as PID 1: propagate the container's exit, and reap orphans. /// -/// Exec and PTY requests run in other guest-init threads and wait for their -/// own child processes. The main supervision loop must not call waitpid(-1), -/// otherwise it can reap those children before the request handler observes -/// their exit status. +/// Exec and PTY request handlers reap their OWN children (each `waitpid`s a +/// specific pid) to read the real exit status, so this loop must not steal them +/// with a blind `waitpid(-1)`. It peeks exited children non-destructively with +/// `waitid(WNOWAIT)` and, via the [`reaper`](a3s_box_guest_init::reaper) +/// registry, reaps only the container (→ VM lifecycle / exit code) and UNMANAGED +/// children — reparented grandchildren and the sidecar — leaving handler-managed +/// children for their handler. This propagates the container exit code AND fixes +/// the zombie leak (orphans were previously never reaped until shutdown). +#[cfg(target_os = "linux")] fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box> { - use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus}; + use a3s_box_guest_init::reaper; + use nix::sys::wait::{waitid, waitpid, Id, WaitPidFlag, WaitStatus}; /// Maximum time to wait for children after forwarding SIGTERM (5 seconds). const CHILD_SHUTDOWN_TIMEOUT_MS: u64 = 5000; - info!("Waiting for container process {}", container_pid); + info!( + "Supervising children as PID 1; container PID {}", + container_pid + ); loop { - // Check if shutdown was requested via SIGTERM if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) { info!("SIGTERM received, initiating graceful shutdown"); graceful_shutdown(CHILD_SHUTDOWN_TIMEOUT_MS); return Ok(()); } + // Drain currently-exited children. `WNOWAIT` peeks without reaping, so a + // handler-managed child stays reapable by its handler; we break on it and + // revisit next tick (the handler clears it within its own poll interval). + loop { + let (pid, code, signaled) = match waitid( + Id::All, + WaitPidFlag::WEXITED | WaitPidFlag::WNOWAIT | WaitPidFlag::WNOHANG, + ) { + Ok(WaitStatus::Exited(pid, status)) => (pid, status, false), + Ok(WaitStatus::Signaled(pid, signal, _)) => (pid, 128 + signal as i32, true), + // No exited child right now: stop draining and poll again later. + Ok(_) => break, + // No children at all (container already gone): nothing to supervise. + Err(nix::errno::Errno::ECHILD) => return Ok(()), + // Transient error: retry on the next tick. + Err(_) => break, + }; + + if pid == container_pid { + // The container drives the VM lifecycle: reap it and exit with its + // status so the host (and detached `run -d wait`) sees the real code. + let _ = waitpid(pid, None); + if signaled { + error!("Container process {} terminated (exit code {})", pid, code); + } else { + info!("Container process {} exited with status {}", pid, code); + } + persist_exit_code(code); + process::exit(code); + } else if reaper::is_managed(pid.as_raw()) { + // Owned by an exec/PTY handler, which reaps it for the real status. + // Stop draining; it clears shortly and we revisit on the next tick. + break; + } else { + // Orphan (reparented grandchild) or the sidecar: reap it here so it + // does not linger as a zombie. Keep draining for more. + let _ = waitpid(pid, Some(WaitPidFlag::WNOHANG)); + } + } + + std::thread::sleep(std::time::Duration::from_millis(100)); + } +} + +/// Non-Linux development stub: just wait for the container process to exit. +#[cfg(not(target_os = "linux"))] +fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box> { + use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus}; + + loop { + if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) { + return Ok(()); + } match waitpid(container_pid, Some(WaitPidFlag::WNOHANG)) { - Ok(WaitStatus::Exited(pid, status)) => { - info!("Container process {} exited with status {}", pid, status); + Ok(WaitStatus::Exited(_, status)) => { persist_exit_code(status); process::exit(status); } - Ok(WaitStatus::Signaled(pid, signal, _)) => { - error!("Container process {} killed by signal {:?}", pid, signal); + Ok(WaitStatus::Signaled(_, signal, _)) => { persist_exit_code(128 + signal as i32); process::exit(128 + signal as i32); } Ok(WaitStatus::StillAlive) => { std::thread::sleep(std::time::Duration::from_millis(100)); } - Ok(_) => { - // Other status, continue waiting - } - Err(nix::errno::Errno::ECHILD) => { - info!("Container process {} is no longer a child", container_pid); - break; - } - Err(e) => { - return Err(format!("waitpid failed: {}", e).into()); - } + Ok(_) => {} + Err(_) => break, } } - Ok(()) } @@ -1053,6 +1103,9 @@ fn persist_exit_code(code: i32) { } /// Perform graceful shutdown: forward SIGTERM to children, wait, then force-kill. +/// Only the Linux supervision loop drives this (the non-Linux dev stub exits the +/// process directly), so it is gated to avoid a dead-code warning on macOS. +#[cfg(target_os = "linux")] fn graceful_shutdown(timeout_ms: u64) { // Step 1: Send SIGTERM to all processes (except ourselves, PID 1) #[cfg(target_os = "linux")] diff --git a/src/guest/init/src/pty_server.rs b/src/guest/init/src/pty_server.rs index c4572570..b1648f45 100644 --- a/src/guest/init/src/pty_server.rs +++ b/src/guest/init/src/pty_server.rs @@ -246,6 +246,11 @@ fn handle_pty_connection(fd: std::os::fd::OwnedFd) -> Result<(), Box> = Mutex::new(Vec::new()); + +/// RAII guard: removes its pid from the MANAGED set when dropped (i.e. when the +/// handler returns, having reaped its own child). Drop runs on every handler +/// exit path — normal, timeout-kill, and error — so the set never leaks. +pub struct ManagedChild(i32); + +impl Drop for ManagedChild { + fn drop(&mut self) { + // Recover from a poisoned lock rather than panicking inside PID 1. + let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner()); + managed.retain(|&p| p != self.0); + } +} + +/// Spawn a child while atomically marking its pid MANAGED. +/// +/// The MANAGED lock is held across the `spawn` call, so the supervision loop +/// cannot conclude the new pid is unmanaged (and reap it as an orphan) in the +/// window between fork and registration — which closes the race for commands +/// that exit almost immediately (e.g. `exec … -- false`). Holding the lock +/// across the fork is safe: the forked child runs only async-signal-safe code +/// before `exec` and never touches this mutex, so its inherited (locked) copy is +/// discarded at `exec` without deadlock. +pub fn spawn_managed(spawn: F) -> std::io::Result<(std::process::Child, ManagedChild)> +where + F: FnOnce() -> std::io::Result, +{ + let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner()); + let child = spawn()?; + let pid = child.id() as i32; + managed.push(pid); + drop(managed); + Ok((child, ManagedChild(pid))) +} + +/// Mark an already-forked pid MANAGED (raw-fork callers, e.g. the PTY server). +/// Call immediately after `fork` in the parent. Returns the RAII guard. +pub fn manage_pid(pid: i32) -> ManagedChild { + let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner()); + if !managed.contains(&pid) { + managed.push(pid); + } + drop(managed); + ManagedChild(pid) +} + +/// Whether `pid` is owned by a request handler (the loop must not reap it). +pub fn is_managed(pid: i32) -> bool { + MANAGED + .lock() + .map(|m| m.contains(&pid)) + .unwrap_or_else(|e| e.into_inner().contains(&pid)) +} + +#[cfg(test)] +mod tests { + use super::*; + + // Distinct synthetic pids per test so the shared MANAGED static doesn't + // collide across parallel test threads. + #[test] + fn manage_pid_registers_until_guard_dropped() { + let pid = 0x7fff_fff0; + assert!(!is_managed(pid)); + { + let _guard = manage_pid(pid); + assert!( + is_managed(pid), + "pid should be managed while the guard lives" + ); + } + assert!(!is_managed(pid), "guard drop must unregister the pid"); + } + + #[test] + fn unregistered_pid_is_not_managed() { + assert!(!is_managed(0x7fff_fff1)); + } + + #[test] + fn spawn_managed_marks_then_releases_real_child() { + // A child that exits immediately is the worst case the lock-across-spawn + // protects: it must still be MANAGED for the handler (here, the test) to + // reap, never the supervision loop. + let (mut child, guard) = + spawn_managed(|| std::process::Command::new("true").spawn()).expect("spawn true"); + let pid = child.id() as i32; + assert!( + is_managed(pid), + "spawned pid must be registered before we observe it" + ); + let _ = child.wait(); // the handler (test) reaps its own child + drop(guard); + assert!(!is_managed(pid), "guard drop must unregister after reap"); + } +}