AI45Lab · ZhiXiao-Lin · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs
@@ -67,52 +67,88 @@ const EXEC_CONTROL_FLUSH: &[u8] = b"flush";
 /// match the host's `EXEC_FLUSH_ACK` in `runtime/src/grpc/exec.rs`.
 const EXEC_FLUSH_ACK: &[u8] = b"flush-ack";
 
-/// Run the exec server, listening on vsock port 4089.
+/// A bound, listening exec-server socket — produced by [`bind_exec_server`] and
+/// consumed by [`serve_exec_server`].
 ///
-/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`.
-/// On non-Linux platforms, this is a no-op (development stub).
-pub fn run_exec_server() -> Result<(), Box<dyn std::error::Error>> {
-    info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT);
+/// Splitting bind from serve lets guest-init bind the exec vsock port EARLY on
+/// the main thread (pure socket/bind/listen syscalls — no thread spawn, so the
+/// later single-threaded container `fork()` stays fork-safe) while the accept
+/// loop runs afterwards in its own thread. Listening early opens the accept
+/// backlog at the start of boot, so a host connect QUEUES (up to the backlog limit)
+/// instead of being refused while the slower boot steps (network, container spawn)
+/// finish — this removes the "Connection refused" / heartbeat race of issue #3. On
+/// non-Linux this is an inert placeholder so callers stay platform-agnostic.
+#[cfg(target_os = "linux")]
+pub struct ExecListener(std::os::fd::OwnedFd);
+#[cfg(not(target_os = "linux"))]
+pub struct ExecListener;
 
+/// Bind + listen the exec vsock socket (port 4089). Pure socket syscalls, safe
+/// to call on the main thread before the container fork.
+pub fn bind_exec_server() -> Result<ExecListener, Box<dyn std::error::Error>> {
     #[cfg(target_os = "linux")]
     {
-        run_vsock_server()?;
+        use nix::sys::socket::{
+            bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
+        };
+        use std::os::fd::AsRawFd;
+
+        let sock_fd = socket(
+            AddressFamily::Vsock,
+            SockType::Stream,
+            SockFlag::empty(),
+            None,
+        )?;
+
+        // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on
+        // macOS. fork() still copies the fd; CLOEXEC closes it when the child execs.
+        unsafe {
+            libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
+        }
+
+        let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT);
+        bind(sock_fd.as_raw_fd(), &addr)?;
+        listen(&sock_fd, Backlog::new(4)?)?;
+
+        info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT);
+        Ok(ExecListener(sock_fd))
     }
 
     #[cfg(not(target_os = "linux"))]
     {
         info!("Exec server not available on non-Linux platform (development mode)");
+        Ok(ExecListener)
     }
-
-    Ok(())
 }
 
-/// Linux vsock server implementation.
-#[cfg(target_os = "linux")]
-fn run_vsock_server() -> Result<(), Box<dyn std::error::Error>> {
-    use nix::sys::socket::{
-        accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
-    };
-    use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
-    use tracing::error;
-
-    let sock_fd = socket(
-        AddressFamily::Vsock,
-        SockType::Stream,
-        SockFlag::empty(),
-        None,
-    )?;
+/// Run the exec accept loop on an already-bound listener. Intended to run on its
+/// own thread for the VM's lifetime; never returns under normal operation.
+pub fn serve_exec_server(listener: ExecListener) -> Result<(), Box<dyn std::error::Error>> {
+    #[cfg(target_os = "linux")]
+    {
+        run_accept_loop(listener.0)
+    }
 
-    // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS
-    unsafe {
-        libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
+    #[cfg(not(target_os = "linux"))]
+    {
+        let _ = listener;
+        Ok(())
     }
+}
 
-    let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT);
-    bind(sock_fd.as_raw_fd(), &addr)?;
-    listen(&sock_fd, Backlog::new(4)?)?;
+/// Bind then serve in one call. Kept for callers that don't need the early-bind
+/// split (e.g. tests); guest-init's boot path uses `bind_*` + `serve_*` directly.
+pub fn run_exec_server() -> Result<(), Box<dyn std::error::Error>> {
+    info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT);
+    serve_exec_server(bind_exec_server()?)
+}
 
-    info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT);
+/// The exec server accept loop.
+#[cfg(target_os = "linux")]
+fn run_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box<dyn std::error::Error>> {
+    use nix::sys::socket::accept;
+    use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
+    use tracing::error;
 
     loop {
         match accept(sock_fd.as_raw_fd()) {
@@ -715,8 +751,12 @@ fn execute_command(
         Err(output) => return output,
     };
 
-    let mut child = match command.spawn() {
-        Ok(child) => child,
+    // Spawn under the reaper registry: the pid is marked MANAGED before the PID 1
+    // supervision loop can see it, so the loop leaves this child for us to reap
+    // (and read its real exit code) instead of stealing it. The guard unregisters
+    // the pid when this function returns (all paths).
+    let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) {
+        Ok(pair) => pair,
         Err(e) => {
             return ExecOutput {
                 stdout: vec![],
@@ -1007,8 +1047,10 @@ fn execute_command_streaming(
         }
     };
 
-    let mut child = match command.spawn() {
-        Ok(child) => child,
+    // Spawn under the reaper registry (see one-shot path) so PID 1 leaves this
+    // streaming child for us to reap; the guard unregisters on return.
+    let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) {
+        Ok(pair) => pair,
         Err(e) => {
             let output = ExecOutput {
                 stdout: vec![],

diff --git a/src/guest/init/src/lib.rs b/src/guest/init/src/lib.rs
@@ -14,6 +14,7 @@ pub mod namespace;
 pub mod network;
 pub mod port_forward;
 pub mod pty_server;
+pub mod reaper;
 pub mod user;
 
 pub use namespace::{spawn_isolated, NamespaceConfig, NamespaceError};

diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs
@@ -43,8 +43,11 @@ impl ExecConfig {
     /// - BOX_EXEC_ENV_*: container environment variables
     /// - BOX_EXEC_WORKDIR: working directory (defaults to "/")
     fn from_env() -> Self {
-        let executable =
-            std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/sbin/init".to_string());
+        // The runtime always sets BOX_EXEC_EXEC when guest-init is PID 1
+        // (runtime/src/vm/spec.rs), so this default is only a defensive fallback.
+        // Use /bin/sh — universal across distros — never /sbin/init, which does
+        // not exist on Alpine and was the original cause of issue #3.
+        let executable = std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/bin/sh".to_string());
 
         // Parse args from individual env vars (BOX_EXEC_ARGC + BOX_EXEC_ARG_0..N)
         let args: Vec<String> = match std::env::var("BOX_EXEC_ARGC")
@@ -268,6 +271,18 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
     // Step 2.5: Mount tmpfs volumes
     mount_tmpfs_volumes()?;
 
+    // Step 2.6: Bind the exec (vsock 4089) and PTY (vsock 4090) listening sockets
+    // NOW, before the slower network bring-up and container spawn below. These are
+    // pure socket/bind/listen syscalls on this (still single-threaded) main thread,
+    // so the later container fork stays fork-safe; the accept loops are spawned as
+    // threads only after the fork (Step 8). Listening this early opens the accept
+    // backlog at the start of boot, so a host connect QUEUES instead of being
+    // refused while network setup and the container spawn finish — closing the
+    // exec/PTY startup race of issue #3. CLOEXEC on the fds closes the listeners
+    // when the forked container child execs (fork itself still copies them).
+    let exec_listener = exec_server::bind_exec_server()?;
+    let pty_listener = pty_server::bind_pty_server()?;
+
     // Step 3: Configure guest network (if passt mode is active).
     // Network setup may write /etc/resolv.conf — must run before read-only remount.
     network::configure_guest_network()?;
@@ -362,9 +377,11 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
 
     expose_container_env_to_exec(&exec_config);
 
-    // Step 8: Start exec server in background thread
-    std::thread::spawn(|| {
-        if let Err(e) = exec_server::run_exec_server() {
+    // Step 8: Start the exec server accept loop on the socket bound in Step 2.6.
+    // (set_container_pid above ran first, so a host signal-main frame still finds
+    // the PID once the loop is serving.)
+    std::thread::spawn(move || {
+        if let Err(e) = exec_server::serve_exec_server(exec_listener) {
             error!("Exec server failed: {}", e);
         }
     });
@@ -376,9 +393,9 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
         }
     });
 
-    // Step 8.5: Start PTY server in background thread
-    std::thread::spawn(|| {
-        if let Err(e) = pty_server::run_pty_server() {
+    // Step 8.5: Start the PTY server accept loop on the socket bound in Step 2.6.
+    std::thread::spawn(move || {
+        if let Err(e) = pty_server::serve_pty_server(pty_listener) {
             error!("PTY server failed: {}", e);
         }
     });
@@ -969,55 +986,105 @@ fn remount_rootfs_readonly() -> Result<(), Box<dyn std::error::Error>> {
     Ok(())
 }
 
-/// Wait for the main container process.
+/// Supervise children as PID 1: propagate the container's exit, and reap orphans.
 ///
-/// Exec and PTY requests run in other guest-init threads and wait for their
-/// own child processes. The main supervision loop must not call waitpid(-1),
-/// otherwise it can reap those children before the request handler observes
-/// their exit status.
+/// Exec and PTY request handlers reap their OWN children (each `waitpid`s a
+/// specific pid) to read the real exit status, so this loop must not steal them
+/// with a blind `waitpid(-1)`. It peeks exited children non-destructively with
+/// `waitid(WNOWAIT)` and, via the [`reaper`](a3s_box_guest_init::reaper)
+/// registry, reaps only the container (→ VM lifecycle / exit code) and UNMANAGED
+/// children — reparented grandchildren and the sidecar — leaving handler-managed
+/// children for their handler. This propagates the container exit code AND fixes
+/// the zombie leak (orphans were previously never reaped until shutdown).
+#[cfg(target_os = "linux")]
 fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box<dyn std::error::Error>> {
-    use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
+    use a3s_box_guest_init::reaper;
+    use nix::sys::wait::{waitid, waitpid, Id, WaitPidFlag, WaitStatus};
 
     /// Maximum time to wait for children after forwarding SIGTERM (5 seconds).
     const CHILD_SHUTDOWN_TIMEOUT_MS: u64 = 5000;
 
-    info!("Waiting for container process {}", container_pid);
+    info!(
+        "Supervising children as PID 1; container PID {}",
+        container_pid
+    );
 
     loop {
-        // Check if shutdown was requested via SIGTERM
         if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) {
             info!("SIGTERM received, initiating graceful shutdown");
             graceful_shutdown(CHILD_SHUTDOWN_TIMEOUT_MS);
             return Ok(());
         }
 
+        // Drain currently-exited children. `WNOWAIT` peeks without reaping, so a
+        // handler-managed child stays reapable by its handler; we break on it and
+        // revisit next tick (the handler clears it within its own poll interval).
+        loop {
+            let (pid, code, signaled) = match waitid(
+                Id::All,
+                WaitPidFlag::WEXITED | WaitPidFlag::WNOWAIT | WaitPidFlag::WNOHANG,
+            ) {
+                Ok(WaitStatus::Exited(pid, status)) => (pid, status, false),
+                Ok(WaitStatus::Signaled(pid, signal, _)) => (pid, 128 + signal as i32, true),
+                // No exited child right now: stop draining and poll again later.
+                Ok(_) => break,
+                // No children at all (container already gone): nothing to supervise.
+                Err(nix::errno::Errno::ECHILD) => return Ok(()),
+                // Transient error: retry on the next tick.
+                Err(_) => break,
+            };
+
+            if pid == container_pid {
+                // The container drives the VM lifecycle: reap it and exit with its
+                // status so the host (and detached `run -d wait`) sees the real code.
+                let _ = waitpid(pid, None);
+                if signaled {
+                    error!("Container process {} terminated (exit code {})", pid, code);
+                } else {
+                    info!("Container process {} exited with status {}", pid, code);
+                }
+                persist_exit_code(code);
+                process::exit(code);
+            } else if reaper::is_managed(pid.as_raw()) {
+                // Owned by an exec/PTY handler, which reaps it for the real status.
+                // Stop draining; it clears shortly and we revisit on the next tick.
+                break;
+            } else {
+                // Orphan (reparented grandchild) or the sidecar: reap it here so it
+                // does not linger as a zombie. Keep draining for more.
+                let _ = waitpid(pid, Some(WaitPidFlag::WNOHANG));
+            }
+        }
+
+        std::thread::sleep(std::time::Duration::from_millis(100));
+    }
+}
+
+/// Non-Linux development stub: just wait for the container process to exit.
+#[cfg(not(target_os = "linux"))]
+fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box<dyn std::error::Error>> {
+    use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
+
+    loop {
+        if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) {
+            return Ok(());
+        }
         match waitpid(container_pid, Some(WaitPidFlag::WNOHANG)) {
-            Ok(WaitStatus::Exited(pid, status)) => {
-                info!("Container process {} exited with status {}", pid, status);
+            Ok(WaitStatus::Exited(_, status)) => {
                 persist_exit_code(status);
                 process::exit(status);
             }
-            Ok(WaitStatus::Signaled(pid, signal, _)) => {
-                error!("Container process {} killed by signal {:?}", pid, signal);
+            Ok(WaitStatus::Signaled(_, signal, _)) => {
                 persist_exit_code(128 + signal as i32);
                 process::exit(128 + signal as i32);
             }
             Ok(WaitStatus::StillAlive) => {
                 std::thread::sleep(std::time::Duration::from_millis(100));
             }
-            Ok(_) => {
-                // Other status, continue waiting
-            }
-            Err(nix::errno::Errno::ECHILD) => {
-                info!("Container process {} is no longer a child", container_pid);
-                break;
-            }
-            Err(e) => {
-                return Err(format!("waitpid failed: {}", e).into());
-            }
+            Ok(_) => {}
+            Err(_) => break,
         }
     }
-
     Ok(())
 }
 
@@ -1036,6 +1103,9 @@ fn persist_exit_code(code: i32) {
 }
 
 /// Perform graceful shutdown: forward SIGTERM to children, wait, then force-kill.
+/// Only the Linux supervision loop calls this (the non-Linux dev stub just returns
+/// or exits the process), so it is gated to avoid a dead-code warning on macOS.
+#[cfg(target_os = "linux")]
 fn graceful_shutdown(timeout_ms: u64) {
     // Step 1: Send SIGTERM to all processes (except ourselves, PID 1)
     #[cfg(target_os = "linux")]