From c44e4a49e7782d7fd45d8930746b44eb3029dc8d Mon Sep 17 00:00:00 2001
From: Roy Lin <roylin@a3s.box>
Date: Thu, 11 Jun 2026 10:47:46 +0800
Subject: [PATCH 1/2] refactor(init): early-bind vsock servers + event-driven
 readiness (issue #3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restructures the exec/PTY readiness path so boot waits for a real readiness
EVENT bounded by VM liveness, instead of guessing a fixed timeout — replacing
the interim 10s→30s band-aid.

P1 — bind early, serve late. Split exec_server/pty_server into
bind_*()->Listener (pure socket/bind/listen syscalls) and serve_*(listener)
(the accept loop). run_init now binds both vsock listeners on the main thread
right after the filesystem mounts (Step 2.6), BEFORE the slow network bring-up
and the container fork, then spawns the accept loops after the fork (Step 8).
Binding adds no thread, so the single-threaded-at-fork invariant that keeps
spawn_isolated safe is preserved. The listen backlog is filled from boot, so a
host connect QUEUES instead of being refused — this removes the `run -it` PTY
"Connection refused". CLOEXEC keeps the forked container from inheriting the
listeners.

P2 — event-driven, liveness-bounded readiness. Early binding makes the host
`connect` succeed immediately, so heartbeat()'s (timeout-less) read would block
until the guest's accept loop runs. wait_for_exec_ready is rewritten to bound
each connect+heartbeat attempt (tokio timeout), return at once when the VM
exits (has_exited, zombie-aware — fast-exit containers never stall), and treat
a large absolute cap purely as a backstop against a wedged-but-alive guest. A
healthy guest passes the heartbeat the moment its accept loop runs, however
late in a slow cold boot — so the false "heartbeat failed" warning is gone
without a fixed budget to outrun.

Also folds in the issue-#3 cleanups: dead `/sbin/init` BOX_EXEC_EXEC default →
`/bin/sh`, and the stale resolve_oci_entrypoint doc comment.

Deferred: an explicit guest→host "ready" beacon on a new vsock port was
considered but NOT wired — port_forward uses add_vsock_port(listen=true) with a
guest connect-out, which contradicts the assumed listen=false direction for
guest→host, and that is only verifiable on KVM. The liveness-bounded heartbeat
achieves the same correctness without guessing cross-process vsock semantics.
Supersedes the interim 30s fix (PR #14).
---
 src/guest/init/src/exec_server.rs |  96 ++++++++++++++++++---------
 src/guest/init/src/main.rs        |  33 ++++++---
 src/guest/init/src/pty_server.rs  |  90 ++++++++++++++++---------
 src/runtime/src/vm/ready.rs       | 107 ++++++++++++++----------------
 src/runtime/src/vm/spec.rs        |   3 +-
 5 files changed, 201 insertions(+), 128 deletions(-)
diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs
index a593ef7a..eb7da21a 100644
--- a/src/guest/init/src/exec_server.rs
+++ b/src/guest/init/src/exec_server.rs
@@ -67,52 +67,88 @@ const EXEC_CONTROL_FLUSH: &[u8] = b"flush";
 /// match the host's `EXEC_FLUSH_ACK` in `runtime/src/grpc/exec.rs`.
 const EXEC_FLUSH_ACK: &[u8] = b"flush-ack";
 
-/// Run the exec server, listening on vsock port 4089.
+/// A bound, listening exec-server socket — produced by [`bind_exec_server`] and
+/// consumed by [`serve_exec_server`].
 ///
-/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`.
-/// On non-Linux platforms, this is a no-op (development stub).
-pub fn run_exec_server() -> Result<(), Box<dyn std::error::Error>> {
-    info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT);
+/// Splitting bind from serve lets guest-init bind the exec vsock port EARLY on
+/// the main thread (pure socket/bind/listen syscalls — no thread spawn, so the
+/// later single-threaded container `fork()` stays fork-safe) while the accept
+/// loop runs afterwards in its own thread. Binding early fills the listen
+/// backlog from the start of boot, so a host connect QUEUES instead of being
+/// refused while the slower boot steps (network, container spawn) finish — this
+/// removes the "Connection refused" / heartbeat race of issue #3. On non-Linux
+/// this is an inert placeholder so callers stay platform-agnostic.
+#[cfg(target_os = "linux")]
+pub struct ExecListener(std::os::fd::OwnedFd);
+#[cfg(not(target_os = "linux"))]
+pub struct ExecListener;
 
+/// Bind + listen the exec vsock socket (port 4089). Pure socket syscalls, safe
+/// to call on the main thread before the container fork.
+pub fn bind_exec_server() -> Result<ExecListener, Box<dyn std::error::Error>> {
     #[cfg(target_os = "linux")]
     {
-        run_vsock_server()?;
+        use nix::sys::socket::{
+            bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
+        };
+        use std::os::fd::AsRawFd;
+
+        let sock_fd = socket(
+            AddressFamily::Vsock,
+            SockType::Stream,
+            SockFlag::empty(),
+            None,
+        )?;
+
+        // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on
+        // macOS — and so the forked container never inherits the listening socket.
+        unsafe {
+            libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
+        }
+
+        let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT);
+        bind(sock_fd.as_raw_fd(), &addr)?;
+        listen(&sock_fd, Backlog::new(4)?)?;
+
+        info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT);
+        Ok(ExecListener(sock_fd))
     }
 
     #[cfg(not(target_os = "linux"))]
     {
         info!("Exec server not available on non-Linux platform (development mode)");
+        Ok(ExecListener)
     }
-
-    Ok(())
 }
 
-/// Linux vsock server implementation.
-#[cfg(target_os = "linux")]
-fn run_vsock_server() -> Result<(), Box<dyn std::error::Error>> {
-    use nix::sys::socket::{
-        accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
-    };
-    use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
-    use tracing::error;
-
-    let sock_fd = socket(
-        AddressFamily::Vsock,
-        SockType::Stream,
-        SockFlag::empty(),
-        None,
-    )?;
+/// Run the exec accept loop on an already-bound listener. Intended to run on its
+/// own thread for the VM's lifetime; never returns under normal operation.
+pub fn serve_exec_server(listener: ExecListener) -> Result<(), Box<dyn std::error::Error>> {
+    #[cfg(target_os = "linux")]
+    {
+        run_accept_loop(listener.0)
+    }
 
-    // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS
-    unsafe {
-        libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
+    #[cfg(not(target_os = "linux"))]
+    {
+        let _ = listener;
+        Ok(())
     }
+}
 
-    let addr = VsockAddr::new(libc::VMADDR_CID_ANY, EXEC_VSOCK_PORT);
-    bind(sock_fd.as_raw_fd(), &addr)?;
-    listen(&sock_fd, Backlog::new(4)?)?;
+/// Bind then serve in one call. Kept for callers that don't need the early-bind
+/// split (e.g. tests); guest-init's boot path uses `bind_*` + `serve_*` directly.
+pub fn run_exec_server() -> Result<(), Box<dyn std::error::Error>> {
+    info!("Starting exec server on vsock port {}", EXEC_VSOCK_PORT);
+    serve_exec_server(bind_exec_server()?)
+}
 
-    info!("Exec server listening on vsock port {}", EXEC_VSOCK_PORT);
+/// The exec server accept loop.
+#[cfg(target_os = "linux")]
+fn run_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box<dyn std::error::Error>> {
+    use nix::sys::socket::accept;
+    use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
+    use tracing::error;
 
     loop {
         match accept(sock_fd.as_raw_fd()) {
diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs
index b765163c..9fb85523 100644
--- a/src/guest/init/src/main.rs
+++ b/src/guest/init/src/main.rs
@@ -43,8 +43,11 @@ impl ExecConfig {
     /// - BOX_EXEC_ENV_*: container environment variables
     /// - BOX_EXEC_WORKDIR: working directory (defaults to "/")
     fn from_env() -> Self {
-        let executable =
-            std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/sbin/init".to_string());
+        // The runtime always sets BOX_EXEC_EXEC when guest-init is PID 1
+        // (runtime/src/vm/spec.rs), so this default is only a defensive fallback.
+        // Use /bin/sh — universal across distros — never /sbin/init, which does
+        // not exist on Alpine and was the original cause of issue #3.
+        let executable = std::env::var("BOX_EXEC_EXEC").unwrap_or_else(|_| "/bin/sh".to_string());
 
         // Parse args from individual env vars (BOX_EXEC_ARGC + BOX_EXEC_ARG_0..N)
         let args: Vec<String> = match std::env::var("BOX_EXEC_ARGC")
@@ -268,6 +271,18 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
     // Step 2.5: Mount tmpfs volumes
     mount_tmpfs_volumes()?;
 
+    // Step 2.6: Bind the exec (vsock 4089) and PTY (vsock 4090) listening sockets
+    // NOW, before the slower network bring-up and container spawn below. These are
+    // pure socket/bind/listen syscalls on this (still single-threaded) main thread,
+    // so the later container fork stays fork-safe; the accept loops are spawned as
+    // threads only after the fork (Step 8). Binding this early fills the listen
+    // backlog from the start of boot, so a host connect QUEUES instead of being
+    // refused while network setup and the container spawn finish — closing the
+    // exec/PTY startup race of issue #3. CLOEXEC on the fds keeps the forked
+    // container from inheriting the listeners.
+    let exec_listener = exec_server::bind_exec_server()?;
+    let pty_listener = pty_server::bind_pty_server()?;
+
     // Step 3: Configure guest network (if passt mode is active).
     // Network setup may write /etc/resolv.conf — must run before read-only remount.
     network::configure_guest_network()?;
@@ -362,9 +377,11 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
 
     expose_container_env_to_exec(&exec_config);
 
-    // Step 8: Start exec server in background thread
-    std::thread::spawn(|| {
-        if let Err(e) = exec_server::run_exec_server() {
+    // Step 8: Start the exec server accept loop on the socket bound in Step 2.6.
+    // (set_container_pid above ran first, so a host signal-main frame still finds
+    // the PID once the loop is serving.)
+    std::thread::spawn(move || {
+        if let Err(e) = exec_server::serve_exec_server(exec_listener) {
             error!("Exec server failed: {}", e);
         }
     });
@@ -376,9 +393,9 @@ fn run_init() -> Result<(), Box<dyn std::error::Error>> {
         }
     });
 
-    // Step 8.5: Start PTY server in background thread
-    std::thread::spawn(|| {
-        if let Err(e) = pty_server::run_pty_server() {
+    // Step 8.5: Start the PTY server accept loop on the socket bound in Step 2.6.
+    std::thread::spawn(move || {
+        if let Err(e) = pty_server::serve_pty_server(pty_listener) {
             error!("PTY server failed: {}", e);
         }
     });
diff --git a/src/guest/init/src/pty_server.rs b/src/guest/init/src/pty_server.rs
index c50b41f0..c4572570 100644
--- a/src/guest/init/src/pty_server.rs
+++ b/src/guest/init/src/pty_server.rs
@@ -15,51 +15,81 @@ use tracing::{error, warn};
 #[cfg(target_os = "linux")]
 use crate::user::parse_process_user;
 
-/// Run the PTY server, listening on vsock port 4090.
-///
-/// On Linux, binds to `AF_VSOCK` with `VMADDR_CID_ANY`.
-/// On non-Linux platforms, this is a no-op (development stub).
-pub fn run_pty_server() -> Result<(), Box<dyn std::error::Error>> {
-    info!("Starting PTY server on vsock port {}", PTY_VSOCK_PORT);
+/// A bound, listening PTY-server socket — produced by [`bind_pty_server`] and
+/// consumed by [`serve_pty_server`]. Same early-bind rationale as
+/// [`crate::exec_server::ExecListener`]: bind on the main thread before the
+/// container fork (fork-safe, fills the listen backlog), accept later in a
+/// thread. On non-Linux this is an inert placeholder.
+#[cfg(target_os = "linux")]
+pub struct PtyListener(std::os::fd::OwnedFd);
+#[cfg(not(target_os = "linux"))]
+pub struct PtyListener;
 
+/// Bind + listen the PTY vsock socket (port 4090). Pure socket syscalls, safe to
+/// call on the main thread before the container fork.
+pub fn bind_pty_server() -> Result<PtyListener, Box<dyn std::error::Error>> {
     #[cfg(target_os = "linux")]
     {
-        run_vsock_pty_server()?;
+        use nix::sys::socket::{
+            bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
+        };
+        use std::os::fd::AsRawFd;
+
+        let sock_fd = socket(
+            AddressFamily::Vsock,
+            SockType::Stream,
+            SockFlag::empty(),
+            None,
+        )?;
+
+        // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on
+        // macOS — and so the forked container never inherits the listening socket.
+        unsafe {
+            libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
+        }
+
+        let addr = VsockAddr::new(libc::VMADDR_CID_ANY, PTY_VSOCK_PORT);
+        bind(sock_fd.as_raw_fd(), &addr)?;
+        listen(&sock_fd, Backlog::new(4)?)?;
+
+        info!("PTY server listening on vsock port {}", PTY_VSOCK_PORT);
+        Ok(PtyListener(sock_fd))
     }
 
     #[cfg(not(target_os = "linux"))]
     {
         info!("PTY server not available on non-Linux platform (development mode)");
+        Ok(PtyListener)
     }
-
-    Ok(())
 }
 
-/// Linux vsock PTY server implementation.
-#[cfg(target_os = "linux")]
-fn run_vsock_pty_server() -> Result<(), Box<dyn std::error::Error>> {
-    use nix::sys::socket::{
-        accept, bind, listen, socket, AddressFamily, Backlog, SockFlag, SockType, VsockAddr,
-    };
-    use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
-
-    let sock_fd = socket(
-        AddressFamily::Vsock,
-        SockType::Stream,
-        SockFlag::empty(),
-        None,
-    )?;
+/// Run the PTY accept loop on an already-bound listener. Intended to run on its
+/// own thread for the VM's lifetime; never returns under normal operation.
+pub fn serve_pty_server(listener: PtyListener) -> Result<(), Box<dyn std::error::Error>> {
+    #[cfg(target_os = "linux")]
+    {
+        run_pty_accept_loop(listener.0)
+    }
 
-    // Set CLOEXEC manually since SOCK_CLOEXEC isn't available in nix 0.29 on macOS
-    unsafe {
-        libc::fcntl(sock_fd.as_raw_fd(), libc::F_SETFD, libc::FD_CLOEXEC);
+    #[cfg(not(target_os = "linux"))]
+    {
+        let _ = listener;
+        Ok(())
     }
+}
 
-    let addr = VsockAddr::new(libc::VMADDR_CID_ANY, PTY_VSOCK_PORT);
-    bind(sock_fd.as_raw_fd(), &addr)?;
-    listen(&sock_fd, Backlog::new(4)?)?;
+/// Bind then serve in one call. Kept for callers that don't need the early-bind
+/// split; guest-init's boot path uses `bind_*` + `serve_*` directly.
+pub fn run_pty_server() -> Result<(), Box<dyn std::error::Error>> {
+    info!("Starting PTY server on vsock port {}", PTY_VSOCK_PORT);
+    serve_pty_server(bind_pty_server()?)
+}
 
-    info!("PTY server listening on vsock port {}", PTY_VSOCK_PORT);
+/// The PTY server accept loop.
+#[cfg(target_os = "linux")]
+fn run_pty_accept_loop(sock_fd: std::os::fd::OwnedFd) -> Result<(), Box<dyn std::error::Error>> {
+    use nix::sys::socket::accept;
+    use std::os::fd::{AsRawFd, FromRawFd, OwnedFd};
 
     loop {
         match accept(sock_fd.as_raw_fd()) {
diff --git a/src/runtime/src/vm/ready.rs b/src/runtime/src/vm/ready.rs
index 7cc48ed5..8bac1871 100644
--- a/src/runtime/src/vm/ready.rs
+++ b/src/runtime/src/vm/ready.rs
@@ -30,91 +30,80 @@ impl VmManager {
         Ok(())
     }
 
-    /// Wait for the exec server socket to become ready.
+    /// Wait for the exec server to become ready (a Frame Heartbeat round-trip).
     ///
-    /// Polls for the socket file to appear, then verifies the exec server
-    /// is healthy via a Frame Heartbeat round-trip. This is best-effort:
-    /// if the exec socket never appears (e.g., older guest init without
-    /// exec server), the VM still boots successfully.
+    /// Waits for the readiness EVENT — a successful heartbeat — bounded by VM
+    /// liveness, instead of guessing a fixed timeout. guest-init binds the exec
+    /// socket early (before the slow network bring-up and container spawn), so the
+    /// host connect succeeds immediately and the heartbeat passes the moment the
+    /// guest's accept loop runs — however late in a slow cold boot. Each attempt
+    /// is individually time-bounded (the early-bound socket makes a host `connect`
+    /// succeed and then block on read until the guest accepts), the loop returns
+    /// at once if the VM has exited (a fast-exiting container never stalls), and a
+    /// large absolute cap is only a last-resort backstop against a wedged-but-alive
+    /// guest — not the expected wait. Best-effort: exec/attach also connect on
+    /// demand, so even a timed-out probe does not mean exec is unavailable.
     #[cfg(unix)]
     pub(crate) async fn wait_for_exec_ready(
         &mut self,
         exec_socket_path: &std::path::Path,
     ) -> Result<()> {
-        const MAX_WAIT_MS: u64 = 10000;
-        const POLL_INTERVAL_MS: u64 = 200;
+        use tokio::time::Duration;
+
+        // Per-attempt cap on one connect + heartbeat round-trip. guest-init binds
+        // the exec socket early, so the host `connect` succeeds as soon as the VM
+        // boots and `heartbeat()`'s read then blocks until the guest's accept loop
+        // runs; bounding each attempt keeps the loop checking VM liveness instead
+        // of hanging in that read.
+        const ATTEMPT_TIMEOUT: Duration = Duration::from_millis(500);
+        const POLL_INTERVAL: Duration = Duration::from_millis(200);
+        // Last-resort backstop against a wedged-but-alive guest that binds but
+        // never accepts. A healthy guest passes the heartbeat the instant its
+        // accept loop runs (however late), and an exited VM returns immediately
+        // below — so this cap is not the expected wait.
+        const MAX_WAIT_MS: u64 = 120_000;
 
         tracing::debug!(
             socket_path = %exec_socket_path.display(),
-            "Waiting for exec server socket"
+            "Waiting for exec server readiness"
         );
 
         let start = std::time::Instant::now();
 
-        // Phase 1: Wait for socket file to appear
         loop {
-            if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 {
-                tracing::warn!("Exec socket did not appear, exec will not be available");
-                return Ok(());
-            }
-
-            if exec_socket_path.exists() {
-                tracing::debug!("Exec socket file detected");
-                break;
-            }
-
-            // Check if VM is still running (has_exited treats a zombie shim as
-            // exited; is_running's kill(pid,0) would not).
+            // Return at once if the VM has already exited (zombie-aware: has_exited
+            // treats a zombie shim as exited, unlike is_running's kill(pid,0)). A
+            // fast-exiting container never stalls here.
             if let Some(ref handler) = *self.handler.read().await {
                 if handler.has_exited() {
-                    tracing::warn!("VM exited before exec socket appeared");
+                    tracing::debug!("VM exited before exec server became ready");
                     return Ok(());
                 }
             }
 
-            tokio::time::sleep(tokio::time::Duration::from_millis(POLL_INTERVAL_MS)).await;
-        }
-
-        // Phase 2: Connect and verify with Heartbeat health check
-        while start.elapsed().as_millis() < MAX_WAIT_MS as u128 {
-            // Stop waiting if the VM has already exited: the exec socket can
-            // appear during guest init and then vanish when a fast-exiting
-            // container shuts the VM down. The shim becomes a zombie the moment
-            // the VM halts, so use has_exited (zombie-aware) rather than
-            // is_running — without this, a container that exits before its first
-            // heartbeat stalls the whole boot for MAX_WAIT_MS (~10s), which hit
-            // every short-lived `run` that lost the heartbeat race and every
-            // monitor restart of a fast-exiting container.
-            if let Some(ref handler) = *self.handler.read().await {
-                if handler.has_exited() {
-                    tracing::debug!("VM exited before exec server became ready");
+            // One bounded connect + heartbeat attempt. A timeout (early-bound
+            // socket, guest not yet accepting) or any error just means "retry".
+            if let Ok(Ok(client)) =
+                tokio::time::timeout(ATTEMPT_TIMEOUT, ExecClient::connect(exec_socket_path)).await
+            {
+                if let Ok(Ok(true)) =
+                    tokio::time::timeout(ATTEMPT_TIMEOUT, client.heartbeat()).await
+                {
+                    tracing::debug!("Exec server heartbeat passed");
+                    self.exec_client = Some(client);
                     return Ok(());
                 }
             }
 
-            match ExecClient::connect(exec_socket_path).await {
-                Ok(client) => match client.heartbeat().await {
-                    Ok(true) => {
-                        tracing::debug!("Exec server heartbeat passed");
-                        self.exec_client = Some(client);
-                        return Ok(());
-                    }
-                    Ok(false) => {
-                        tracing::debug!("Exec server heartbeat failed, retrying");
-                    }
-                    Err(e) => {
-                        tracing::debug!(error = %e, "Exec heartbeat error, retrying");
-                    }
-                },
-                Err(e) => {
-                    tracing::debug!(error = %e, "Exec connect failed, retrying");
-                }
+            if start.elapsed().as_millis() >= MAX_WAIT_MS as u128 {
+                tracing::warn!(
+                    timeout_ms = MAX_WAIT_MS,
+                    "Exec server did not become ready within the safety cap; exec/attach connect on demand and may still succeed once the guest finishes starting"
+                );
+                return Ok(());
             }
 
-            tokio::time::sleep(tokio::time::Duration::from_millis(POLL_INTERVAL_MS)).await;
+            tokio::time::sleep(POLL_INTERVAL).await;
         }
-
-        tracing::warn!("Exec socket appeared but heartbeat failed, exec will not be available");
-        Ok(())
     }
 }
diff --git a/src/runtime/src/vm/spec.rs b/src/runtime/src/vm/spec.rs
index 7b66429f..f38bca88 100644
--- a/src/runtime/src/vm/spec.rs
+++ b/src/runtime/src/vm/spec.rs
@@ -342,7 +342,8 @@ impl VmManager {
     /// - If `entrypoint_override` is set, it replaces the OCI ENTRYPOINT
     /// - If ENTRYPOINT is set: executable = ENTRYPOINT[0], args = ENTRYPOINT[1:] + CMD
     /// - If only CMD is set: executable = CMD[0], args = CMD[1:]
-    /// - If neither: fall back to `/sbin/init`
+    /// - If neither: fall back to `/bin/sh` (universal across distros; `/sbin/init`
+    ///   does not exist on Alpine, which was the original cause of issue #3)
     /// - If `cmd_override` is non-empty, it replaces the OCI CMD
     ///
     /// Paths are used as-is since the OCI image is always extracted at rootfs root.

From f1db2cf9a05dcef7f75b771a333384ec2b8b8ca9 Mon Sep 17 00:00:00 2001
From: Roy Lin <roylin@a3s.box>
Date: Thu, 11 Jun 2026 12:09:32 +0800
Subject: [PATCH 2/2] =?UTF-8?q?fix(init):=20real=20PID1=20reaper=20?=
 =?UTF-8?q?=E2=80=94=20reap=20orphans=20without=20stealing=20exec/PTY=20ex?=
 =?UTF-8?q?it=20codes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

guest-init runs as PID 1 but only waited on the container pid, so reparented
grandchildren and the sidecar were never reaped and accumulated as zombies for
the VM's lifetime. The earlier code couldn't just waitpid(-1): that races with
the exec/PTY handlers, which waitpid their own children to read the real exit
code — a stolen child makes the handler see ECHILD and report a bogus exit 0
(exec_server.rs). That tension is exactly why a prior fix narrowed the loop to
waitpid(container_pid), trading the zombie leak for correct exec codes.

Resolve both with a small reaper registry:
- New `reaper` module: handlers mark their child pid MANAGED across the spawn
  (the lock is held across fork, closing the spawn/register race for fast-exiting
  commands like `exec -- false`); an RAII guard unregisters on every return path.
- The supervision loop now peeks exited children non-destructively with
  `waitid(WNOWAIT)` and routes: the container -> reap + propagate exit code (VM
  lifecycle, unchanged); MANAGED children -> left for their handler to reap (real
  exit codes preserved); everything else (orphans + sidecar) -> reaped here.
- exec one-shot + streaming spawns and the PTY fork register their children;
  their existing waitpid/try_wait paths are unchanged.

Fixes the zombie leak and makes the long-standing "reaped by the zombie-reaper
loop" comments true again, with no regression to exec/PTY or container exit
codes. Unit-tested (reaper registry); needs KVM verification of exec exit codes
+ orphan reaping. Builds on P1+P2 (issue #3).
---
 src/guest/init/src/exec_server.rs |  14 +++-
 src/guest/init/src/lib.rs         |   1 +
 src/guest/init/src/main.rs        |  99 +++++++++++++++++++------
 src/guest/init/src/pty_server.rs  |   5 ++
 src/guest/init/src/reaper.rs      | 119 ++++++++++++++++++++++++++++++
 5 files changed, 211 insertions(+), 27 deletions(-)
 create mode 100644 src/guest/init/src/reaper.rs

diff --git a/src/guest/init/src/exec_server.rs b/src/guest/init/src/exec_server.rs
index eb7da21a..5e9f4c3b 100644
--- a/src/guest/init/src/exec_server.rs
+++ b/src/guest/init/src/exec_server.rs
@@ -751,8 +751,12 @@ fn execute_command(
         Err(output) => return output,
     };
 
-    let mut child = match command.spawn() {
-        Ok(child) => child,
+    // Spawn under the reaper registry: the pid is marked MANAGED before the PID 1
+    // supervision loop can see it, so the loop leaves this child for us to reap
+    // (and read its real exit code) instead of stealing it. The guard unregisters
+    // the pid when this function returns (all paths).
+    let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) {
+        Ok(pair) => pair,
         Err(e) => {
             return ExecOutput {
                 stdout: vec![],
@@ -1043,8 +1047,10 @@ fn execute_command_streaming(
         }
     };
 
-    let mut child = match command.spawn() {
-        Ok(child) => child,
+    // Spawn under the reaper registry (see one-shot path) so PID 1 leaves this
+    // streaming child for us to reap; the guard unregisters on return.
+    let (mut child, _reap_guard) = match crate::reaper::spawn_managed(|| command.spawn()) {
+        Ok(pair) => pair,
         Err(e) => {
             let output = ExecOutput {
                 stdout: vec![],
diff --git a/src/guest/init/src/lib.rs b/src/guest/init/src/lib.rs
index 57b05efc..f6a6f348 100644
--- a/src/guest/init/src/lib.rs
+++ b/src/guest/init/src/lib.rs
@@ -14,6 +14,7 @@ pub mod namespace;
 pub mod network;
 pub mod port_forward;
 pub mod pty_server;
+pub mod reaper;
 pub mod user;
 
 pub use namespace::{spawn_isolated, NamespaceConfig, NamespaceError};
diff --git a/src/guest/init/src/main.rs b/src/guest/init/src/main.rs
index 9fb85523..72fb1fe7 100644
--- a/src/guest/init/src/main.rs
+++ b/src/guest/init/src/main.rs
@@ -986,55 +986,105 @@ fn remount_rootfs_readonly() -> Result<(), Box<dyn std::error::Error>> {
     Ok(())
 }
 
-/// Wait for the main container process.
+/// Supervise children as PID 1: propagate the container's exit, and reap orphans.
 ///
-/// Exec and PTY requests run in other guest-init threads and wait for their
-/// own child processes. The main supervision loop must not call waitpid(-1),
-/// otherwise it can reap those children before the request handler observes
-/// their exit status.
+/// Exec and PTY request handlers reap their OWN children (each `waitpid`s a
+/// specific pid) to read the real exit status, so this loop must not steal them
+/// with a blind `waitpid(-1)`. It peeks exited children non-destructively with
+/// `waitid(WNOWAIT)` and, via the [`reaper`](a3s_box_guest_init::reaper)
+/// registry, reaps only the container (→ VM lifecycle / exit code) and UNMANAGED
+/// children — reparented grandchildren and the sidecar — leaving handler-managed
+/// children for their handler. This propagates the container exit code AND fixes
+/// the zombie leak (orphans were previously never reaped until shutdown).
+#[cfg(target_os = "linux")]
 fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box<dyn std::error::Error>> {
-    use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
+    use a3s_box_guest_init::reaper;
+    use nix::sys::wait::{waitid, waitpid, Id, WaitPidFlag, WaitStatus};
 
     /// Maximum time to wait for children after forwarding SIGTERM (5 seconds).
     const CHILD_SHUTDOWN_TIMEOUT_MS: u64 = 5000;
 
-    info!("Waiting for container process {}", container_pid);
+    info!(
+        "Supervising children as PID 1; container PID {}",
+        container_pid
+    );
 
     loop {
-        // Check if shutdown was requested via SIGTERM
         if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) {
             info!("SIGTERM received, initiating graceful shutdown");
             graceful_shutdown(CHILD_SHUTDOWN_TIMEOUT_MS);
             return Ok(());
         }
 
+        // Drain currently-exited children. `WNOWAIT` peeks without reaping, so a
+        // handler-managed child stays reapable by its handler; we break on it and
+        // revisit next tick (the handler clears it within its own poll interval).
+        loop {
+            let (pid, code, signaled) = match waitid(
+                Id::All,
+                WaitPidFlag::WEXITED | WaitPidFlag::WNOWAIT | WaitPidFlag::WNOHANG,
+            ) {
+                Ok(WaitStatus::Exited(pid, status)) => (pid, status, false),
+                Ok(WaitStatus::Signaled(pid, signal, _)) => (pid, 128 + signal as i32, true),
+                // No exited child right now: stop draining and poll again later.
+                Ok(_) => break,
+                // No children at all (container already gone): nothing to supervise.
+                Err(nix::errno::Errno::ECHILD) => return Ok(()),
+                // Transient error: retry on the next tick.
+                Err(_) => break,
+            };
+
+            if pid == container_pid {
+                // The container drives the VM lifecycle: reap it and exit with its
+                // status so the host (and detached `run -d wait`) sees the real code.
+                let _ = waitpid(pid, None);
+                if signaled {
+                    error!("Container process {} terminated (exit code {})", pid, code);
+                } else {
+                    info!("Container process {} exited with status {}", pid, code);
+                }
+                persist_exit_code(code);
+                process::exit(code);
+            } else if reaper::is_managed(pid.as_raw()) {
+                // Owned by an exec/PTY handler, which reaps it for the real status.
+                // Stop draining; it clears shortly and we revisit on the next tick.
+                break;
+            } else {
+                // Orphan (reparented grandchild) or the sidecar: reap it here so it
+                // does not linger as a zombie. Keep draining for more.
+                let _ = waitpid(pid, Some(WaitPidFlag::WNOHANG));
+            }
+        }
+
+        std::thread::sleep(std::time::Duration::from_millis(100));
+    }
+}
+
+/// Non-Linux development stub: just wait for the container process to exit.
+#[cfg(not(target_os = "linux"))]
+fn wait_for_children(container_pid: nix::unistd::Pid) -> Result<(), Box<dyn std::error::Error>> {
+    use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
+
+    loop {
+        if SHUTDOWN_REQUESTED.load(Ordering::SeqCst) {
+            return Ok(());
+        }
         match waitpid(container_pid, Some(WaitPidFlag::WNOHANG)) {
-            Ok(WaitStatus::Exited(pid, status)) => {
-                info!("Container process {} exited with status {}", pid, status);
+            Ok(WaitStatus::Exited(_, status)) => {
                 persist_exit_code(status);
                 process::exit(status);
             }
-            Ok(WaitStatus::Signaled(pid, signal, _)) => {
-                error!("Container process {} killed by signal {:?}", pid, signal);
+            Ok(WaitStatus::Signaled(_, signal, _)) => {
                 persist_exit_code(128 + signal as i32);
                 process::exit(128 + signal as i32);
             }
             Ok(WaitStatus::StillAlive) => {
                 std::thread::sleep(std::time::Duration::from_millis(100));
             }
-            Ok(_) => {
-                // Other status, continue waiting
-            }
-            Err(nix::errno::Errno::ECHILD) => {
-                info!("Container process {} is no longer a child", container_pid);
-                break;
-            }
-            Err(e) => {
-                return Err(format!("waitpid failed: {}", e).into());
-            }
+            Ok(_) => {}
+            Err(_) => break,
         }
     }
-
     Ok(())
 }
 
@@ -1053,6 +1103,9 @@ fn persist_exit_code(code: i32) {
 }
 
 /// Perform graceful shutdown: forward SIGTERM to children, wait, then force-kill.
+/// Only the Linux supervision loop drives this (the non-Linux dev stub exits the
+/// process directly), so it is gated to avoid a dead-code warning on macOS.
+#[cfg(target_os = "linux")]
 fn graceful_shutdown(timeout_ms: u64) {
     // Step 1: Send SIGTERM to all processes (except ourselves, PID 1)
     #[cfg(target_os = "linux")]
diff --git a/src/guest/init/src/pty_server.rs b/src/guest/init/src/pty_server.rs
index c4572570..b1648f45 100644
--- a/src/guest/init/src/pty_server.rs
+++ b/src/guest/init/src/pty_server.rs
@@ -246,6 +246,11 @@ fn handle_pty_connection(fd: std::os::fd::OwnedFd) -> Result<(), Box<dyn std::er
             // Parent: relay data between vsock and PTY master
             drop(slave_fd);
 
+            // Register the PTY child with the reaper so PID 1 leaves it for us to
+            // reap (relay_pty_data waitpid's it for the real exit code). The guard
+            // unregisters when this branch returns.
+            let _reap_guard = crate::reaper::manage_pid(child.as_raw());
+
             let exit_code = relay_pty_data(&mut stream, &master_fd, child);
 
             // Send exit frame
diff --git a/src/guest/init/src/reaper.rs b/src/guest/init/src/reaper.rs
new file mode 100644
index 00000000..889e1c75
--- /dev/null
+++ b/src/guest/init/src/reaper.rs
@@ -0,0 +1,119 @@
+//! Central PID 1 child reaper registry.
+//!
+//! As PID 1, guest-init must reap orphaned/reparented grandchildren and the
+//! sidecar so they do not accumulate as zombies for the VM's lifetime. But it
+//! must NOT steal the exit status of the children that the exec/PTY request
+//! handlers are waiting on: each handler reaps its own child (via `waitpid` on a
+//! specific pid) to read the real exit code, and a `waitpid(-1)` in the
+//! supervision loop would race and reap it first — making the handler observe
+//! `ECHILD` and report a bogus exit code 0.
+//!
+//! This registry lets the supervision loop tell the two apart. A handler marks
+//! its child pid MANAGED across the spawn (so the loop cannot reap-decide the new
+//! pid before it is registered), and the loop peeks exited children
+//! non-destructively (`waitid` with `WNOWAIT`): it reaps only the container (→
+//! lifecycle) and unmanaged children (true orphans + the sidecar), and leaves
+//! managed children for their handler to reap.
+
+use std::sync::Mutex;
+
+/// PIDs currently owned by an exec/PTY request handler (it will reap them).
+/// A small `Vec` — only a handful of in-flight exec/PTY children at once, so the
+/// linear scans are cheaper than a hashed set and `Vec::new()` is a `const` init.
+static MANAGED: Mutex<Vec<i32>> = Mutex::new(Vec::new());
+
+/// RAII guard: removes its pid from the MANAGED set when dropped (i.e. when the
+/// handler returns, having reaped its own child). Drop runs on every handler
+/// exit path — normal, timeout-kill, and error — so the set never leaks.
+pub struct ManagedChild(i32);
+
+impl Drop for ManagedChild {
+    fn drop(&mut self) {
+        // Recover from a poisoned lock rather than panicking inside PID 1.
+        let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner());
+        managed.retain(|&p| p != self.0);
+    }
+}
+
+/// Spawn a child while atomically marking its pid MANAGED.
+///
+/// The MANAGED lock is held across the `spawn` call, so the supervision loop
+/// cannot conclude the new pid is unmanaged (and reap it as an orphan) in the
+/// window between fork and registration — which closes the race for commands
+/// that exit almost immediately (e.g. `exec … -- false`). Holding the lock
+/// across the fork is safe: the forked child runs only async-signal-safe code
+/// before `exec` and never touches this mutex, so its inherited (locked) copy is
+/// discarded at `exec` without deadlock.
+pub fn spawn_managed<F>(spawn: F) -> std::io::Result<(std::process::Child, ManagedChild)>
+where
+    F: FnOnce() -> std::io::Result<std::process::Child>,
+{
+    let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner());
+    let child = spawn()?;
+    let pid = child.id() as i32;
+    managed.push(pid);
+    drop(managed);
+    Ok((child, ManagedChild(pid)))
+}
+
+/// Mark an already-forked pid MANAGED (raw-fork callers, e.g. the PTY server).
+/// Call immediately after `fork` in the parent. Returns the RAII guard.
+pub fn manage_pid(pid: i32) -> ManagedChild {
+    let mut managed = MANAGED.lock().unwrap_or_else(|e| e.into_inner());
+    if !managed.contains(&pid) {
+        managed.push(pid);
+    }
+    drop(managed);
+    ManagedChild(pid)
+}
+
+/// Whether `pid` is owned by a request handler (the loop must not reap it).
+pub fn is_managed(pid: i32) -> bool {
+    MANAGED
+        .lock()
+        .map(|m| m.contains(&pid))
+        .unwrap_or_else(|e| e.into_inner().contains(&pid))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Distinct synthetic pids per test so the shared MANAGED static doesn't
+    // collide across parallel test threads.
+    #[test]
+    fn manage_pid_registers_until_guard_dropped() {
+        let pid = 0x7fff_fff0;
+        assert!(!is_managed(pid));
+        {
+            let _guard = manage_pid(pid);
+            assert!(
+                is_managed(pid),
+                "pid should be managed while the guard lives"
+            );
+        }
+        assert!(!is_managed(pid), "guard drop must unregister the pid");
+    }
+
+    #[test]
+    fn unregistered_pid_is_not_managed() {
+        assert!(!is_managed(0x7fff_fff1));
+    }
+
+    #[test]
+    fn spawn_managed_marks_then_releases_real_child() {
+        // A child that exits immediately is the worst case the lock-across-spawn
+        // protects: it must still be MANAGED for the handler (here, the test) to
+        // reap, never the supervision loop.
+        let (mut child, guard) =
+            spawn_managed(|| std::process::Command::new("true").spawn()).expect("spawn true");
+        let pid = child.id() as i32;
+        assert!(
+            is_managed(pid),
+            "spawned pid must be registered before we observe it"
+        );
+        let _ = child.wait(); // the handler (test) reaps its own child
+        drop(guard);
+        assert!(!is_managed(pid), "guard drop must unregister after reap");
+    }
+}