From 4f39c2ab74193b1e29126c10fabb320f5393b31b Mon Sep 17 00:00:00 2001 From: Roy Lin Date: Mon, 15 Jun 2026 19:35:09 +0800 Subject: [PATCH] fix: CRI CreateContainer TOCTOU orphan + kill -9 paused-box hang (audit #20/#24) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #20 (MEDIUM, leak): CreateContainer validated the sandbox Ready ONCE, then did unbounded async work (image resolve + prepare_container_rootfs, which yields) and never re-checked before add_container. A concurrent StopPodSandbox + RemovePodSandbox could tear the sandbox (and its rootfs tree) down in that window; CreateContainer would resume, recreate the rootfs under the now-deleted sandbox tree, and register an orphan container whose sandbox is gone — which nothing reaps. Re-validate the sandbox immediately before add_container; if it is no longer Ready, clean up the rootfs and return failed_precondition. #24 (MEDIUM, dos): `kill -9` of a PAUSED box skipped the resume (gated to signal != SIGKILL) and then routed SIGKILL through the guest exec server — but the guest is SIGSTOP-frozen and can never ack, and the read has no timeout, so the kill HANGS before reaching the host-SIGKILL fallback. SIGKILL cannot be caught, so route it straight to the host shim (force-killing the VM, which is what -9 wants) and resume a paused box first. cri 242 + cli kill tests pass; fmt + clippy clean. Found by the adversarial multi-agent audit (double-skeptic verified). --- src/cli/src/commands/kill.rs | 17 +++++++++++++++-- src/cri/src/runtime_service/mod.rs | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/cli/src/commands/kill.rs b/src/cli/src/commands/kill.rs index be9e592..db08a20 100644 --- a/src/cli/src/commands/kill.rs +++ b/src/cli/src/commands/kill.rs @@ -105,7 +105,10 @@ async fn kill_one( let box_id = record.id.clone(); let name = record.name.clone(); - if record.status == "paused" && is_stopping_signal(signal) && signal != SIGKILL { + // Resume a paused box before terminating it. This now also applies to + // SIGKILL: a paused box is SIGSTOP'd, and leaving it frozen would otherwise + // strand the VM (and the via-guest path below cannot reach a frozen guest). + if record.status == "paused" && is_stopping_signal(signal) { lifecycle::resume_paused_for_termination(&record, pid, "kill") .map_err(|error| -> Box { error.into() })?; } @@ -116,8 +119,18 @@ async fn kill_one( // signalling the host shim never reaches the container and would kill the // VM abruptly. Fall back to a host signal only when no guest exec server // is reachable (older box / socket gone). + // + // SIGKILL is the exception: it cannot be caught/handled, so routing it + // through the guest exec server is pointless AND it HANGS on a box whose + // guest was frozen (the read has no timeout). Force-kill the host shim + // directly — abruptly tearing down the VM is exactly what -9 wants. let exec_socket = crate::socket_paths::exec(&record); - if !process::deliver_signal_via_guest(&exec_socket, signal).await { + let delivered = if signal == SIGKILL { + false + } else { + process::deliver_signal_via_guest(&exec_socket, signal).await + }; + if !delivered { process::send_signal(pid, signal).map_err(|err| { format!( "Failed to send signal {signal} to box {} (PID {pid}): {err}", diff --git a/src/cri/src/runtime_service/mod.rs b/src/cri/src/runtime_service/mod.rs index df1f390..bd1f17a 100644 --- a/src/cri/src/runtime_service/mod.rs +++ b/src/cri/src/runtime_service/mod.rs @@ -1266,6 +1266,25 @@ impl RuntimeService for BoxRuntimeService { rootfs_guest_path, }; + // Re-validate the sandbox right before registering the container. The + // heavy async work above (image resolve + rootfs build, which yields the + // task) could have run concurrently with a StopPodSandbox/ + // RemovePodSandbox that tore the sandbox (and its rootfs tree) down. + // Without this re-check we would register an orphan container whose + // sandbox is gone — and whose rootfs we just recreated under a + // now-deleted sandbox tree — that nothing ever reaps. + match self.store.sandboxes.get(&container.sandbox_id).await { + Some(sb) if sb.state == SandboxState::Ready => {} + _ => { + self.cleanup_container_rootfs_path(&container.rootfs_path) + .await; + return Err(Status::failed_precondition(format!( + "Sandbox {} is no longer ready; aborting CreateContainer", + container.sandbox_id + ))); + } + } + self.store.add_container(container.clone()).await; self.emit_container_event( &container.id,