diff --git a/src/cli/src/commands/kill.rs b/src/cli/src/commands/kill.rs index be9e592..db08a20 100644 --- a/src/cli/src/commands/kill.rs +++ b/src/cli/src/commands/kill.rs @@ -105,7 +105,10 @@ async fn kill_one( let box_id = record.id.clone(); let name = record.name.clone(); - if record.status == "paused" && is_stopping_signal(signal) && signal != SIGKILL { + // Resume a paused box before terminating it. This now also applies to + // SIGKILL: a paused box is SIGSTOP'd, and leaving it frozen would otherwise + // strand the VM (and the via-guest path below cannot reach a frozen guest). + if record.status == "paused" && is_stopping_signal(signal) { lifecycle::resume_paused_for_termination(&record, pid, "kill") .map_err(|error| -> Box { error.into() })?; } @@ -116,8 +119,18 @@ async fn kill_one( // signalling the host shim never reaches the container and would kill the // VM abruptly. Fall back to a host signal only when no guest exec server // is reachable (older box / socket gone). + // + // SIGKILL is the exception: it cannot be caught/handled, so routing it + // through the guest exec server is pointless AND it HANGS on a box whose + // guest was frozen (the read has no timeout). Force-kill the host shim + // directly — abruptly tearing down the VM is exactly what -9 wants. let exec_socket = crate::socket_paths::exec(&record); - if !process::deliver_signal_via_guest(&exec_socket, signal).await { + let delivered = if signal == SIGKILL { + false + } else { + process::deliver_signal_via_guest(&exec_socket, signal).await + }; + if !delivered { process::send_signal(pid, signal).map_err(|err| { format!( "Failed to send signal {signal} to box {} (PID {pid}): {err}", diff --git a/src/cri/src/runtime_service/mod.rs b/src/cri/src/runtime_service/mod.rs index df1f390..bd1f17a 100644 --- a/src/cri/src/runtime_service/mod.rs +++ b/src/cri/src/runtime_service/mod.rs @@ -1266,6 +1266,25 @@ impl RuntimeService for BoxRuntimeService { rootfs_guest_path, }; + // Re-validate the sandbox right before registering the container. The + // heavy async work above (image resolve + rootfs build, which yields the + // task) could have run concurrently with a StopPodSandbox/ + // RemovePodSandbox that tore the sandbox (and its rootfs tree) down. + // Without this re-check we would register an orphan container whose + // sandbox is gone — and whose rootfs we just recreated under a + // now-deleted sandbox tree — that nothing ever reaps. + match self.store.sandboxes.get(&container.sandbox_id).await { + Some(sb) if sb.state == SandboxState::Ready => {} + _ => { + self.cleanup_container_rootfs_path(&container.rootfs_path) + .await; + return Err(Status::failed_precondition(format!( + "Sandbox {} is no longer ready; aborting CreateContainer", + container.sandbox_id + ))); + } + } + self.store.add_container(container.clone()).await; self.emit_container_event( &container.id,