From f173b2ab54e1ddc3a169b4997ef3d465b4aa4d7d Mon Sep 17 00:00:00 2001 From: Rhys Sullivan <39114868+RhysSullivan@users.noreply.github.com> Date: Sun, 28 Jun 2026 11:25:05 -0700 Subject: [PATCH 1/2] e2e: browse/promote authoring loop and cross-OS packaged-desktop targets Two additions to the e2e setup sharing one idea: the interfaces you develop the product with are the same ones the generated tests drive, and every run is watchable. Authoring loop (src/journey/, scripts/cli.ts): - `bun run cli browse <target> <step>` drives the live web UI one step at a time (real logged-in Chromium), each step replaying the whole flow and printing the page's controls plus a screenshot. Steps span the browser, a terminal (`run`), and HTTP (`request`), interleaved in one session. - `bun run cli promote <target> "<name>"` turns the recorded journey into a committed scenario() and runs it. One Step DSL is the source of truth for both live execution and codegen, so the generated test drives the same surfaces the exploration drove and cannot quietly diverge. Cross-OS packaged desktop (setup/desktop-*, desktop-vm/, src/vm/desktop.ts): - desktop-macos / desktop-linux / desktop-windows run the real electron-builder bundle inside a guest, drive it over a CDP tunnel, and film the console into runs/<target>/ alongside test.ts and step screenshots. One shared scenario and driver; only launch and capture differ per OS: macOS: autologin Aqua session, launchctl asuser, screencapture linux: Xvfb + openbox, xdotool window resize, ffmpeg x11grab windows: dockur (QEMU) interactive session, QEMU screendump - macOS and Linux auto-provision a tart guest and build the bundle locally (the executor binary cross-compiles via BUN_TARGET); Windows attaches to a dockur host configured through E2E_DESKTOP_WIN_* env. The desktop targets are not in the default test chain and skip honestly without a guest. 
Also: - Force tart SSH to password-only (PubkeyAuthentication=no, IdentitiesOnly=yes) so a loaded SSH agent does not exhaust the guest's MaxAuthTries, an intermittent failure the existing cli-{os} lanes also hit. - build-sidecar keys the executable-bit chmod on the build target, not the host, so a windows-target cross-build no longer fails looking for a unix executor binary. --- apps/desktop/scripts/build-sidecar.ts | 6 +- e2e/AGENTS.md | 74 +++++ e2e/desktop-vm/console-renders.test.ts | 75 ++++++ e2e/scripts/cli.ts | 326 ++++++++++++++++++++++ e2e/setup/desktop-linux.globalsetup.ts | 125 +++++++++ e2e/setup/desktop-macos.globalsetup.ts | 108 ++++++++ e2e/setup/desktop-vm.ts | 91 +++++++ e2e/setup/desktop-windows.globalsetup.ts | 106 ++++++++ e2e/src/journey/codegen.ts | 78 ++++++ e2e/src/journey/run.ts | 104 +++++++ e2e/src/journey/steps.ts | 236 ++++++++++++++++ e2e/src/vm/desktop.ts | 327 +++++++++++++++++++++++ e2e/src/vm/tart.ts | 18 +- e2e/targets/desktop.ts | 5 +- e2e/targets/registry.ts | 6 + e2e/vitest.config.ts | 20 ++ 16 files changed, 1702 insertions(+), 3 deletions(-) create mode 100644 e2e/desktop-vm/console-renders.test.ts create mode 100644 e2e/setup/desktop-linux.globalsetup.ts create mode 100644 e2e/setup/desktop-macos.globalsetup.ts create mode 100644 e2e/setup/desktop-vm.ts create mode 100644 e2e/setup/desktop-windows.globalsetup.ts create mode 100644 e2e/src/journey/codegen.ts create mode 100644 e2e/src/journey/run.ts create mode 100644 e2e/src/journey/steps.ts create mode 100644 e2e/src/vm/desktop.ts diff --git a/apps/desktop/scripts/build-sidecar.ts b/apps/desktop/scripts/build-sidecar.ts index cf248e910..74b56f9e8 100644 --- a/apps/desktop/scripts/build-sidecar.ts +++ b/apps/desktop/scripts/build-sidecar.ts @@ -45,7 +45,11 @@ await rm(EXECUTOR_OUT_DIR, { recursive: true, force: true }); await mkdir(EXECUTOR_OUT_DIR, { recursive: true }); await cp(sourceBinDir, EXECUTOR_OUT_DIR, { recursive: true }); -if (process.platform !== "win32") { +// 
Restore the unix executable bit — keyed on the TARGET, not the host. A +// windows-target cross-build (BUN_TARGET=bun-windows-x64 on macOS/linux) stages +// `executor.exe`, which needs no bit; chmod'ing a non-existent `executor` there +// would ENOENT. +if (!targetPackage.includes("windows")) { await chmod(join(EXECUTOR_OUT_DIR, "executor"), 0o755); } diff --git a/e2e/AGENTS.md b/e2e/AGENTS.md index a77694871..78964b0bd 100644 --- a/e2e/AGENTS.md +++ b/e2e/AGENTS.md @@ -130,6 +130,80 @@ When handing results to the user, follow the evidence contract in the root [AGENTS.md](../AGENTS.md) (direct run links + a live instance + what to try); [RUNNING.md](../RUNNING.md) has the current sharing/demo mechanics. +## Authoring from a live browser (`browse` → `promote`) + +You don't have to hand-write a browser scenario. Drive a running instance's web +UI one step at a time, then turn the recorded journey into a committed scenario. +The generated test drives the same Browser surface the exploration drove, so it +is the real test, not a transcript of one — develop the flow, then crystallize +it. + +```sh +cd e2e +bun run cli up cloud # a live instance to develop against +bun run cli browse cloud goto / # each step REPLAYS the whole flow from a +bun run cli browse cloud click link Policies # clean browser and prints the page's controls +bun run cli browse cloud at-url /policies # (role · name) + a screenshot, so the next +bun run cli browse cloud see "No policies yet" # step is written against what's actually there +bun run cli promote cloud "Policies · a fresh workspace has none" +``` + +Each `browse` replays every step so far, so what you are building is, at every +moment, exactly what `promote` emits — a step that doesn't reproduce fails here, +not in CI. Steps: `goto <path>`, `click <role> <name>`, `click-text <text>`, +`fill <field> <value>`, `press <key>`, and the assertions `see <text>` / +`at-url <substring>`. `--label "…"` names a step (it becomes the `step(...)` +group); `browse <target> show | undo | reset` manages the journey. 
+ +`promote` writes `<target>/<slug>.gen.test.ts` and runs it against the live +instance, producing the usual run artifacts (session.mp4, step screenshots, +trace). A journey with no assertion is refused — a scenario must prove +something. From then on the file is an ordinary scenario: edit it, add API/MCP +checks, drop the `.gen` once it's yours. The journey itself lives in +`.dev/<target>.journey.json` (gitignored), not the repo. + +## Desktop targets (the app on real OSes, filmed) + +The packaged desktop app runs as its own targets, each landing in its own +`runs/<target>/` bucket with a video. One shared scenario (`desktop-vm/`) and the +shared driver (`src/vm/desktop.ts`) + setup plumbing (`setup/desktop-vm.ts`); one +project + globalsetup per guest OS. + +- **`desktop-packaged`** — the real electron-builder bundle on THIS machine's + display (the supervised-daemon attach path). Needs a logged-in GUI session. +- **`desktop-macos` / `desktop-linux`** — the same bundle inside a guest VM, + driven over CDP from the host and filmed. The globalsetup boots the guest + (tart), builds + pushes the bundle, brings the app up with + `--remote-debugging-port`, forwards it, and the scenario connects + drives + + records. Provisioned automatically — or attach to a running guest with + `E2E_DESKTOP_VM_IP=<ip>`: + + ```sh + vitest run --project desktop-macos # or desktop-linux + ``` + +The guests run tart `--no-graphics` (no host window, never steals focus) but +still have a usable display: + +- **macOS**: the base image's autologin reaches a real Aqua session + (WindowServer/Dock/Finder). Launch the app INTO it with `sudo launchctl asuser + <uid> …` (a plain SSH spawn lands in a non-GUI session); the unsigned arm64 + bundle is ad-hoc `codesign`'d in the guest; `screencapture` films it. 
+- **linux**: no window server, so the app renders into an `Xvfb` display with a + minimal WM (`openbox` — without it the electron window never maps); the window + maps tiny (10x10) so the globalsetup `xdotool`-resizes it to fill, and ffmpeg + `x11grab` films it. `--no-sandbox` (the chrome-sandbox needs setuid root). + +Base images (`admin`/`admin`): `executor-macos-base` (cirruslabs sequoia, autologin) +and `executor-linux-base` (cirruslabs ubuntu + Xvfb/ffmpeg/openbox/xdotool + +electron runtime libs). The bundle's `executor` binary is cross-compiled for the +guest (`BUN_TARGET`), and electron-builder's `dir` target assembles the unpacked +app on macOS — so both bundles build on this Mac. + +Note: `desktop-packaged`'s `guiAvailable()` probe (`launchctl managername`) reads +"Background" over SSH even when Aqua is up, so it's host-only; the VM targets gate +on a CDP page target instead. + ## Discovering endpoints - The full OpenAPI spec: `curl http://127.0.0.1:<port>/api/openapi.json` diff --git a/e2e/desktop-vm/console-renders.test.ts b/e2e/desktop-vm/console-renders.test.ts new file mode 100644 index 000000000..c24658784 --- /dev/null +++ b/e2e/desktop-vm/console-renders.test.ts @@ -0,0 +1,75 @@ +// The PACKAGED desktop app, on camera, inside a GUI guest — driven over CDP from +// the host. ONE scenario shared by every desktop-<os> project (desktop-macos, +// desktop-linux): the same bundle and CDP driver, proving it renders on a guest +// OS and filming the actual console. The desktop-<os> globalsetup boots the +// guest, launches the app, forwards its --remote-debugging-port (E2E_DESKTOP_CDP_PORT) +// and publishes the guest IP; this scenario connects, drives, and records. The +// run lands in runs/<target>/ (its own per-OS bucket). Without a guest it skips +// honestly, like desktop-packaged without a display. 
+import { writeFileSync } from "node:fs"; +import { join } from "node:path"; + +import { expect, it } from "@effect/vitest"; +import { Effect } from "effect"; + +import { scenario } from "../src/scenario"; +import { RunDir } from "../src/services"; +import { CdpPage, pageWsUrl, recordGuestScreen } from "../src/vm/desktop"; + +const NAME = "Desktop (packaged, in a VM) · the bundle renders its console"; +const cdpPort = process.env.E2E_DESKTOP_CDP_PORT; +const guestIp = process.env.E2E_DESKTOP_VM_IP; +const recSeconds = Number(process.env.E2E_DESKTOP_REC_SECONDS ?? "12"); +const os: "macos" | "linux" | "windows" = + process.env.E2E_TARGET === "desktop-windows" + ? "windows" + : process.env.E2E_TARGET === "desktop-linux" + ? "linux" + : "macos"; + +const run = async (runDir: string) => { + const cdp = await CdpPage.connect(await pageWsUrl(Number(cdpPort))); + try { + await cdp.command("Runtime.enable"); + await cdp.command("Page.enable"); + + // Film the console while we drive it (OS-aware capture lands a playable mp4). + const recording = recordGuestScreen( + guestIp as string, + recSeconds, + join(runDir, "session.mp4"), + os, + ); + + // Reaching the nav proves the packaged bundle booted and connected to its + // daemon on this OS. + await cdp.waitForText("Integrations", 60_000).catch(() => cdp.waitForText("Settings", 60_000)); + writeFileSync(join(runDir, "01-console-rendered.png"), await cdp.screenshot()); + + const body = await cdp.command<{ result?: { value?: string } }>("Runtime.evaluate", { + expression: "document.body.innerText", + returnByValue: true, + }); + expect(body.result?.value ?? "", "the packaged console rendered its nav").toContain( + "Integrations", + ); + + await recording; + } finally { + cdp.close(); + } +}; + +if (!cdpPort || !guestIp) { + it.skip(`${NAME} (needs a desktop guest — set E2E_DESKTOP_VM_IP or run the desktop- project)`, () => {}); +} else { + // Literal name (not NAME) so the run's test.ts review artifact captures it. 
+ scenario( + "Desktop (packaged, in a VM) · the bundle renders its console", + { timeout: 180_000 }, + Effect.gen(function* () { + const runDir = yield* RunDir; + yield* Effect.promise(() => run(runDir)); + }), + ); +} diff --git a/e2e/scripts/cli.ts b/e2e/scripts/cli.ts index 38b13caaa..d619b99f3 100644 --- a/e2e/scripts/cli.ts +++ b/e2e/scripts/cli.ts @@ -7,6 +7,8 @@ // bun scripts/cli.ts api [json] // bun scripts/cli.ts mcp tools | call [json] // bun scripts/cli.ts ledger [workos|autumn] +// bun scripts/cli.ts browse (goto/click/see/…; show|undo|reset) +// bun scripts/cli.ts promote "" // bun scripts/cli.ts logs // bun scripts/cli.ts down // @@ -25,6 +27,9 @@ import { networkInterfaces } from "node:os"; import { join } from "node:path"; import { fileURLToPath } from "node:url"; +import type { JourneyFile } from "../src/journey/codegen"; +import type { Role, Step } from "../src/journey/steps"; + const e2eDir = fileURLToPath(new URL("..", import.meta.url)); const devDir = join(e2eDir, ".dev"); const cliPath = fileURLToPath(import.meta.url); @@ -443,6 +448,308 @@ const ledger = async (targetName: string, service = "workos") => { console.log(JSON.stringify(entries, null, 2)); }; +// --- browser journeys ------------------------------------------------------ +// Drive a live instance's web UI step by step. Each step appends to a journey +// file and REPLAYS the whole flow from a clean browser, so what you build is, at +// every moment, exactly what `promote` emits as a scenario — develop the flow, +// then crystallize it (e2e/AGENTS.md) with no translation gap. The agent can't +// see the screen, so every step returns the page's interactive controls (the +// vocabulary the next step is written against) plus a screenshot for a human. 
+ +const journeyPaths = (target: string) => ({ + file: join(devDir, `${target}.journey.json`), + shotsDir: join(devDir, `${target}.journey`), +}); + +const readJourney = (target: string): JourneyFile => { + try { + return JSON.parse(readFileSync(journeyPaths(target).file, "utf8")) as JourneyFile; + } catch { + return { target, org: true, steps: [] }; + } +}; + +const writeJourney = (journey: JourneyFile) => { + mkdirSync(devDir, { recursive: true }); + writeFileSync(journeyPaths(journey.target).file, JSON.stringify(journey, null, 1)); +}; + +const STEP_ROLES = new Set([ + "link", + "button", + "heading", + "textbox", + "tab", + "menuitem", + "checkbox", +]); + +/** One CLI step verb → a Step. The verbs are deliberately plain English so the + * journey reads like instructions: goto / click / click-text / fill / press / + * see / at-url / run / request. */ +const parseStep = ( + verb: string, + args: ReadonlyArray, + opts: { readonly label?: string; readonly contains?: string }, +): Step => { + const withLabel = opts.label ? { label: opts.label } : {}; + const withContains = opts.contains !== undefined ? { contains: opts.contains } : {}; + switch (verb) { + case "goto": { + const path = args[0]; + if (!path) throw new Error("usage: browse goto (e.g. 
goto /)"); + return { kind: "goto", path, ...withLabel }; + } + case "click": { + const role = args[0]; + const name = args.slice(1).join(" "); + if (!role || !name) { + throw new Error( + "usage: browse click (role: " + [...STEP_ROLES].join("|") + ")", + ); + } + if (!STEP_ROLES.has(role as Role)) throw new Error(`unknown role ${JSON.stringify(role)}`); + return { kind: "clickRole", role: role as Role, name, ...withLabel }; + } + case "click-text": { + const text = args.join(" "); + if (!text) throw new Error("usage: browse click-text "); + return { kind: "clickText", text, ...withLabel }; + } + case "fill": { + const field = args[0]; + const value = args.slice(1).join(" "); + if (!field || args.length < 2) throw new Error("usage: browse fill "); + return { kind: "fill", field, value, ...withLabel }; + } + case "press": { + const key = args[0]; + if (!key) throw new Error("usage: browse press (e.g. press Enter)"); + return { kind: "press", key, ...withLabel }; + } + case "see": { + const text = args.join(" "); + if (!text) + throw new Error("usage: browse see (asserts the text is visible)"); + return { kind: "expectText", text, ...withLabel }; + } + case "at-url": { + const contains = args[0]; + if (!contains) throw new Error("usage: browse at-url "); + return { kind: "expectUrl", contains, ...withLabel }; + } + case "run": { + const command = args.join(" "); + if (!command) { + throw new Error( + 'usage: browse run "" [--contains ] ({base} = the instance URL)', + ); + } + return { kind: "run", command, ...withContains, ...withLabel }; + } + case "request": { + const method = (args[0] ?? 
"").toUpperCase(); + const path = args[1]; + if (!method || !path) { + throw new Error("usage: browse request [--contains ]"); + } + return { kind: "request", method, path, ...withContains, ...withLabel }; + } + default: + throw new Error( + `unknown step ${JSON.stringify(verb)} — goto | click | click-text | fill | press | see | at-url | run | request`, + ); + } +}; + +const printJourney = (journey: JourneyFile) => { + if (journey.steps.length === 0) return console.log(`${journey.target}: empty journey`); + console.log(`${journey.target} journey (${journey.steps.length} steps, org=${journey.org}):`); + journey.steps.forEach((step, index) => { + console.log(` ${String(index + 1).padStart(2)}. ${stepLabelOf(step)}`); + }); +}; + +// A local copy of the label default so printing doesn't pull in the browser +// module (which imports playwright). Kept trivially in sync with steps.ts. +const stepLabelOf = (step: Step): string => { + if (step.label) return step.label; + if (step.kind === "goto") return `goto ${step.path}`; + if (step.kind === "clickRole") return `click ${step.role} ${JSON.stringify(step.name)}`; + if (step.kind === "clickText") return `click-text ${JSON.stringify(step.text)}`; + if (step.kind === "fill") return `fill ${JSON.stringify(step.field)}`; + if (step.kind === "press") return `press ${step.key}`; + if (step.kind === "expectText") return `see ${JSON.stringify(step.text)}`; + if (step.kind === "expectUrl") return `at-url ${JSON.stringify(step.contains)}`; + if (step.kind === "run") { + return `run ${JSON.stringify(step.command)}${step.contains ? ` → ${JSON.stringify(step.contains)}` : ""}`; + } + return `request ${step.method} ${step.path}${step.contains ? 
` → ${JSON.stringify(step.contains)}` : ""}`; +}; + +const browse = async (raw: ReadonlyArray) => { + const target = raw[0]; + if (!target) throw new Error("usage: browse (or show | undo | reset)"); + + // Re-parse tokens here: --label takes a value, which the top-level flag/arg + // split would mangle. + let label: string | undefined; + let contains: string | undefined; + const positional: string[] = []; + const bools = new Set(); + const tokens = raw.slice(1); + for (let i = 0; i < tokens.length; i++) { + const token = tokens[i]!; + if (token === "--label") { + label = tokens[++i]; + continue; + } + if (token.startsWith("--label=")) { + label = token.slice("--label=".length); + continue; + } + if (token === "--contains") { + contains = tokens[++i]; + continue; + } + if (token.startsWith("--contains=")) { + contains = token.slice("--contains=".length); + continue; + } + if (token.startsWith("--")) { + bools.add(token); + continue; + } + positional.push(token); + } + + const verb = positional[0]; + const { file, shotsDir } = journeyPaths(target); + let journey = readJourney(target); + + if (verb === "show") return printJourney(journey); + if (verb === "reset") { + rmSync(file, { force: true }); + rmSync(shotsDir, { recursive: true, force: true }); + return console.log(`${target}: journey reset`); + } + if (!verb) throw new Error("usage: browse (or show | undo | reset)"); + + // The flow runs as ONE minted identity per replay — same as the generated + // test mints one per run, so a stateful journey behaves identically. + const { target: resolved } = await loadTarget(target); + const identity = await runEffect( + resolved.newIdentity(journey.org ? 
undefined : { org: false }), + ); + const { replayJourney } = await import("../src/journey/run"); + + let steps = [...journey.steps]; + if (verb === "undo") { + if (steps.length === 0) throw new Error("nothing to undo"); + steps = steps.slice(0, -1); + } else { + if (steps.length === 0 && bools.has("--no-org")) journey = { ...journey, org: false }; + steps = [...steps, parseStep(verb, positional.slice(1), { label, contains })]; + } + + mkdirSync(shotsDir, { recursive: true }); + const shot = join(shotsDir, `${String(steps.length).padStart(2, "0")}.png`); + const observation = await replayJourney(resolved, identity, steps, { screenshotPath: shot }); + + if (observation.failedStep) { + const failed = observation.failedStep; + const isNew = verb !== "undo" && failed.index === steps.length - 1; + console.log( + `✗ ${isNew ? "this step" : `step ${failed.index + 1} (${stepLabelOf(steps[failed.index]!)})`} failed — journey unchanged.`, + ); + console.log(` ${failed.error.split("\n")[0]}`); + console.log(` screenshot: ${observation.screenshotPath}`); + return; + } + + journey = { ...journey, steps }; + writeJourney(journey); + + console.log(`→ ${observation.url} ${JSON.stringify(observation.title)}`); + console.log(` screenshot: ${observation.screenshotPath}`); + if (observation.lastOutput) { + const lines = observation.lastOutput.split("\n").slice(0, 12); + console.log("output:"); + for (const line of lines) console.log(` ${line}`); + } + printJourney(journey); + if (observation.controls.length > 0) { + console.log("controls on the page (role · name):"); + for (const control of observation.controls.slice(0, 40)) { + console.log(` ${control.role.padEnd(9)} ${control.name}`); + } + if (observation.controls.length > 40) { + console.log(` … ${observation.controls.length - 40} more`); + } + } +}; + +const promote = async (raw: ReadonlyArray) => { + const target = raw[0]; + const positional = raw.slice(1).filter((arg) => !arg.startsWith("--")); + const name = positional[0]; + 
const noRun = raw.includes("--no-run"); + if (!target || !name) { + throw new Error('usage: promote "" [--no-run]'); + } + const journey = readJourney(target); + if (journey.steps.length === 0) { + throw new Error(`no journey for ${target} — build one with \`browse ${target} …\` first`); + } + const { codegenScenario, journeyHasAssertion, journeyHasBrowserStep } = + await import("../src/journey/codegen"); + if (!journeyHasBrowserStep(journey)) { + throw new Error( + "this journey is all terminal/HTTP steps — `promote` generates browser-anchored scenarios. Add a browser step, or write a CLI/API test directly.", + ); + } + if (!journeyHasAssertion(journey)) { + throw new Error( + "this journey has no assertion (a `see`, `at-url`, `request`, or `run --contains` step), so the scenario would prove nothing. Add one, then promote.", + ); + } + + const source = codegenScenario(name, journey); + const slug = name + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + .slice(0, 80); + // Scope the file to the target's own dir (cloud/, selfhost/) so it runs only + // where its UI selectors apply, not on every cross-target host. + const relFile = join(target, `${slug}.gen.test.ts`); + writeFileSync(join(e2eDir, relFile), source); + console.log(`wrote ${relFile} (${journey.steps.length} steps from the ${target} journey)`); + + const urlKey = `E2E_${target.toUpperCase()}_URL`; + const url = readState(target)?.env?.[urlKey]; + if (noRun || !url) { + console.log( + `run it against the live instance:\n ${urlKey}= bunx vitest run --project ${target} ${relFile}`, + ); + return; + } + + console.log(`running it against the live instance (${url}) …`); + const proc = spawn("bunx", ["vitest", "run", "--project", target, relFile], { + cwd: e2eDir, + stdio: "inherit", + env: { ...process.env, [urlKey]: url }, + }); + const code: number = await new Promise((resolve) => proc.on("exit", (c) => resolve(c ?? 
1))); + const slugDir = join("runs", target, slug); + console.log( + code === 0 + ? `\n✓ passed. Artifacts (test.ts, result.json, session.mp4/film.mp4): e2e/${slugDir}\n view: cd e2e && bun run serve → #/${target}/${slug}` + : `\n✗ the generated test failed (exit ${code}). The flow that passed live did not pass as a committed test — inspect ${relFile} and e2e/${slugDir}.`, + ); +}; + // --- lifecycle commands ---------------------------------------------------- const status = () => { @@ -508,9 +815,24 @@ const HELP = `e2e dev CLI — the scenario primitives, interactive (see e2e/AGEN api [json] typed API call as a fresh identity mcp tools | call [json] MCP session call ledger [workos|autumn] the emulator's request ledger (cloud) + browse drive the live web UI, one step at a time; each step + replays the whole flow and prints the page's controls + steps: goto | click | click-text + | fill | press + | see | at-url + | run "" | request + flags: --label "…", --no-org, --contains + ({base} in a run command = the instance URL) + browse show | undo | reset + promote "" turn the recorded journey into a committed scenario + (/.gen.test.ts) and run it (--no-run) logs dump the instance's dev-server log down tear down (kills servers, removes tailscale serves) +A browser journey IS the scenario: develop the flow with \`browse\`, then +\`promote\` it. The generated test drives the same surface, so a reviewer judges +it by reading the test and watching its video. + Instances live in e2e/.dev/.json — a state file marks a DELIBERATE long-lived instance. Use the booted instance for e2e too: E2E_SELFHOST_URL= vitest run --project selfhost `; @@ -534,6 +856,10 @@ const main = async () => { return mcpCall(args[0] ?? "", args[1], args.slice(2)); case "ledger": return ledger(args[0] ?? "", args[1]); + case "browse": + return browse(rest); + case "promote": + return promote(rest); case "logs": return logs(args[0] ?? 
""); case "down": diff --git a/e2e/setup/desktop-linux.globalsetup.ts b/e2e/setup/desktop-linux.globalsetup.ts new file mode 100644 index 000000000..46d535a1c --- /dev/null +++ b/e2e/setup/desktop-linux.globalsetup.ts @@ -0,0 +1,125 @@ +// desktop-linux: bring the PACKAGED app up inside a Linux guest and forward its +// CDP port (the shared attach/forward lives in ./desktop-vm). No window server, +// so the app renders into an Xvfb virtual display; ffmpeg x11grab (in the +// scenario's recorder) films that display. Simpler than macOS: no Aqua, no +// codesign, no launchctl — just background processes with DISPLAY set and +// --no-sandbox (the chrome-sandbox needs setuid root, pointless on a throwaway +// guest). The base image (executor-linux-base) carries Xvfb + ffmpeg + the +// electron runtime libs. +import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { fileURLToPath } from "node:url"; +import { basename, join } from "node:path"; + +import { pushDirAsTar } from "../src/vm/desktop"; +import { tartVm } from "../src/vm/tart"; +import { + attachOrProvision, + CDP_GUEST_PORT, + waitGuestHttp, + waitGuestPageTarget, + type ProvisionedGuest, +} from "./desktop-vm"; + +const DAEMON_PORT = 4789; +const GUEST_DIR = "/home/admin/exe"; +const GUEST_HOME = "/home/admin/exe-home"; +const DISPLAY = ":99"; + +const appDir = fileURLToPath(new URL("../../apps/desktop/", import.meta.url)); +const hostBundle = () => { + // electron-builder names the dir `linux--unpacked` for non-x64. + const dir = join(appDir, "dist", "linux-arm64-unpacked"); + return { + dir, + exe: join(dir, "executor-desktop"), + executor: join(dir, "resources/executor/executor"), + }; +}; + +/** Build the packaged linux-arm64 bundle if it isn't on disk. The `executor` + * binary is cross-compiled here via BUN_TARGET (same as the cli-linux lane); + * electron-builder's `dir` target assembles the unpacked app on macOS without + * Docker. 
*/ +const ensureBundle = (): void => { + if (existsSync(hostBundle().dir)) return; + const run = (cmd: string, args: string[], env: Record = {}) => + execFileSync(cmd, args, { cwd: appDir, stdio: "inherit", env: { ...process.env, ...env } }); + run("bun", ["./scripts/build-sidecar.ts"], { BUN_TARGET: "bun-linux-arm64" }); + run("bunx", ["--bun", "electron-vite", "build"]); + run( + "bunx", + [ + "--bun", + "electron-builder", + "--config", + "electron-builder.e2e.config.ts", + "--linux", + "--arm64", + ], + { CSC_IDENTITY_AUTO_DISCOVERY: "false" }, + ); +}; + +const provisionLinux = async (): Promise => { + ensureBundle(); + const { dir } = hostBundle(); + const vm = await tartVm("linux", "arm64").provision(); + try { + await vm.ssh(`rm -rf ${GUEST_DIR} ${GUEST_HOME}; mkdir -p ${GUEST_HOME}/.executor`); + await pushDirAsTar(vm.host, dir, GUEST_DIR); + + const guestApp = `${GUEST_DIR}/${basename(dir)}`; + const guestExe = `${guestApp}/executor-desktop`; + const guestExecutor = `${guestApp}/resources/executor/executor`; + await vm.ssh(`chmod +x '${guestExe}' '${guestExecutor}' 2>/dev/null || true`); + const env = `HOME=${GUEST_HOME} EXECUTOR_DATA_DIR=${GUEST_HOME}/.executor`; + + // A virtual display + a minimal WM (openbox) — without a window manager the + // electron window doesn't map onto the framebuffer that x11grab records. + await vm.ssh( + `pkill Xvfb 2>/dev/null; pkill openbox 2>/dev/null; ` + + `nohup Xvfb ${DISPLAY} -screen 0 1280x800x24 >/tmp/xvfb.log 2>&1 & sleep 2; ` + + `DISPLAY=${DISPLAY} nohup openbox >/tmp/openbox.log 2>&1 & sleep 1; echo up`, + ); + + // 1) the bundled daemon, supervised — the app attaches to this. 
+ await vm.ssh( + `nohup env ${env} EXECUTOR_SUPERVISED=1 EXECUTOR_AUTH_TOKEN=desktop-linux-e2e EXECUTOR_CLIENT=desktop ` + + `'${guestExecutor}' daemon run --foreground --port ${DAEMON_PORT} --hostname 127.0.0.1 ` + + `>/tmp/executor-daemon.log 2>&1 &`, + ); + if (!(await waitGuestHttp(vm, `http://127.0.0.1:${DAEMON_PORT}/`))) { + throw new Error( + "supervised daemon never came up in the guest (see /tmp/executor-daemon.log)", + ); + } + + // 2) the packaged app on the virtual display, with CDP enabled. + await vm.ssh( + `nohup env ${env} DISPLAY=${DISPLAY} '${guestExe}' --no-sandbox ` + + `--remote-debugging-port=${CDP_GUEST_PORT} --remote-allow-origins='*' ` + + `>/tmp/executor-app.log 2>&1 &`, + ); + if (!(await waitGuestPageTarget(vm, CDP_GUEST_PORT))) { + const log = (await vm.ssh("tail -40 /tmp/executor-app.log 2>/dev/null").catch(() => null)) + ?.stdout; + throw new Error(`the app's CDP page target never appeared:\n${log ?? "(no app log)"}`); + } + + // The electron window maps tiny (10x10) under Xvfb; size it to the screen so + // the x11grab recording captures the full console (CDP screenshots the + // renderer surface regardless, but the film grabs the X framebuffer). 
+ await vm.ssh( + `WID=$(DISPLAY=${DISPLAY} xdotool search --name executor-desktop | head -1); ` + + `[ -n "$WID" ] && DISPLAY=${DISPLAY} xdotool windowsize "$WID" 1280 800 windowmove "$WID" 0 0 || true`, + ); + + return { ip: vm.host, teardown: async () => void (await vm.discard()) }; + } catch (error) { + await vm.discard(); + throw error; + } +}; + +export default (): Promise<(() => Promise) | void> => attachOrProvision(provisionLinux); diff --git a/e2e/setup/desktop-macos.globalsetup.ts b/e2e/setup/desktop-macos.globalsetup.ts new file mode 100644 index 000000000..b26707ca2 --- /dev/null +++ b/e2e/setup/desktop-macos.globalsetup.ts @@ -0,0 +1,108 @@ +// desktop-macos: bring the PACKAGED app up inside a macOS GUI guest and forward +// its CDP port (the shared attach/forward lives in ./desktop-vm). The guest runs +// tart `--no-graphics` (no host window) but the base image's autologin still +// reaches a real Aqua session, so the GUI renders and `screencapture` films it. +// We come up the SAME way desktop-packaged does — start the bundled daemon, then +// launch the app so it ATTACHES (no sidecar spawn → no first-run consent modal). +// The app must be launched INTO the Aqua session (`launchctl asuser`); a plain +// SSH spawn lands in a non-GUI session. 
+import { execFileSync } from "node:child_process"; +import { existsSync } from "node:fs"; +import { fileURLToPath } from "node:url"; +import { join } from "node:path"; + +import { pushDirAsTar } from "../src/vm/desktop"; +import { tartVm } from "../src/vm/tart"; +import { + attachOrProvision, + CDP_GUEST_PORT, + waitGuestHttp, + waitGuestPageTarget, + type ProvisionedGuest, +} from "./desktop-vm"; + +const DAEMON_PORT = 4789; +const GUEST_DIR = "/Users/admin/exe"; +const GUEST_HOME = "/Users/admin/exe-home"; + +const appDir = fileURLToPath(new URL("../../apps/desktop/", import.meta.url)); +const hostBundle = () => { + const app = join(appDir, "dist", "mac-arm64", "Executor.app"); + return { + app, + exe: join(app, "Contents/MacOS/Executor"), + executor: join(app, "Contents/Resources/executor/executor"), + }; +}; + +/** Build the packaged mac bundle if it isn't on disk (slow; reuse an existing + * dist/ while iterating). Mirrors desktop-packaged.globalsetup. */ +const ensureBundle = (): void => { + if (existsSync(hostBundle().app)) return; + const run = (cmd: string, args: string[]) => + execFileSync(cmd, args, { cwd: appDir, stdio: "inherit", env: { ...process.env } }); + run("bun", ["./scripts/build-sidecar.ts"]); + run("bunx", ["--bun", "electron-vite", "build"]); + execFileSync( + "bunx", + ["--bun", "electron-builder", "--config", "electron-builder.e2e.config.ts", "--mac"], + { + cwd: appDir, + stdio: "inherit", + env: { ...process.env, CSC_IDENTITY_AUTO_DISCOVERY: "false" }, + }, + ); +}; + +const provisionMac = async (): Promise => { + ensureBundle(); + const { exe, executor } = hostBundle(); + const vm = await tartVm("macos", "arm64").provision(); + try { + // Push the bundle (tar-stream, robust over the just-booted link) and clear + // the scp quarantine so it can run. 
+ await vm.ssh(`rm -rf ${GUEST_DIR} ${GUEST_HOME} && mkdir -p ${GUEST_HOME}/.executor`); + await pushDirAsTar(vm.host, hostBundle().app, GUEST_DIR); + await vm.ssh(`xattr -dr com.apple.quarantine ${GUEST_DIR} 2>/dev/null || true`); + // The e2e build is unsigned; an arm64 app needs at least an ad-hoc signature + // to execute, and the host build's signature isn't trusted on another Mac. + await vm.ssh( + `codesign --force --deep --sign - ${GUEST_DIR}/Executor.app 2>&1 | tail -2 || true`, + ); + + const guestExe = `${GUEST_DIR}/Executor.app/${exe.split("/Executor.app/")[1]}`; + const guestExecutor = `${GUEST_DIR}/Executor.app/${executor.split("/Executor.app/")[1]}`; + const env = `HOME=${GUEST_HOME} EXECUTOR_DATA_DIR=${GUEST_HOME}/.executor`; + + // 1) the bundled daemon, supervised — the app attaches to this. + await vm.ssh( + `nohup env ${env} EXECUTOR_SUPERVISED=1 EXECUTOR_AUTH_TOKEN=desktop-macos-e2e EXECUTOR_CLIENT=desktop ` + + `'${guestExecutor}' daemon run --foreground --port ${DAEMON_PORT} --hostname 127.0.0.1 ` + + `>/tmp/executor-daemon.log 2>&1 &`, + ); + if (!(await waitGuestHttp(vm, `http://127.0.0.1:${DAEMON_PORT}/`))) { + throw new Error( + "supervised daemon never came up in the guest (see /tmp/executor-daemon.log)", + ); + } + + // 2) the packaged app, launched INTO the Aqua session with CDP enabled. + await vm.ssh( + `U=$(id -u); sudo launchctl asuser $U bash -lc ` + + `'nohup env HOME=${GUEST_HOME} "${guestExe}" --remote-debugging-port=${CDP_GUEST_PORT} --remote-allow-origins="*" ` + + `>/tmp/executor-app.log 2>&1 &'`, + ); + if (!(await waitGuestPageTarget(vm, CDP_GUEST_PORT))) { + const log = (await vm.ssh("tail -40 /tmp/executor-app.log 2>/dev/null").catch(() => null)) + ?.stdout; + throw new Error(`the app's CDP page target never appeared:\n${log ?? 
"(no app log)"}`); + } + + return { ip: vm.host, teardown: async () => void (await vm.discard()) }; + } catch (error) { + await vm.discard(); + throw error; + } +}; + +export default (): Promise<(() => Promise) | void> => attachOrProvision(provisionMac); diff --git a/e2e/setup/desktop-vm.ts b/e2e/setup/desktop-vm.ts new file mode 100644 index 000000000..c15f15864 --- /dev/null +++ b/e2e/setup/desktop-vm.ts @@ -0,0 +1,91 @@ +// Shared plumbing for the desktop- globalsetups. Each OS setup supplies a +// `provision` that boots its guest and brings the packaged app up with +// --remote-debugging-port; this module handles the rest the same everywhere: +// attach to an already-running guest (E2E_DESKTOP_VM_IP) or provision a fresh +// one, then forward the guest's CDP port and publish it for the scenario. +import { guestTunnel } from "../src/vm/desktop"; +import type { VmHandle } from "../src/vm/types"; + +export const CDP_GUEST_PORT = 9222; + +const sleep = (ms: number): Promise => new Promise((r) => setTimeout(r, ms)); + +/** Poll until an HTTP endpoint inside the guest answers (any status — a 401 from + * the bearer-gated daemon still means "up"). HTTP, not lsof: the app may be + * owned by root (launchctl asuser), whose listening socket an unprivileged lsof + * can't see — a loopback HTTP probe works regardless of owner. */ +export const waitGuestHttp = async (vm: VmHandle, url: string, attempts = 60): Promise => { + for (let i = 0; i < attempts; i++) { + const r = await vm.ssh( + `curl -s -o /dev/null -w '%{http_code}' --max-time 5 ${url} 2>/dev/null || echo 000`, + ); + const code = r.stdout.trim().slice(-3); + if (code !== "000" && code !== "") return true; + await sleep(2000); + } + return false; +}; + +/** Poll until CDP advertises a real PAGE target — i.e. the app's window/renderer + * is up, not just the browser endpoint. On a cold guest the page appears a good + * bit after the port opens, so gating on this makes the scenario deterministic. 
*/ +export const waitGuestPageTarget = async ( + vm: VmHandle, + port: number, + attempts = 60, +): Promise => { + for (let i = 0; i < attempts; i++) { + const r = await vm.ssh( + `curl -s --max-time 5 http://127.0.0.1:${port}/json/list 2>/dev/null | grep -c '"type": "page"' || echo 0`, + ); + if (Number(r.stdout.trim() || "0") > 0) return true; + await sleep(2000); + } + return false; +}; + +export interface ProvisionedGuest { + readonly ip: string; + readonly teardown: () => Promise; +} + +/** + * The body every desktop-.globalsetup returns: attach to E2E_DESKTOP_VM_IP + * if set, else provision a fresh guest; then forward the guest's CDP port and + * publish it (+ the guest IP, for filming) for the worker. A provision/forward + * failure never fails the run — the scenario skips honestly, like + * desktop-packaged without a display. + */ +export const attachOrProvision = async ( + provision: () => Promise, +): Promise<(() => Promise) | void> => { + let ip = process.env.E2E_DESKTOP_VM_IP; + let teardownVm: (() => Promise) | undefined; + + if (!ip) { + // oxlint-disable-next-line executor/no-try-catch-or-throw -- boundary: VM/host setup may fail; degrade to a skip + try { + const result = await provision(); + ip = result.ip; + teardownVm = result.teardown; + } catch (error) { + console.warn(`[desktop] provision failed, scenario will skip: ${String(error)}`); + return; + } + } + + // oxlint-disable-next-line executor/no-try-catch-or-throw -- boundary: forwarding may fail; degrade to a skip + try { + const forward = await guestTunnel(ip, CDP_GUEST_PORT); + process.env.E2E_DESKTOP_CDP_PORT = String(forward.localPort); + process.env.E2E_DESKTOP_VM_IP = ip; + return async () => { + forward.close(); + await teardownVm?.(); + }; + } catch (error) { + console.warn(`[desktop] could not forward CDP from ${ip}: ${String(error)}`); + await teardownVm?.(); + return; + } +}; diff --git a/e2e/setup/desktop-windows.globalsetup.ts b/e2e/setup/desktop-windows.globalsetup.ts new file 
mode 100644 index 000000000..814df0871 --- /dev/null +++ b/e2e/setup/desktop-windows.globalsetup.ts @@ -0,0 +1,106 @@ +// desktop-windows: drive the PACKAGED app running in a Windows guest over CDP. +// Windows-in-a-VM works best with dockur (QEMU on a Linux/KVM host): autologin +// gives a real interactive session the app renders into, and QEMU `screendump` +// films the framebuffer directly — sidestepping the session-0 problem that +// defeats SSH-driven screenshots (the prior proof of this path). +// +// Unlike the tart targets this ATTACHES to a long-lived Windows host (the dockur +// guest stays up between runs, like a shared selfhost): it forwards the guest's +// --remote-debugging-port to the host over an SSH jump and publishes it. The +// shared scenario drives; the windows recorder (src/vm/desktop.ts) films via +// screendump. Without a reachable app it skips honestly. All connection details +// come from env (no baked-in host): +// E2E_DESKTOP_WIN_HOST (ssh alias of the docker/KVM host to jump through), +// _SSH_PORT (the guest's mapped OpenSSH port), _KEY, _USER; the recorder also +// reads _CONTAINER and _STORAGE. +import { spawn } from "node:child_process"; +import net from "node:net"; + +const SSH_PORT = process.env.E2E_DESKTOP_WIN_SSH_PORT ?? "2222"; +const KEY = process.env.E2E_DESKTOP_WIN_KEY ?? "/tmp/winkey"; +const USER = process.env.E2E_DESKTOP_WIN_USER ?? "Administrator"; +const CDP_GUEST_PORT = 9222; + +const sleep = (ms: number): Promise => new Promise((r) => setTimeout(r, ms)); + +const freePort = (): Promise => + new Promise((resolve, reject) => { + const srv = net.createServer(); + srv.on("error", reject); + srv.listen(0, "127.0.0.1", () => { + const port = (srv.address() as net.AddressInfo).port; + srv.close(() => resolve(port)); + }); + }); + +interface CdpTarget { + readonly type: string; + readonly webSocketDebuggerUrl?: string; +} + +/** Poll the forwarded port until the app advertises a CDP page target. 
*/ +const pageReady = async (port: number, attempts = 30): Promise => { + for (let i = 0; i < attempts; i++) { + const targets = (await fetch(`http://127.0.0.1:${port}/json/list`) + .then((r) => (r.ok ? r.json() : [])) + .catch(() => [])) as ReadonlyArray; + if (targets.some((t) => t.type === "page" && t.webSocketDebuggerUrl)) return true; + await sleep(2000); + } + return false; +}; + +export default async function setup(): Promise<(() => Promise) | void> { + const host = process.env.E2E_DESKTOP_WIN_HOST; + if (!host) { + console.warn( + "[desktop-windows] E2E_DESKTOP_WIN_HOST not set; scenario will skip. Point it at the ssh " + + "alias of a dockur/KVM Windows host running the packaged app with --remote-debugging-port.", + ); + return; + } + const localPort = await freePort(); + // mac:localPort → (jump host) → guest:9222. -p is the guest's mapped OpenSSH + // port on the host; the final hop into Windows carries the -L forward. + const tunnel = spawn( + "ssh", + [ + "-o", + "StrictHostKeyChecking=no", + "-o", + "UserKnownHostsFile=/dev/null", + "-o", + "ConnectTimeout=12", + "-o", + "ServerAliveInterval=15", + "-J", + host, + "-p", + SSH_PORT, + "-i", + KEY, + "-L", + `${localPort}:127.0.0.1:${CDP_GUEST_PORT}`, + "-N", + `${USER}@127.0.0.1`, + ], + { stdio: "ignore" }, + ); + + if (!(await pageReady(localPort))) { + tunnel.kill(); + console.warn( + `[desktop-windows] no app/CDP reachable on the Windows host (${host}); scenario will skip. ` + + `Bring up the packaged app with --remote-debugging-port=${CDP_GUEST_PORT} in the dockur guest.`, + ); + return; + } + + process.env.E2E_DESKTOP_CDP_PORT = String(localPort); + // Non-empty so the scenario runs; the windows recorder uses E2E_DESKTOP_WIN_*. 
+ process.env.E2E_DESKTOP_VM_IP = host; + + return async () => { + tunnel.kill(); + }; +} diff --git a/e2e/src/journey/codegen.ts b/e2e/src/journey/codegen.ts new file mode 100644 index 000000000..77d540228 --- /dev/null +++ b/e2e/src/journey/codegen.ts @@ -0,0 +1,78 @@ +// Generate a committed scenario from a recorded journey. The output is a normal +// scenario file (the same shape a human writes, see e2e/AGENTS.md): an Effect +// body that yields Target + Browser, mints a fresh identity, and drives one +// browser session of labelled steps. Terminal (`run`) and HTTP (`request`) steps +// run INSIDE that session, interleaved with the UI, so page state is never lost +// between them. It is meant to be read and edited after generation — promotion +// is the START of a scenario's life, not a frozen artifact. +import { codegenStep, isAssertion, isBrowserStep, stepLabel, type Step } from "./steps"; + +export interface JourneyFile { + readonly target: string; + readonly org: boolean; + readonly steps: ReadonlyArray; +} + +const INDENT = " "; + +const stepBlock = (step: Step): string => { + const label = JSON.stringify(stepLabel(step)); + const body = codegenStep(step) + .split("\n") + .map((line) => `${INDENT} ${line}`) + .join("\n"); + return `${INDENT}await step(${label}, async () => {\n${body}\n${INDENT}});`; +}; + +/** A journey with no assertion proves nothing; one with no browser step isn't + * what this tool generates (write a CLI/API test directly). `promote` checks + * both. */ +export const journeyHasAssertion = (journey: JourneyFile): boolean => + journey.steps.some(isAssertion); + +export const journeyHasBrowserStep = (journey: JourneyFile): boolean => + journey.steps.some(isBrowserStep); + +export const codegenScenario = (name: string, journey: JourneyFile): string => { + const identityArg = journey.org ? 
"" : "{ org: false }"; + const body = journey.steps.map(stepBlock).join("\n"); + + const needsExec = journey.steps.some((step) => step.kind === "run"); + const needsExpect = journey.steps.some( + (step) => step.kind === "request" || (step.kind === "run" && step.contains !== undefined), + ); + + const imports: string[] = []; + if (needsExec) { + imports.push(`import { execFile } from "node:child_process";`); + imports.push(`import { promisify } from "node:util";`); + imports.push(""); + } + if (needsExpect) imports.push(`import { expect } from "@effect/vitest";`); + imports.push(`import { Effect } from "effect";`); + imports.push(""); + imports.push(`import { scenario } from "../src/scenario";`); + imports.push(`import { Browser, Target } from "../src/services";`); + + const execHelper = needsExec ? "\nconst execFileAsync = promisify(execFile);\n" : ""; + + return `// Generated from an interactive browser journey: \`bun scripts/cli.ts promote ${journey.target} ""\`. +// This is now an ordinary scenario — edit it freely. It drives the same Browser +// surface the exploration used, so a reviewer can judge the guarantee by +// reading it. Re-run with: E2E_${journey.target.toUpperCase()}_URL= vitest run --project ${journey.target} +${imports.join("\n")} +${execHelper} +scenario( + ${JSON.stringify(name)}, + { timeout: 120_000 }, + Effect.gen(function* () { + const target = yield* Target; + const browser = yield* Browser; + const identity = yield* target.newIdentity(${identityArg}); + yield* browser.session(identity, async ({ page, step }) => { +${body} + }); + }), +); +`; +}; diff --git a/e2e/src/journey/run.ts b/e2e/src/journey/run.ts new file mode 100644 index 000000000..70d8916be --- /dev/null +++ b/e2e/src/journey/run.ts @@ -0,0 +1,104 @@ +// Replay a journey live, from a clean browser, and report what the page looks +// like afterward. 
This is the development loop: each `browse` command appends a +// step and replays the WHOLE journey from scratch, so the flow the agent is +// building is, at every moment, exactly what the generated test will run — a +// step that doesn't reproduce fails here, not later. The returned observation +// (url, title, the page's interactive controls) is how the agent, which can't +// see the screen, decides the next step; the screenshot is for a human. +import { chromium, type Page } from "playwright"; + +import type { Identity, Target } from "../target"; +import { executeStep, type Step } from "./steps"; + +export interface Control { + readonly role: string; + readonly name: string; +} + +export interface Observation { + readonly url: string; + readonly title: string; + /** The interactive elements on the page, as (role, accessible name) — the + * vocabulary the next clickRole/fill step is written against. */ + readonly controls: ReadonlyArray; + readonly screenshotPath: string; + /** Textual output of the last terminal/HTTP step, if any — so the agent sees + * what a `run`/`request` produced, not just the page. */ + readonly lastOutput?: string; + /** Index of the step that threw, with its message — undefined on success. */ + readonly failedStep?: { readonly index: number; readonly error: string }; +} + +const OBSERVED_ROLES = ["link", "button", "textbox", "tab", "menuitem", "checkbox"] as const; +const PER_ROLE_CAP = 30; + +/** A compact, deduped list of the page's interactive controls. Names come from + * the accessible name (text / aria-label / placeholder), trimmed. 
*/ +const snapshotControls = async (page: Page): Promise => { + const seen = new Set(); + const controls: Control[] = []; + for (const role of OBSERVED_ROLES) { + const elements = await page.getByRole(role).all(); + for (const element of elements.slice(0, PER_ROLE_CAP)) { + const raw = + (await element.textContent().catch(() => null))?.trim() || + (await element.getAttribute("aria-label").catch(() => null)) || + (await element.getAttribute("placeholder").catch(() => null)) || + ""; + const name = raw.replace(/\s+/g, " ").trim().slice(0, 70); + if (!name) continue; + const key = `${role}:${name}`; + if (seen.has(key)) continue; + seen.add(key); + controls.push({ role, name }); + } + } + return controls; +}; + +export const replayJourney = async ( + target: Target, + identity: Identity, + steps: ReadonlyArray, + options: { readonly screenshotPath: string }, +): Promise => { + const browser = await chromium.launch(); + let failedStep: Observation["failedStep"]; + try { + const context = await browser.newContext({ + colorScheme: "dark", + viewport: { width: 1280, height: 800 }, + baseURL: target.baseUrl, + }); + // Same identity injection the Browser surface does, so the live page is the + // logged-in page the generated test will drive. + if (identity.cookies?.length) { + await context.addCookies( + identity.cookies.map((cookie) => ({ ...cookie, url: target.baseUrl })), + ); + } + const page = await context.newPage(); + let lastOutput: string | undefined; + for (let index = 0; index < steps.length; index++) { + try { + const output = await executeStep({ page, baseUrl: target.baseUrl }, steps[index]!); + if (output !== undefined) lastOutput = output; + } catch (error) { + failedStep = { index, error: error instanceof Error ? 
error.message : String(error) }; + break; + } + } + await page.waitForLoadState("networkidle").catch(() => {}); + await page.screenshot({ path: options.screenshotPath }).catch(() => {}); + return { + url: page.url(), + title: await page.title().catch(() => ""), + controls: await snapshotControls(page).catch(() => []), + screenshotPath: options.screenshotPath, + lastOutput, + failedStep, + }; + } finally { + await browser.close(); + } +}; diff --git a/e2e/src/journey/steps.ts b/e2e/src/journey/steps.ts new file mode 100644 index 000000000..22eb4413d --- /dev/null +++ b/e2e/src/journey/steps.ts @@ -0,0 +1,236 @@ +// The journey Step DSL: the single source of truth shared by interactive +// exploration and the generated scenario. ONE step description is both +// (a) executed live against the real product while the agent develops a flow +// (`executeStep`), and (b) emitted as the matching Playwright line inside a +// committed scenario (`codegenStep`). Because both sides read the same record, +// "turn what I just did into a test" is a translation, not a reimplementation: +// the generated test drives the exact surface the exploration drove. +// +// Steps are plain JSON (they persist to .dev/.journey.json between CLI +// invocations), so they carry no closures — every action is a named primitive. +import { execFile } from "node:child_process"; +import { promisify } from "node:util"; + +import type { Page } from "playwright"; + +const execFileAsync = promisify(execFile); + +/** ARIA roles the journey can target. A closed set keeps codegen honest and + * the live `getByRole` calls type-safe (Playwright's role arg is a union). 
/**/
export type Role = "link" | "button" | "heading" | "textbox" | "tab" | "menuitem" | "checkbox";

/** One recorded journey action. Plain JSON (no closures), discriminated on
 * `kind`; every variant may carry an optional human-readable `label`. */
export type Step =
  | { readonly kind: "goto"; readonly path: string; readonly label?: string }
  | {
      readonly kind: "clickRole";
      readonly role: Role;
      readonly name: string;
      readonly label?: string;
    }
  | { readonly kind: "clickText"; readonly text: string; readonly label?: string }
  | {
      readonly kind: "fill";
      readonly field: string;
      readonly value: string;
      readonly label?: string;
    }
  | { readonly kind: "press"; readonly key: string; readonly label?: string }
  | { readonly kind: "expectText"; readonly text: string; readonly label?: string }
  | { readonly kind: "expectUrl"; readonly contains: string; readonly label?: string }
  // A terminal command. `{base}` expands to the target's base URL, so a journey
  // can hit the same instance the UI is driving (curl, npx add-mcp, executor …).
  // `contains` (when set) asserts on the combined stdout+stderr.
  | {
      readonly kind: "run";
      readonly command: string;
      readonly contains?: string;
      readonly label?: string;
    }
  // An HTTP call through the page's own authenticated session (relative paths
  // resolve against the base URL). `contains` asserts on the response body;
  // without it, the assertion is a 2xx.
  | {
      readonly kind: "request";
      readonly method: string;
      readonly path: string;
      readonly contains?: string;
      readonly label?: string;
    };

/** Assertions are the steps a reviewer reads as the guarantee — a journey with
 * none asserts nothing, so `promote` refuses it. A `run`/`request` is an
 * assertion when it carries an expectation (`contains`, or `request`'s 2xx). */
export const isAssertion = (step: Step): boolean =>
  step.kind === "expectText" ||
  step.kind === "expectUrl" ||
  step.kind === "request" ||
  (step.kind === "run" && step.contains !== undefined);

/** True for every step kind except the terminal (`run`) and HTTP (`request`) ones. */
export const isBrowserStep = (step: Step): boolean =>
  step.kind !== "run" && step.kind !== "request";

/** The human-readable step name (the `step(label, …)` group + screenshot
 * caption). The agent can override per step; this is the sensible default so a
 * generated test reads as a journey even when labels were left implicit. */
export const stepLabel = (step: Step): string => {
  if (step.label) return step.label;
  switch (step.kind) {
    case "goto":
      return `Open ${step.path}`;
    case "clickRole":
      return `Click the ${JSON.stringify(step.name)} ${step.role}`;
    case "clickText":
      return `Click ${JSON.stringify(step.text)}`;
    case "fill":
      return `Fill ${JSON.stringify(step.field)}`;
    case "press":
      return `Press ${step.key}`;
    case "expectText":
      return `See ${JSON.stringify(step.text)}`;
    case "expectUrl":
      return `Land on a URL containing ${JSON.stringify(step.contains)}`;
    case "run":
      return step.contains
        ? `Run ${JSON.stringify(step.command)} and see ${JSON.stringify(step.contains)}`
        : `Run ${JSON.stringify(step.command)}`;
    case "request":
      return step.contains
        ? `${step.method} ${step.path} returns ${JSON.stringify(step.contains)}`
        : `${step.method} ${step.path} succeeds`;
  }
};

/** Timeout (ms) for the live `expectText` / `expectUrl` waits in `executeStep`. */
const ASSERT_TIMEOUT = 15_000;

export interface StepContext {
  readonly page: Page;
  /** The target's base URL — `{base}` in a `run` command expands to this. */
  readonly baseUrl: string;
}

/** Expand `{base}` so a terminal command can reach the instance under test.
 * split/join inserts `baseUrl` literally; a string passed as the replacement
 * to `replaceAll` would have `$&`, `$$`, … expanded as replacement patterns. */
const withBase = (command: string, baseUrl: string): string =>
  command.split("{base}").join(baseUrl);

/** Drive one step against a live page. Assertions throw on failure (a missing
 * text, a wrong URL, a non-matching command output) so it surfaces immediately
 * while exploring, the same way it would fail the generated test. Returns any
 * textual output (terminal / HTTP) so the caller can show it; browser steps
 * return `undefined`. */
export const executeStep = async (ctx: StepContext, step: Step): Promise<string | undefined> => {
  const { page } = ctx;
  switch (step.kind) {
    case "goto":
      await page.goto(step.path, { waitUntil: "networkidle" });
      return;
    case "clickRole":
      await page.getByRole(step.role, { name: step.name }).first().click();
      return;
    case "clickText":
      await page.getByText(step.text).first().click();
      return;
    case "fill":
      await page.getByLabel(step.field).first().fill(step.value);
      return;
    case "press":
      await page.keyboard.press(step.key);
      return;
    case "expectText":
      await page
        .getByText(step.text)
        .first()
        .waitFor({ state: "visible", timeout: ASSERT_TIMEOUT });
      return;
    case "expectUrl":
      await page.waitForURL((url) => url.toString().includes(step.contains), {
        timeout: ASSERT_TIMEOUT,
      });
      return;
    case "run": {
      // A rejected exec (e.g. non-zero exit) is not a failure by itself: keep
      // whatever output the error carries so `contains` can still be checked.
      const result = await execFileAsync("sh", ["-c", withBase(step.command, ctx.baseUrl)]).catch(
        (error: { stdout?: string; stderr?: string }) => ({
          stdout: error.stdout ?? "",
          stderr: error.stderr ?? String(error),
        }),
      );
      const output = `${result.stdout}${result.stderr}`;
      if (step.contains !== undefined && !output.includes(step.contains)) {
        throw new Error(
          `\`run\` output did not contain ${JSON.stringify(step.contains)}\n${output.slice(0, 1000)}`,
        );
      }
      return output.trim().slice(0, 2000);
    }
    case "request": {
      const response = await page.request.fetch(step.path, { method: step.method });
      const body = await response.text();
      if (step.contains !== undefined) {
        if (!body.includes(step.contains)) {
          throw new Error(
            `${step.method} ${step.path} body did not contain ${JSON.stringify(step.contains)} (status ${response.status()})`,
          );
        }
      } else if (!response.ok()) {
        throw new Error(`${step.method} ${step.path} returned ${response.status()}`);
      }
      return `${response.status()} ${body.slice(0, 800)}`;
    }
  }
};

/** The Playwright line(s) for this step, as they appear inside the generated
 * scenario's `step(label, async () => { … })` body. Uses the same locator and
 * call as `executeStep`. A `run` step with `contains` asserts on the combined
 * stdout+stderr and keeps the output of a rejected exec, exactly as the live
 * step does. Remaining differences: the generated waits use Playwright's
 * default timeout rather than ASSERT_TIMEOUT, and a `run` step WITHOUT
 * `contains` lets a rejected exec fail the generated step. */
export const codegenStep = (step: Step): string => {
  const s = (value: string): string => JSON.stringify(value);
  switch (step.kind) {
    case "goto":
      return `await page.goto(${s(step.path)}, { waitUntil: "networkidle" });`;
    case "clickRole":
      return `await page.getByRole(${s(step.role)}, { name: ${s(step.name)} }).first().click();`;
    case "clickText":
      return `await page.getByText(${s(step.text)}).first().click();`;
    case "fill":
      return `await page.getByLabel(${s(step.field)}).first().fill(${s(step.value)});`;
    case "press":
      return `await page.keyboard.press(${s(step.key)});`;
    case "expectText":
      // The repo's browser-assertion idiom: waiting for the element IS the
      // assertion (a timeout fails the step with the locator in the message).
      return `await page.getByText(${s(step.text)}).first().waitFor();`;
    case "expectUrl":
      return `await page.waitForURL((url) => url.toString().includes(${s(step.contains)}));`;
    case "run": {
      const invocation = `execFileAsync("sh", ["-c", ${backtick(step.command)}])`;
      if (step.contains === undefined) {
        return `const { stdout } = await ${invocation};`;
      }
      return [
        `const { stdout, stderr } = await ${invocation}.catch(`,
        `  (error: { stdout?: string; stderr?: string }) => ({`,
        `    stdout: error.stdout ?? "",`,
        `    stderr: error.stderr ?? String(error),`,
        `  }),`,
        `);`,
        // Double-quoted on purpose: `${stdout}${stderr}` must reach the
        // generated file as template-literal source, not be interpolated here.
        "expect(`${stdout}${stderr}`, \"the command output is as expected\").toContain(" +
          s(step.contains) +
          ");",
      ].join("\n");
    }
    case "request": {
      const lines = [
        `const response = await page.request.fetch(${s(step.path)}, { method: ${s(step.method)} });`,
      ];
      lines.push(
        step.contains !== undefined
          ? `expect(await response.text(), "the response is as expected").toContain(${s(step.contains)});`
          : `expect(response.ok(), "the request succeeded").toBe(true);`,
      );
      return lines.join("\n");
    }
  }
};

/** A terminal command as a template literal so `{base}` becomes `target.baseUrl`
 * (which is in scope in the generated body). Backslashes, backticks and `${`
 * are escaped so the rest of the command stays literal text: without the `${`
 * escape a shell expansion such as `${HOME}` would turn into a JavaScript
 * interpolation in the generated test. */
const backtick = (command: string): string => {
  // Backslashes first, so the escapes added afterwards are not doubled.
  const escapeLiteral = (text: string): string =>
    text.split("\\").join("\\\\").split("`").join("\\`").split("${").join("\\${");
  return "`" + command.split("{base}").map(escapeLiteral).join("${target.baseUrl}") + "`";
};

// diff --git a/e2e/src/vm/desktop.ts b/e2e/src/vm/desktop.ts
// new file mode 100644
// index 000000000..e04866b0a
// --- /dev/null
// +++ b/e2e/src/vm/desktop.ts
// @@ -0,0 +1,327 @@
// Driving the PACKAGED desktop app inside a GUI guest, from the host. This is
// the shared substrate for the cross-OS desktop targets (Gap A): SSH plumbing,
// an SSH local-forward, a minimal CDP page client, and screen recording — the
// pieces proven against a tart macOS guest. The desktop-<os> globalsetup boots
// the guest and launches the app; a scenario connects over CDP and records.
//
// Why these mechanics (macOS): a tart `--no-graphics` guest opens no host window
// (no focus stealing) yet, with the base image's autologin, still reaches a real
// Aqua session (WindowServer/Dock/Finder) the app can render into. A GUI app must
// be launched INTO that session (`sudo launchctl asuser <uid> …`); a plain SSH
// spawn lands in a non-GUI session. The app's --remote-debugging-port is then
// reachable over an SSH forward, and `screencapture` films the console.
import { execFile, spawn } from "node:child_process";
import net from "node:net";
import { basename, dirname } from "node:path";
import { promisify } from "node:util";

const execFileP = promisify(execFile);

// Connection settings, each overridable from the environment.
const SSHPASS = process.env.E2E_SSHPASS_BIN ?? "/opt/homebrew/bin/sshpass";
const GUEST_PASS = process.env.E2E_DESKTOP_VM_PASS ?? "admin";
const GUEST_USER = process.env.E2E_DESKTOP_VM_USER ?? "admin";
const SSH_OPTS = [
  "-o",
  "StrictHostKeyChecking=no",
  "-o",
  "UserKnownHostsFile=/dev/null",
  "-o",
  "ConnectTimeout=8",
  "-o",
  "LogLevel=ERROR",
  // Password auth only (sshpass): a loaded SSH agent's keys would otherwise
  // exhaust the guest's MaxAuthTries before the password is tried.
  "-o",
  "PubkeyAuthentication=no",
  "-o",
  "IdentitiesOnly=yes",
];

/** Resolve after `ms` milliseconds. */
export const sleep = (ms: number): Promise<void> =>
  new Promise((resolve) => setTimeout(resolve, ms));

/** Run `command` in the guest at `ip` over password-authenticated ssh and
 * resolve with its stdout/stderr; rejects when the exec fails. */
export const guestSsh = (
  ip: string,
  command: string,
): Promise<{ stdout: string; stderr: string }> =>
  execFileP(SSHPASS, ["-p", GUEST_PASS, "ssh", ...SSH_OPTS, `${GUEST_USER}@${ip}`, command], {
    maxBuffer: 64 * 1024 * 1024,
  });

/** Copy `remote` from the guest at `ip` to the host path `local` with scp. */
export const guestScpFrom = (ip: string, remote: string, local: string): Promise<unknown> =>
  execFileP(SSHPASS, [
    "-p",
    GUEST_PASS,
    "scp",
    ...SSH_OPTS,
    `${GUEST_USER}@${ip}:${remote}`,
    local,
  ]);

/** Quote `value` as a single POSIX-shell word: wrap it in single quotes and
 * spell each embedded `'` as `'\''`, so the shell expands nothing inside it. */
const shellQuote = (value: string): string => `'${value.split("'").join("'\\''")}'`;

/** The `tar … | sshpass … ssh … '<unpack>'` pipeline that pushDirAsTar hands to
 * `sh -c`. Every word is quoted for the LOCAL shell; the unpack command is then
 * parsed by the guest's shell, where `remoteParent` stays unquoted as before. */
const tarPushPipeline = (ip: string, localDir: string, remoteParent: string): string => {
  const unpack = `mkdir -p ${remoteParent} && tar xf - -C ${remoteParent}`;
  const pack = ["tar", "cf", "-", "-C", dirname(localDir), basename(localDir)];
  const send = [SSHPASS, "-p", GUEST_PASS, "ssh", ...SSH_OPTS, `${GUEST_USER}@${ip}`, unpack];
  return `${pack.map(shellQuote).join(" ")} | ${send.map(shellQuote).join(" ")}`;
};

/**
 * Push a directory into the guest by streaming a tar over ssh: one connection,
 * no per-file round-trips, and the flowing data keeps the link alive — far more
 * robust than `scp -r` of a big app bundle (thousands of files + symlinks),
 * which drops mid-transfer on a freshly-booted guest. Retries once. The dir
 * lands at `${remoteParent}/${basename(localDir)}`.
 *
 * The pipeline runs through `sh -c`, so every word is single-quoted: a `$`,
 * backtick or space in the local path, the sshpass path or the password is
 * passed through literally instead of being expanded or split by the host shell.
 */
export const pushDirAsTar = async (
  ip: string,
  localDir: string,
  remoteParent: string,
): Promise<void> => {
  const pipeline = tarPushPipeline(ip, localDir, remoteParent);
  // oxlint-disable-next-line executor/no-try-catch-or-throw -- boundary: one retry over a flaky just-booted guest link
  try {
    await execFileP("sh", ["-c", pipeline], { maxBuffer: 16 * 1024 * 1024 });
  } catch {
    await sleep(3000);
    await execFileP("sh", ["-c", pipeline], { maxBuffer: 16 * 1024 * 1024 });
  }
};

/** Ask the OS for a currently-free loopback TCP port: bind port 0, read the
 * assigned port, close the listener, and resolve with the port number. */
const freePort = (): Promise<number> =>
  new Promise((resolve, reject) => {
    const srv = net.createServer();
    srv.on("error", reject);
    srv.listen(0, "127.0.0.1", () => {
      const port = (srv.address() as net.AddressInfo).port;
      srv.close(() => resolve(port));
    });
  });

/** A running local port-forward: the host-side port plus a way to stop it. */
export interface Forward {
  readonly localPort: number;
  close(): void;
}

/** SSH local-forward host:localPort → guest:guestPort; resolves once it binds.
*/ +export const guestTunnel = async (ip: string, guestPort: number): Promise => { + const localPort = await freePort(); + const child = spawn( + SSHPASS, + [ + "-p", + GUEST_PASS, + "ssh", + ...SSH_OPTS, + "-N", + "-L", + `${localPort}:127.0.0.1:${guestPort}`, + `${GUEST_USER}@${ip}`, + ], + { stdio: "ignore" }, + ); + for (let i = 0; i < 40; i++) { + const ok = await new Promise((resolve) => { + const sock = net.connect({ host: "127.0.0.1", port: localPort }, () => { + sock.destroy(); + resolve(true); + }); + sock.on("error", () => resolve(false)); + sock.setTimeout(1000, () => { + sock.destroy(); + resolve(false); + }); + }); + if (ok) break; + await sleep(500); + } + return { localPort, close: () => child.kill() }; +}; + +const guestFileSize = (ip: string, remote: string): Promise => + guestSsh(ip, `stat -f%z ${remote} 2>/dev/null || stat -c%s ${remote} 2>/dev/null || echo 0`) + .then((r) => Number(r.stdout.trim() || "0")) + .catch(() => 0); + +/** + * Film the guest's screen for `seconds` and land it on the host as `localMp4` + * (mp4, plays everywhere). OS-aware capture: + * • macOS — `screencapture -V` to a .mov, then host-side ffmpeg to mp4. The + * first capture after a cold display can silently no-op, so warm it with a + * throwaway still and verify+retry. + * • linux — ffmpeg `x11grab` of the Xvfb display straight to mp4. + * Best-effort: failures never throw — "every run is watchable" wants the video, + * but a missing one shouldn't fail the run. Run it concurrently with the drive. + */ +export const recordGuestScreen = async ( + ip: string, + seconds: number, + localMp4: string, + os: "macos" | "linux" | "windows", +): Promise => { + if (os === "windows") { + // Windows can't screenshot the interactive desktop from an SSH session, so + // we film the VM framebuffer directly via QEMU's `screendump` (the dockur + // host runs the loop + ffmpeg; we pull the mp4). 
Host/container/storage come + // from env (no baked-in host); best-effort, so skip filming if unconfigured. + const host = process.env.E2E_DESKTOP_WIN_HOST; + const storage = process.env.E2E_DESKTOP_WIN_STORAGE; + if (!host || !storage) return; + const container = process.env.E2E_DESKTOP_WIN_CONTAINER ?? "exec-win"; + const frames = Math.max(8, seconds * 4); + const py = `import socket,time +s=socket.socket(socket.AF_UNIX); s.connect("/run/shm/monitor.sock"); time.sleep(0.2); s.recv(65536) +for i in range(${frames}): + s.sendall(("screendump /storage/frames/f%03d.ppm\\n"%i).encode()); time.sleep(0.2) + try: s.recv(65536) + except Exception: pass`; + const b64 = Buffer.from(py).toString("base64"); + const remote = + `S=${storage}; rm -rf "$S/frames"; mkdir -p "$S/frames"; ` + + `docker exec ${container} python3 -c "import base64;exec(base64.b64decode('${b64}'))"; ` + + `ffmpeg -y -framerate 4 -i "$S/frames/f%03d.ppm" -pix_fmt yuv420p -movflags +faststart "$S/win.mp4" >/dev/null 2>&1`; + await execFileP("ssh", ["-o", "ConnectTimeout=10", host, remote], { + maxBuffer: 16 * 1024 * 1024, + }).catch(() => undefined); + await execFileP("scp", [ + "-o", + "ConnectTimeout=10", + `${host}:${storage}/win.mp4`, + localMp4, + ]).catch(() => undefined); + return; + } + + if (os === "linux") { + const remote = "/tmp/executor-desktop-vm.mp4"; + await guestSsh( + ip, + `rm -f ${remote}; DISPLAY=:99 ffmpeg -y -f x11grab -video_size 1280x800 -framerate 15 ` + + `-i :99 -t ${seconds} -pix_fmt yuv420p ${remote} >/tmp/ffmpeg.log 2>&1`, + ).catch(() => undefined); + // The mostly-flat console compresses small under x264 — a real capture is + // ~30-60KB, a blank/failed one only a few KB. + if ((await guestFileSize(ip, remote)) > 12_000) { + await guestScpFrom(ip, remote, localMp4).catch(() => undefined); + } + return; + } + + const remoteMov = "/tmp/executor-desktop-vm.mov"; + // Warm the capture subsystem — the first screencapture after the display comes + // up can produce nothing. 
+ await guestSsh(ip, "screencapture -x /tmp/.warm.png 2>/dev/null; rm -f /tmp/.warm.png").catch( + () => undefined, + ); + for (let attempt = 0; attempt < 2; attempt++) { + await guestSsh(ip, `rm -f ${remoteMov}; screencapture -V ${seconds} -x ${remoteMov}`).catch( + () => undefined, + ); + if ((await guestFileSize(ip, remoteMov)) > 100_000) { + const localMov = `${localMp4}.mov`; + await guestScpFrom(ip, remoteMov, localMov).catch(() => undefined); + await execFileP("ffmpeg", [ + "-y", + "-i", + localMov, + "-c:v", + "libx264", + "-pix_fmt", + "yuv420p", + "-movflags", + "+faststart", + localMp4, + ]) + .then(() => execFileP("rm", ["-f", localMov])) + .catch(() => undefined); + return; + } + } +}; + +// --- a minimal CDP page client (same protocol as desktop-packaged's driver) -- + +interface CdpTarget { + readonly type: string; + readonly webSocketDebuggerUrl?: string; +} + +export class CdpPage { + private nextId = 1; + private readonly pending = new Map void>(); + + private constructor(private readonly socket: WebSocket) { + socket.addEventListener("message", (event) => { + if (typeof event.data !== "string") return; + const message = JSON.parse(event.data) as { id?: number; result?: unknown }; + if (message.id && this.pending.has(message.id)) { + this.pending.get(message.id)!(message.result); + this.pending.delete(message.id); + } + }); + } + + static connect = (url: string): Promise => + new Promise((resolve, reject) => { + const socket = new WebSocket(url); + const timer = setTimeout( + // oxlint-disable-next-line executor/no-promise-reject, executor/no-error-constructor -- boundary: WebSocket connection promise adapter + () => reject(new Error(`CDP connect timeout: ${url}`)), + 30_000, + ); + socket.addEventListener("open", () => { + clearTimeout(timer); + resolve(new CdpPage(socket)); + }); + socket.addEventListener("error", () => { + clearTimeout(timer); + // oxlint-disable-next-line executor/no-promise-reject, executor/no-error-constructor -- boundary: 
WebSocket connection promise adapter + reject(new Error(`CDP connect failed: ${url}`)); + }); + }); + + command = (method: string, params: Record = {}): Promise => { + const id = this.nextId++; + const result = new Promise((resolve) => + this.pending.set(id, (value) => resolve(value as T)), + ); + this.socket.send(JSON.stringify({ id, method, params })); + return result; + }; + + waitForText = async (text: string, timeoutMs: number): Promise => { + const deadline = Date.now() + timeoutMs; + const expression = `document.body?.innerText.includes(${JSON.stringify(text)}) ?? false`; + for (;;) { + const r = await this.command<{ result?: { value?: boolean } }>("Runtime.evaluate", { + expression, + returnByValue: true, + }); + if (r.result?.value) return; + // oxlint-disable-next-line executor/no-error-constructor -- boundary: a wait timeout is a plain failure here + if (Date.now() >= deadline) throw new Error(`timed out waiting for text: ${text}`); + await sleep(250); + } + }; + + screenshot = async (): Promise => { + const r = await this.command<{ data: string }>("Page.captureScreenshot", { format: "png" }); + return Buffer.from(r.data, "base64"); + }; + + close = (): void => this.socket.close(); +} + +/** The first drivable page target's WebSocket URL, fetched through the forward + * (so the returned ws URL already points at the local port). */ +export const pageWsUrl = async (localPort: number): Promise => { + const deadline = Date.now() + 60_000; + for (;;) { + const targets = (await fetch(`http://127.0.0.1:${localPort}/json/list`) + .then((r) => (r.ok ? 
r.json() : [])) + .catch(() => [])) as ReadonlyArray; + const page = targets.find((t) => t.type === "page" && t.webSocketDebuggerUrl); + if (page?.webSocketDebuggerUrl) return page.webSocketDebuggerUrl; + // oxlint-disable-next-line executor/no-error-constructor -- boundary: setup failure surfaced to the caller + if (Date.now() >= deadline) + throw new Error("no CDP page target (app not running with --remote-debugging-port?)"); + await sleep(500); + } +}; diff --git a/e2e/src/vm/tart.ts b/e2e/src/vm/tart.ts index 5ca696420..6b496dca5 100644 --- a/e2e/src/vm/tart.ts +++ b/e2e/src/vm/tart.ts @@ -30,6 +30,14 @@ const SSH_OPTS = [ "ServerAliveInterval=5", "-o", "LogLevel=ERROR", + // We authenticate with sshpass (password). A loaded SSH agent would otherwise + // offer its keys first and exhaust the guest's MaxAuthTries ("Too many + // authentication failures") before the password is tried — intermittently, + // depending on how many keys the agent holds. Force password-only. + "-o", + "PubkeyAuthentication=no", + "-o", + "IdentitiesOnly=yes", ]; const GUEST_USER = "admin"; const GUEST_PASS = "admin"; @@ -92,7 +100,15 @@ export const tartVm = (os: "macos" | "linux", arch: VmArch = "arm64"): VmProvide provision: async () => { const name = `executor-e2e-${os}-${process.pid}-${Math.floor(performance.now())}`; await execFileP(TART, ["clone", baseImage(os), name]); - const runProc = spawn(TART, ["run", name, "--no-graphics"], { stdio: "ignore" }); + // `--no-graphics` opens NO host window (never steals focus) yet the guest + // still has a virtual display: with the base image's autologin it reaches a + // real Aqua session (WindowServer/Dock/Finder), so even the packaged GUI app + // renders and `screencapture` records it. No windowed/VNC mode is needed. 
+ const runProc = spawn(TART, ["run", name, "--no-graphics"], { + stdio: "ignore", + detached: true, + }); + runProc.unref(); const tunnelClosers: Array<() => void> = []; let ip = ""; diff --git a/e2e/targets/desktop.ts b/e2e/targets/desktop.ts index 722ca08ac..5cded8a1a 100644 --- a/e2e/targets/desktop.ts +++ b/e2e/targets/desktop.ts @@ -9,7 +9,10 @@ import { Effect } from "effect"; import type { Target } from "../src/target"; export const desktopTarget = (): Target => ({ - name: "desktop", + // The project name (desktop / desktop-packaged / desktop-macos) so each lands + // in its own runs// bucket and viewer column — they're the same app + // in different harnesses (dev electron / packaged / packaged-in-a-VM). + name: process.env.E2E_TARGET ?? "desktop", baseUrl: "", mcpUrl: "", capabilities: new Set(), diff --git a/e2e/targets/registry.ts b/e2e/targets/registry.ts index 94e966746..c3d6aa351 100644 --- a/e2e/targets/registry.ts +++ b/e2e/targets/registry.ts @@ -19,6 +19,12 @@ const factories: Record Target> = { // The packaged desktop bundle launches its own app per scenario, same as // `desktop` — no standard surfaces to carry. See desktop-packaged.globalsetup. "desktop-packaged": desktopTarget, + // The packaged bundle inside a GUI guest (one per OS), driven over CDP from + // the host. Carries no surfaces (the scenario drives CDP itself). See + // desktop-.globalsetup. + "desktop-macos": desktopTarget, + "desktop-linux": desktopTarget, + "desktop-windows": desktopTarget, local: localTarget, // The supervised CLI daemon inside a VM, one project per guest OS — restart() // is a real reboot. See setup/cli.globalsetup.ts. 
diff --git a/e2e/vitest.config.ts b/e2e/vitest.config.ts index 74c45288f..408716954 100644 --- a/e2e/vitest.config.ts +++ b/e2e/vitest.config.ts @@ -77,6 +77,26 @@ export default defineConfig({ testTimeout: 360_000, hookTimeout: 600_000, }), + // The packaged desktop app inside a GUI guest, driven over CDP from the + // host and filmed (the cross-OS counterpart of desktop-packaged) — one + // shared scenario (desktop-vm/), one project per guest OS. The globalsetup + // provisions the guest, launches the bundle with --remote-debugging-port, + // and forwards it; the scenario connects, drives, and records the console. + // Each lands in runs//. Not in the default `npm run test` chain — + // run with `vitest run --project desktop-macos` (or desktop-linux). The VM + // is provisioned automatically; set E2E_DESKTOP_VM_IP to attach to an + // already-running guest instead. + // macos/linux provision a tart guest and build+push the ~450MB bundle; + // windows ATTACHES to a long-lived dockur guest over an SSH jump (no + // provision), so it needs no build but the same generous hooks. + ...(["macos", "linux", "windows"] as const).map((os) => + project(`desktop-${os}`, { + include: ["desktop-vm/**/*.test.ts"], + fileParallelism: false, + testTimeout: 300_000, + hookTimeout: 900_000, + }), + ), // The single-user local app. Each scenario launches its OWN `executor // web` via the CLI on a throwaway data dir + an OS-assigned port, so // there is no shared instance and scenarios are independent. 
Files run From 456f125b02354282565068d86b8d070b289d6e57 Mon Sep 17 00:00:00 2001 From: Rhys Sullivan <39114868+RhysSullivan@users.noreply.github.com> Date: Sun, 28 Jun 2026 12:25:18 -0700 Subject: [PATCH 2/2] Make spec-detected auth methods immutable in the add flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A method seeded from spec/probe detection rendered the full None/API key/ OAuth selector and editable fields, so a user could silently switch a detected method (e.g. a Bearer-token API) to OAuth with empty endpoints and end up with a broken integration. Detected methods now render read-only in a disabled state: a lock icon, the auth kind named explicitly (OAuth / API key / No auth, since MCP seeds a detected method as just "Detected"), the declared config shown read-only, and "Pulled from spec. Remove to override." The only action is to remove the row and add a custom one; hand-added methods stay fully editable. This lives in the shared AuthMethodListEditor, so the OpenAPI and MCP add flows both get it. Detection is keyed off an explicit row flag rather than the seed slug, since MCP seeds a detected method without one. Also fix PlacementLine: its inline-flex container trimmed the whitespace at child edges, dropping the space after "Authorization:" and the trailing space in a "Bearer " prefix and rendering "Authorization:Bearer••••••". Render it as plain inline with preserved whitespace. 
--- .../detected-auth-immutable-ui.test.ts | 185 ++++++++++++++++++ .../components/auth-method-list-editor.tsx | 159 ++++++++++++--- packages/react/src/lib/auth-placements.tsx | 7 +- 3 files changed, 319 insertions(+), 32 deletions(-) create mode 100644 e2e/selfhost/detected-auth-immutable-ui.test.ts diff --git a/e2e/selfhost/detected-auth-immutable-ui.test.ts b/e2e/selfhost/detected-auth-immutable-ui.test.ts new file mode 100644 index 000000000..2b9bfa795 --- /dev/null +++ b/e2e/selfhost/detected-auth-immutable-ui.test.ts @@ -0,0 +1,185 @@ +// Selfhost-only (browser): a spec/probe-DETECTED auth method is immutable in +// the add flow. The shared AuthMethodListEditor renders detected methods as a +// disabled, read-only summary ("Pulled from spec. Remove to override.") with no +// kind selector, so a user can't silently retype the spec's method into a kind +// nothing backs. A method the user adds by hand stays fully editable. Both the +// MCP and OpenAPI add flows compose the same editor, so one behavior, two +// surfaces. Selfhost runs with EXECUTOR_ALLOW_LOCAL_NETWORK so the probe/analyze +// can reach the loopback fixtures. Video is the artifact. +import { randomBytes } from "node:crypto"; +import { createServer } from "node:http"; + +import { expect } from "@effect/vitest"; +import { Effect } from "effect"; +import { makeGreetingMcpServer, serveMcpServerWithOAuth } from "@executor-js/plugin-mcp/testing"; +import { OAuthTestServer } from "@executor-js/sdk/testing"; + +import { scenario } from "../src/scenario"; +import { Browser, Target } from "../src/services"; + +const REMOVE_HINT = "Pulled from spec. 
Remove to override."; + +scenario( + "Detected auth · an MCP probe's OAuth method is immutable in the add flow", + {}, + Effect.scoped( + Effect.gen(function* () { + const target = yield* Target; + const browser = yield* Browser; + // OAuth-protected server: the probe 401s with resource metadata, so the + // method list seeds a single detected OAuth row (discovered metadata). + const server = yield* serveMcpServerWithOAuth( + () => makeGreetingMcpServer({ name: `oauth-mcp-${randomBytes(3).toString("hex")}` }), + { path: "/mcp" }, + ); + const identity = yield* target.newIdentity(); + + yield* browser.session(identity, async ({ page, step }) => { + await step("Open the add-MCP flow pointed at the OAuth server", async () => { + await page.goto(`/integrations/add/mcp?url=${encodeURIComponent(server.endpoint)}`, { + waitUntil: "networkidle", + }); + await page.getByText("How does this server authenticate?").waitFor(); + await page.getByText("Method 1 · Detected").waitFor(); + }); + + await step("The detected method is locked: read-only, named, no selector", async () => { + // The kind is named explicitly ("OAuth"), the discovered-OAuth summary + // and override hint sit inside a disabled block, and there is NO + // editable kind selector (the FilterTabs render as buttons). 
+ await page.getByText("OAuth", { exact: true }).first().waitFor(); + await page.getByText("OAuth metadata is discovered from this server").waitFor(); + await page.getByText(REMOVE_HINT).waitFor(); + expect( + await page.locator("[aria-disabled]").count(), + "the detected method renders a disabled (non-interactive) block", + ).toBeGreaterThan(0); + expect( + await page.getByRole("button", { name: "API key", exact: true }).count(), + "no editable kind selector is shown for the detected method", + ).toBe(0); + }); + + await step("A hand-added method keeps the full editable selector", async () => { + await page.getByRole("button", { name: "Add method" }).click(); + await page.getByText("Method 2").waitFor(); + // The added row defaults to API key and exposes the None/API key/OAuth + // kind tabs (buttons) — the detected row above still shows none. + expect( + await page.getByRole("button", { name: "API key", exact: true }).count(), + "the added method exposes the kind selector", + ).toBeGreaterThan(0); + await page.getByText(REMOVE_HINT).waitFor(); + }); + }); + }), + ).pipe(Effect.provide(OAuthTestServer.layer())), +); + +/** A real 127.0.0.1 server that serves a static OpenAPI spec for the add flow. */ +const serveSpec = (body: string) => + Effect.acquireRelease( + Effect.callback<{ readonly url: string; readonly close: () => void }>((resume) => { + const server = createServer((_request, response) => { + response.writeHead(200, { "content-type": "application/json" }); + response.end(body); + }); + server.listen(0, "127.0.0.1", () => { + const address = server.address(); + const port = typeof address === "object" && address ? 
address.port : 0; + resume( + Effect.succeed({ + url: `http://127.0.0.1:${port}/spec.json`, + close: () => { + server.close(); + server.closeAllConnections(); + }, + }), + ); + }); + }), + (server) => Effect.sync(server.close), + ); + +const apiKeyAndOAuthSpec = (): string => + JSON.stringify({ + openapi: "3.0.3", + info: { title: "Acme Immutable Auth Fixture", version: "1.0.0" }, + servers: [{ url: "https://api.acme.test" }], + security: [{ bearerAuth: [] }, { acmeOAuth: ["read"] }], + components: { + securitySchemes: { + bearerAuth: { type: "http", scheme: "bearer" }, + acmeOAuth: { + type: "oauth2", + flows: { + authorizationCode: { + authorizationUrl: "https://api.acme.test/oauth/authorize", + tokenUrl: "https://api.acme.test/oauth/token", + scopes: { read: "Read access" }, + }, + }, + }, + }, + }, + paths: { + "/widgets": { + get: { + operationId: "listWidgets", + summary: "List widgets", + responses: { "200": { description: "ok" } }, + }, + }, + }, + }); + +scenario( + "Detected auth · OpenAPI spec-detected methods are immutable in the add flow", + {}, + Effect.scoped( + Effect.gen(function* () { + const target = yield* Target; + const browser = yield* Browser; + const spec = yield* serveSpec(apiKeyAndOAuthSpec()); + const identity = yield* target.newIdentity(); + + yield* browser.session(identity, async ({ page, step }) => { + await step("Analyze a spec that declares both API key and OAuth", async () => { + await page.goto(`/integrations/add/openapi`, { waitUntil: "networkidle" }); + await page + .getByPlaceholder(/openapi\.json/i) + .first() + .fill(spec.url); + await page.getByText("How does this API authenticate?").waitFor(); + await page.getByText("Method 2").waitFor(); + }); + + await step("Both detected methods are locked, named, read-only", async () => { + // Two detected methods, each with the override hint and its kind named + // ("API key" / "OAuth"); the OAuth one shows the spec's real endpoints + // read-only. 
No editable kind selector (FilterTabs render as buttons). + expect( + await page.getByText(REMOVE_HINT).count(), + "both detected methods show the remove-to-override hint", + ).toBe(2); + await page.getByText("https://api.acme.test/oauth/authorize").waitFor(); + await page.getByText("API key", { exact: true }).first().waitFor(); + await page.getByText("OAuth", { exact: true }).first().waitFor(); + expect( + await page.getByRole("button", { name: "OAuth", exact: true }).count(), + "no editable kind selector is shown for the detected methods", + ).toBe(0); + }); + + await step("A hand-added method keeps the full editable selector", async () => { + await page.getByRole("button", { name: "Add method" }).click(); + await page.getByText("Method 3").waitFor(); + expect( + await page.getByRole("button", { name: "API key", exact: true }).count(), + "the added method exposes the kind selector", + ).toBeGreaterThan(0); + }); + }); + }), + ), +); diff --git a/packages/react/src/components/auth-method-list-editor.tsx b/packages/react/src/components/auth-method-list-editor.tsx index 49b657629..45b1344e1 100644 --- a/packages/react/src/components/auth-method-list-editor.tsx +++ b/packages/react/src/components/auth-method-list-editor.tsx @@ -12,8 +12,9 @@ // --------------------------------------------------------------------------- import { useCallback, useEffect, useRef, useState } from "react"; -import { PlusIcon, XIcon } from "lucide-react"; +import { LockIcon, PlusIcon, XIcon } from "lucide-react"; +import { PlacementLine } from "../lib/auth-placements"; import { Button } from "./button"; import { FieldLabel } from "./field"; import { @@ -35,6 +36,11 @@ export interface AuthMethodSeed { export interface AuthMethodRow { readonly value: AuthTemplateEditorValue; + /** True when this row came from detection (a seed), false when the user added + * it. Detected rows are immutable — the spec/probe declared them — so the + * editor renders them read-only. 
Not inferred from `seedSlug`: some plugins + * (MCP) seed a detected method with a label but no slug. */ + readonly seeded: boolean; readonly seedSlug?: string; readonly seedLabel?: string; } @@ -59,6 +65,7 @@ export function useAuthMethodList(seeds: readonly AuthMethodSeed[]): AuthMethodL seeds.map( (seed: AuthMethodSeed): AuthMethodRow => ({ value: seed.value, + seeded: true, ...(seed.slug !== undefined ? { seedSlug: seed.slug } : {}), ...(seed.label !== undefined ? { seedLabel: seed.label } : {}), }), @@ -79,7 +86,10 @@ export function useAuthMethodList(seeds: readonly AuthMethodSeed[]): AuthMethodL }, []); const addRow = useCallback(() => { - setRows((current: readonly AuthMethodRow[]) => [...current, { value: emptyApiKeyValue() }]); + setRows((current: readonly AuthMethodRow[]) => [ + ...current, + { value: emptyApiKeyValue(), seeded: false }, + ]); }, []); return { rows, setRowAt, removeRowAt, addRow }; @@ -115,36 +125,53 @@ export function AuthMethodListEditor(props: AuthMethodListEditorProps) { ) : null ) : (
- {list.rows.map((row: AuthMethodRow, index: number) => ( -
-
- - Method {index + 1} - {row.seedLabel ? ` · ${row.seedLabel}` : ""} - - + {list.rows.map((row: AuthMethodRow, index: number) => { + // A row seeded from detection is the spec's own auth declaration: + // it's IMMUTABLE here. We render it read-only (no kind selector, no + // editable fields) so a user can't silently retype the spec's + // method into something nothing backs (e.g. flipping a Bearer-token + // API to OAuth with empty endpoints). The escape hatch is to remove + // the row and add a custom one. Manually added rows (no seed) get + // the full editor. + const detected = row.seeded; + return ( +
+
+ + {detected ? : null} + + Method {index + 1} + {row.seedLabel ? ` · ${row.seedLabel}` : ""} + + + +
+ {detected ? ( + + ) : ( + list.setRowAt(index, next)} + {...(allowedKinds ? { allowedKinds } : {})} + {...(presets ? { presets } : {})} + {...(oauthMetadata ? { oauthMetadata } : {})} + /> + )}
- list.setRowAt(index, next)} - {...(allowedKinds ? { allowedKinds } : {})} - {...(presets ? { presets } : {})} - {...(oauthMetadata ? { oauthMetadata } : {})} - /> -
- ))} + ); + })}
)} {list.rows.length > 0 && props.footerHint ? ( @@ -153,3 +180,73 @@ export function AuthMethodListEditor(props: AuthMethodListEditorProps) { ); } + +/** One read-only `label value` line, mono value, for the detected summary. */ +function SpecField(props: { readonly label: string; readonly value: string }) { + return ( +
+ {props.label} + {props.value} +
+ ); +} + +/** Read-only view of a spec-detected method: shows what the spec declared + * (placements / OAuth endpoints) as a DISABLED, non-interactive block. The + * detected method is immutable here, so the summary is styled like a disabled + * field (muted, not-allowed cursor, text not selectable) to communicate that + * plainly. The only action is to remove the row (the header's X) and add a + * custom method to override. */ +function DetectedMethodSummary(props: { + readonly value: AuthTemplateEditorValue; + readonly oauthMetadata?: "editable" | "discovered"; +}) { + const { value, oauthMetadata } = props; + // Name the auth kind explicitly: a detection label like MCP's "Detected" + // doesn't say whether it's OAuth or an API key, so surface it here. + const kindLabel = + value.kind === "oauth" ? "OAuth" : value.kind === "apikey" ? "API key" : "No auth"; + return ( +
+

+ {kindLabel} +

+
+ {value.kind === "none" && ( +

No credential — tools are callable without an account.

+ )} + + {value.kind === "apikey" && + (value.placements.length > 0 ? ( +
+ {value.placements.map((placement, i: number) => ( + + ))} +
+ ) : null)} + + {value.kind === "oauth" && + (oauthMetadata === "discovered" ? ( +

+ OAuth metadata is discovered from this server when you connect an account. +

+ ) : ( +
+ {value.authorizationUrl ? ( + + ) : null} + {value.tokenUrl ? : null} + {value.scopes.length > 0 ? ( + + ) : null} +
+ ))} +
+ +

Pulled from spec. Remove to override.

+
+ ); +} diff --git a/packages/react/src/lib/auth-placements.tsx b/packages/react/src/lib/auth-placements.tsx index c4f49d36e..71a3383a2 100644 --- a/packages/react/src/lib/auth-placements.tsx +++ b/packages/react/src/lib/auth-placements.tsx @@ -106,8 +106,13 @@ export function PlacementLine(props: { readonly placement: Placement; readonly m : placement.carrier === "env" ? `${placement.name || "TOKEN"}=` : `?${placement.name || "api_key"}=`; + // Plain inline (not inline-flex): flex trims the whitespace at the edges of + // each child, which would drop the space after "Authorization:" and the + // trailing space carried by a prefix like "Bearer ", rendering + // "Authorization:Bearer••••••". whitespace-pre-wrap keeps those spaces while + // still allowing the line to wrap. return ( - + {lead} {placement.prefix ? ( {placement.prefix}