diff --git a/.cursor/skills/proof/SKILL.md b/.cursor/skills/proof/SKILL.md index 5d26228f..6bafdfe4 100644 --- a/.cursor/skills/proof/SKILL.md +++ b/.cursor/skills/proof/SKILL.md @@ -256,8 +256,8 @@ set -a && source .env && set +a - Local runtime only — every subagent runs against `--cwd` (defaults to wherever you invoke the runner). - Sibling tasks in the same rank run in parallel; do not let them write the same files. - Inline MCP servers and sub-sub-agents are not configured by this runner. -- A failed task automatically skips all downstream dependents (they are marked `ERROR` with a "Skipped: upstream task(s) … failed" message). This prevents wasted API calls on tasks whose inputs are missing. -- Per-task streamed text is capped at `STREAM_CAP = 4000` chars to keep the canvas file modest. Upstream context passed to child tasks is capped at 2000 chars per parent, with section-aware truncation when the parent output contains multiple `##` sections. +- A failed upstream task skips downstream dependents (`ERROR` with `Skipped:` when any upstream is **`ERROR`** or **`BUDGET-EXCEEDED`**). +- Canvas-inlined streamed text stays bounded (**`CANVAS_DISPLAY_CAP = 4000`** tail per task plus the existing `[...truncated N earlier chars...]` banner). For `kind: 'task'`, child prompts, in-process convergence loops, findings sidecars, and artifact markdown use a separate **execution transcript**; resumed runs can reconstruct it when the same `--full-output-dir` is reused and `transcriptPath` points at the mirrored stream file. Pause/oracle tasks still use their bounded status/output text. Upstream excerpts default to the same **2000-char section-aware policy** as before, now with explicit counted banners when trimming. Set **`DAG.outputPolicy.upstream`** to **`"full"`** to stitch full parent transcripts (mind model context limits). - Timed-out tasks are marked `ERROR` instead of staying indefinitely in `RUNNING`. 
- SIGINT/SIGTERM/SIGHUP gracefully cancel all in-flight subagents and finalize the canvas before exiting. - Unexpected unhandled rejections from SDK internals are suppressed to prevent runner crashes; uncaught exceptions are logged and trigger a clean shutdown. diff --git a/docs/proposals/proof-output-retention-judge.md b/docs/proposals/proof-output-retention-judge.md new file mode 100644 index 00000000..86116fa3 --- /dev/null +++ b/docs/proposals/proof-output-retention-judge.md @@ -0,0 +1,92 @@ +# Adversarial review: `proof-output-retention-plan.md` + +Author: Opus 4.7 (replaces the prior judge draft). +Scope: read-only review of the plan against the actual code in `packages/proof/src/**`. + +This review is grounded in spot checks of `run_dag.ts`, `canvas_writer.ts`, `findings_sidecar.ts`, `converge_loop.ts`, `oracle_task.ts`, `self_hosting.ts`, `pause_task.ts`, and `dag.ts` at HEAD. Every finding cites the file/symbol it relies on. + +--- + +## Blockers + +None. + +The plan is implementable as written. The objections below are correctness and scope risks that should reshape Phase 1–3, not gates that prevent the work from starting. + +--- + +## High-severity findings + +1. **The forward-compatibility claim for `version: 2` `PersistedRunState` is false as stated.** Phase 3 says: "the `version: 1` reader is kept long enough that an older runner can still read newer files via the legacy fallback (though the converse is not guaranteed and is documented)." `self_hosting.ts` `readPersistedRunState` is hard-gated on `obj.version !== 1` and throws `Resume state ${path} has unsupported version.` Any already-shipped runner reading a `version: 2` file will fail at startup — it cannot fall back to anything because the payload schema changes shape (`state.tasks[].resultText` becomes optional, supplanted by `transcript: { kind, ... }`). 
This matters for the supervisor + `EXIT_RUNNER_RESTART` (75) flow: a mid-rolling-upgrade scenario where an older runner reads a state file written by a newer process will hard-fail instead of resume. The plan needs to either (a) drop the forward-compat sentence, document v2 as a one-way cutover gated by a release boundary, and add an explicit migration test that the _new_ runner reads v1 (already covered) but not the inverse; or (b) ship a Phase 0 / point-release patch that loosens the existing v1 reader to tolerate unknown future versions and degrade gracefully. (a) is the honest path; (b) requires lead time the plan does not budget. + +2. **`BUDGET-EXCEEDED` parents do not propagate to dependent children, and Phase 2 turns that latent gap into a hot path.** In `run_dag.ts`, `runOne` skips a task only when an upstream is `'ERROR'`: + + ```ts + const failedDeps = task.depends_on.filter((depId) => { + const dep = stateById.get(depId); + return dep !== undefined && dep.status === 'ERROR'; + }); + ``` + + `BUDGET-EXCEEDED` is intentionally not in that set (see the comment in `markRunTerminated`: "BUDGET-EXCEEDED is a terminal status the convergence loop sets explicitly; do not stomp it into a generic ERROR on shutdown"). Today this is mostly invisible because the only producer of `BUDGET-EXCEEDED` is the convergence task at the tail of a loop — it has no DAG-typed children. Phase 2 of the plan introduces a new producer: `runTask`'s pre-dispatch overflow check, which can mark _any_ mid-DAG task `BUDGET-EXCEEDED` because its stitched prompt exceeds `outputPolicy.maxPromptChars`. Once that lands, every task downstream of the budget-exceeded one will: + + - be considered runnable by `runOne` (no `failedDeps` hit), and + - call `buildUpstreamContext` against a parent whose `resultText` is undefined (the task never dispatched), so the dependency renders as `'(no output)'` and the child runs against a silently-broken DAG. 
+ + The plan should either extend `failedDeps` (and `isResumeTerminalStatus`) to treat `'BUDGET-EXCEEDED'` as a skip-trigger for descendants, or document a different propagation policy. Either way, this is a Phase 2 prerequisite, not a follow-up. + +3. **The new `--restart-on-runner-change` precondition is a silent compatibility break for today's supervisor.** The constraint matrix under "Resume / supervisor" adds: "The supervisor and runner both refuse to start if `--restart-on-runner-change` is set without either a pinned `--full-output-dir` or a non-default `--max-in-memory-output-bytes`." That is a behavior change. The current `parseArgs` defaults `statePath` to `.proof/run-state.json` _because_ `--restart-on-runner-change` was set (see the `restartOnRunnerChange ? resumeState ?? '.proof/run-state.json'` branch), with no requirement to pass `--full-output-dir`. Existing supervisor invocations that rely on the timestamped default artifact directory (`.flatbread/artifacts/dag--/` per `defaultArtifactsDir`) would start failing once Phase 3 lands. The plan calls this out as a docs change ("New guidance in `README.md`…") but treats the refusal as load-bearing for the resume-input-parity acceptance criterion. The refusal should be either downgraded to a warning + automatic fallback to gzipped inline transcripts (`kind: 'inline'`), or staged behind an explicit opt-in flag with a deprecation window. As written, "refuse to start" plus "supervisor pin advice" plus "default still timestamped" is internally inconsistent and will trip real users on the first deploy. + +4. **The plan misdescribes how today's truncation banner travels through `buildUpstreamContext`, and that mistake hides a real correctness gap.** Ground-truth section 2 says: "the tail buffer's own `[...truncated N earlier chars...]` banner does travel into the upstream block when present, so the child does see a signal that the parent was capped." 
The code in `truncateUpstreamSnippet` / `parseUpstreamSections` says otherwise for the most common case. `BoundedTextBuffer.render()` returns `[...truncated ${droppedChars} earlier chars...]\n${data}`, i.e. the banner is the _first_ line, _before_ `data`. When the parent is a reviewer-style task whose output begins with `## Blockers` / `## High-severity findings` (the very pattern `--converge-on` keys on), `parseUpstreamSections` reaches the heading on the second line, and the banner falls into the documented "Lines before the first `## ` heading are intentionally dropped" branch. The child therefore sees a section-aware excerpt with no truncation banner at all — a child prompt that today silently conceals the fact the parent's prefix was lost. This is an _existing_ correctness bug (and an instance of the plan's thesis), but the plan's "Ground truth" describes it the wrong way and the Phase 1 design relies on the wrong model: it routes the visible-banner fix only through the _new_ `applyUpstreamPolicy` boundary, not through preserving the `BoundedTextBuffer` banner inside `parseUpstreamSections` for the legacy default. Phase 1 acceptance criterion 7 ("a grep-for-`…` test fails the legacy silent-ellipsis path") will pass even when the new `summarize` default still drops the banner inside section-aware mode, because the banner survives only on the `truncate(text, cap)` fall-through. The plan should (a) correct the ground-truth description, and (b) require the section-aware path to preserve any `[...truncated N earlier chars...]` preamble as a synthetic kept section even when it appears before the first `## ` heading. + +--- + +## Medium-severity findings + +1. **The post-loop final-findings extraction in `runConvergenceLoop` is not in Phase 1's deliverables.** Phase 1 lists "`runConvergenceLoop` updated so **both** `extractConvergenceFindings` and `buildConvergenceContext` read from the transcript store." 
There is a third call site in the same function: after the iteration loop exits, `runConvergenceLoop` re-extracts `extractConvergenceFindings(finalSidecarText ?? convergeTs.resultText)` to decide whether to flip the convergence task to `BUDGET-EXCEEDED`. If the transcript-store rewire skips this call, a long final-iteration reviewer whose `## Blockers` lands past the legacy 4000-char cap will silently terminate as `FINISHED`-then-clean even though the underlying evidence still has blockers. Phase 1's criterion 2 covers detection _during_ the loop; it does not cover the post-loop check. + +2. **The sidecar fallback is lossy in ways that defeat its use as a `buildConvergenceContext` source.** Phase 1's source-of-truth table lists "the `--findings-dir` sidecar continues to be written from the same source so external tooling has a stable JSON form, but it is no longer required for parser correctness inside the runner — it is required only when the **runner process** has restarted." Phase 3 then folds the sidecar into the cross-process fallback chain for the transcript store ("falling back to sidecar (across-process boundary, e.g. resume), falling back to `resultText` (legacy resume)"). The sidecar is not a lossless mirror of the transcript: `findings_sidecar.ts` `parseSections` keys on `## ` headings and discards every line before the first heading, then trims trailing whitespace per section (`out[currentHeading] = currentLines.join('\n').trim()`); and `readFindingsSidecarAsText` reconstructs `## Heading\n${body}` joined by `\n\n`, which is not byte-identical to the original transcript even when the original was perfectly heading-shaped. Routing `buildConvergenceContext` through this fallback after a resume will produce ancestor prompts that differ from the in-process path on whitespace and on any preamble content (including, per high-sev #4, the truncation banner). 
Phase 3's "input parity across restart" acceptance criterion will only catch this if the parity test explicitly forces the sidecar fallback path on the resumed process. + +3. **The Phase 1 canvas-size envelope test will be flaky on benign DAGs.** Phase 1 criterion 6 asserts: "for a fixture of 5 tasks × 12 000-char outputs, the generated `.canvas.tsx` file size remains below `5 * CANVAS_DISPLAY_CAP + 64 KiB`." The static template hardcoded in `canvas_writer.ts` (`HEADER + BODY` template strings around line 209 onward) is already comfortably over 20 KiB on its own, and `JSON.stringify(state, null, 2)` embeds every `subtask_prompt`, `depends_on` array, model selection, and oracle command for every task — none of which are bounded by `STREAM_CAP` today and none of which are bounded by the new `CANVAS_DISPLAY_CAP`. A DAG with 5 tasks whose `subtask_prompt` is, say, 8 KiB each (entirely realistic for the instruction-heavy tasks the proof package targets) blows past `5 * 4000 + 64 KiB` purely from prompt content. Either rebase the envelope on `O(static_template + Σ subtask_prompt + Σ CANVAS_DISPLAY_CAP) + slack`, or add a separate `CANVAS_PROMPT_DISPLAY_CAP` for `subtask_prompt` and document it. + +4. **Phase 4's "oracle in `--no-artifacts` mode marks `BUDGET-EXCEEDED` on in-memory overflow" is not implementable without changing `oracle_task.ts` `execShell`.** `execShell` accumulates `stdout` and `stderr` as plain `string` concatenations on each `'data'` event, with no size accounting and no cancellation hook. By the time `runOracleTask` finishes and would consult the `--max-in-memory-output-bytes` ceiling, the memory was already consumed. To honor the ceiling pre-emptively, `execShell` must (a) track running byte counts per stream, and (b) `kill('SIGTERM')`/escalate on threshold crossing. The plan does not budget that change. The acceptance criterion "the oracle is marked BUDGET-EXCEEDED" can only be honored _after_ the oracle command completed — i.e. 
after the OOM risk it was meant to prevent already happened. + +5. **Per-chunk `appendFile` to `${taskId}.stream.txt` will dominate the stream loop on real workloads.** Phase 1's "Append-only mirror to `${fullOutputAbsoluteDir}/${taskId}.stream.txt`" runs from inside the `runTask` `while (true)` stream loop, where today the only persistent work is `buffer.append(block.text)`, `fullStreamChunks.push(block.text)`, and a throttled `publishIfDue`. The SDK assistant stream emits text blocks at sub-millisecond intervals during long generations; per-block `fs.appendFile` invocations are `open + write + close` per call (three syscalls per text block, plus a per-call `Promise` allocation in the hot path). On a 100k-block stream that is 300k syscalls plus 100k `await` points dropping into the event loop. The plan's risk-table mitigation ("best-effort … failures are logged and the task is flagged but not aborted") covers correctness on write failure; it does not cover throughput. The deliverable should specify a coalesced strategy — open a `FileHandle` once, accumulate chunks behind the same `streamPublishMs` throttle as `publishIfDue`, and write on flush — and the design must reckon with what happens to the in-memory tail when the file write is slower than the stream. + +6. **`writeFindingsSidecar`'s call site does not have access to the new transcript store.** Phase 1 says: "`findings_sidecar.ts` `writeFindingsSidecar` reads from the transcript store and emits `sections` keyed identically to today." `writeFindingsSidecar` is currently called from the `dispatchTask` closure in `run_dag.ts`'s `main()` with the signature `writeFindingsSidecar(findingsAbsoluteDir, ts)` — it gets a `TaskState`, not a transcript store handle. The plan does not specify the access pattern (DI parameter, module-level singleton, or pass-through at call sites) and the choice has knock-on effects on Phase 3 (a singleton must be re-hydrated before `runConvergenceLoop` runs in a resumed process). 
This needs to land in Phase 0's "decision document", not be deferred to Phase 1 implementation. + +7. **`applyUpstreamPolicy`'s `summarize` mode silently drops freeform preamble — a regression that becomes more visible once transcripts are no longer pre-truncated.** `parseUpstreamSections` documents the pre-heading drop: "Lines before the first `## ` heading are intentionally dropped — the section-aware truncate only applies to outputs that lead with a heading; freeform preludes fall through to `truncate()`." Today this is masked because `STREAM_CAP=4000` already discarded most of the prefix. After Phase 1, the parent's transcript is the full stream, so a parent that emits (say) 6 KiB of preamble before its first `## Heading` will silently drop all 6 KiB on the section-aware path, with no banner accounting for it. The plan's "visible counted banner" mitigates the _cap-driven_ drop but not this _structural_ drop. Either preserve the preamble as a synthetic "(preamble)" section, or change the heuristic so a preamble larger than N% of the cap routes to the slice fall-through. + +8. **The plan does not budget for the test infrastructure cost of the stitched-prompt assertions.** Phase 1 acceptance criteria 1, 3, 4, and 7 all assert against the stitched prompt string passed to `agent.send(stitched)` in `runTask`. The current bounded-loop suite under `packages/proof/src/__tests__/loops.test.ts` exercises pure helpers (`extractConvergenceFindings`, etc.) — there is no fake `Agent.create` in the repo today, no scripted async-iterator harness, and no recorded fixture format for stream chunks. Phase 1's deliverables should include a "harness" deliverable (a fake `@cursor/sdk` `Agent` surface plus a fixture-driven `RunnerTaskRun` factory) so the acceptance criteria are wired to runnable tests rather than aspirations. + +9. **`enforceTokenBudget` is not the right place to absorb the new prompt overflow path.** Phase 2 says "this reuses the existing `BUDGET-EXCEEDED` exit path". 
The existing path runs after every rank's `Promise.all` completes (`enforceTokenBudget(state, dag.budget)`), throws `BudgetExceededError`, and unwinds. The new prompt-overflow path runs _inside_ `runTask` _before_ `agent.send`, must mark a single task `BUDGET-EXCEEDED` without throwing (so siblings continue), and depends on per-task accounting that has nothing to do with `maxTokensTotal`. The plan's "reuses the existing path" framing collapses two distinct control-flow lanes. Either the implementation needs a new per-task short-circuit that mirrors the convergence loop's `convergeTs.status = 'BUDGET-EXCEEDED'` pattern, or the plan should commit to a separate `EXIT_PROMPT_BUDGET_EXCEEDED` and own the precedence rules vs. token-budget exit code 4. + +10. **The Phase 0 consumer inventory is missing the `_index.md` / `persistTaskMarkdownFile` writer chain.** Phase 0 lists every read site of `TaskState.resultText`, but `persistTaskMarkdownFile` is called from `runOne` for `kind: 'pause'` / `kind: 'oracle'` with `ts.resultText ?? ''` _and_ from `runTask`'s `finally` for `kind: 'task'` with `fullStreamChunks.join('')` — i.e. there are _two_ artifact-writing branches today, each reading from a different source. After Phase 1 retires `fullStreamChunks`, the call-site contract for `persistTaskMarkdownFile` must be unified, and the inventory should name both branches up front so the unification is intentional rather than incidental. + +--- + +## Recommended adjustments + +1. **Drop the v1↔v2 "older runner can read newer files" sentence in Phase 3 and replace it with an explicit cutover policy.** Add a one-line acceptance criterion that the v1 reader is still strict about `version: 1` (so future readers are forced to broaden the check intentionally), and document the supervisor-restart implication for rolling deploys. + +2. 
**Make `BUDGET-EXCEEDED` propagate to dependents before Phase 2 lands.** Concretely: extend `failedDeps` in `runOne` to include `'BUDGET-EXCEEDED'` upstreams, and add a Phase 1 acceptance criterion asserting that a child of a `BUDGET-EXCEEDED` parent ends in `'ERROR'` with a "skipped: upstream budget exceeded" message. This is also the right time to revisit `isResumeTerminalStatus` (already includes `'BUDGET-EXCEEDED'`) so resume semantics line up with the new skip semantics. + +3. **Reframe the `--restart-on-runner-change` change as additive.** Default behavior should remain "start regardless"; add a new opt-in (e.g. `--require-pinned-artifacts-on-restart`, or fold it into a future `--strict` flag). The plan can keep its supervisor pin _recommendation_ without making the absence of `--full-output-dir` a startup failure. + +4. **Correct the "Ground truth" description of the truncation banner's path through `buildUpstreamContext`, and add a Phase 1 deliverable that preserves the banner as a synthetic kept section** (or routes section-aware truncation to a code path that prepends the banner unconditionally). This is a one-symbol fix in `parseUpstreamSections` / `renderUpstreamSections` with outsized signal value for the rest of the plan. + +5. **Add the post-loop final-findings extraction to Phase 1's `runConvergenceLoop` rewire.** Either name it explicitly in the deliverables list, or add an acceptance criterion that the `BUDGET-EXCEEDED` decision is made against the transcript store, not `convergeTs.resultText`. + +6. **Either commit to a lossless on-disk transcript format for cross-process fallback, or restrict the sidecar fallback to `extractConvergenceFindings` only.** The Phase 3 input-parity test should explicitly include a leg that forces the resumed process to use the sidecar (not the in-memory reconstruction) and asserts byte-identical stitched prompts. 
If that leg cannot be made to pass without a separate raw-transcript file, add the raw-transcript file (e.g. `${taskId}.transcript.txt`) to the Phase 1 artifact set and have the sidecar continue to be a derived index, not a fallback source. + +7. **Replace the canvas-size envelope formula with one that includes `subtask_prompt` mass.** A safe formulation: `static_template_bytes + Σ_t (CANVAS_DISPLAY_CAP + |subtask_prompt_t| + per_task_metadata_overhead) + slack`. Pick `slack` empirically against a real DAG fixture and fail the test only on regressions vs. that envelope. + +8. **Specify the stream-file write strategy explicitly.** A concrete sketch the plan can adopt: open a `FileHandle` per task at first chunk; accumulate chunks in a small in-memory buffer; flush on the same `streamPublishMs` cadence as `publishIfDue`; close in the `runTask` `finally`. Document the failure mode when the FS cannot keep up (in-memory buffer grows; trip `--max-in-memory-output-bytes` like any other store). + +9. **Push the oracle full-evidence work into Phase 4 _with_ the `execShell` streaming-cap change, or split it into two phases.** A defensible split: Phase 4a writes `${taskId}.stdout.log` / `${taskId}.stderr.log` from the existing `outcome.stdout`/`stderr` strings (preserves today's memory bound, gains forensic file). Phase 4b adds streaming size accounting in `execShell` with `SIGTERM` on threshold crossing, _only then_ honors `--max-in-memory-output-bytes` for oracles. Without 4b, the `--no-artifacts` ceiling is misleading. + +10. **Disambiguate the prompt-overflow `BUDGET-EXCEEDED` lane from the token-budget lane.** Either name the new exit code (e.g. `EXIT_PROMPT_BUDGET_EXCEEDED = 5`) and add precedence rules to the main-run tally, or explicitly fold the new lane into the existing `EXIT_BUDGET_EXCEEDED = 4` and document the wrapper-script impact. + +11. **Move the `writeFindingsSidecar` access-pattern decision into Phase 0.** Pick singleton vs. 
DI; do not defer to implementation time. Whichever choice you make has direct implications for Phase 3 resume reconstruction. + +12. **Add the `_index.md` / `persistTaskMarkdownFile` branches to the Phase 0 consumer inventory.** They are an additional execution-plane consumer alongside `buildUpstreamContext`, `extractConvergenceFindings`, `buildConvergenceContext`, the sidecar, and the persisted state. The inventory is otherwise complete. diff --git a/docs/proposals/proof-output-retention-plan.md b/docs/proposals/proof-output-retention-plan.md new file mode 100644 index 00000000..54847d01 --- /dev/null +++ b/docs/proposals/proof-output-retention-plan.md @@ -0,0 +1,419 @@ +# Proposal: Proof rank/task output retention — fix execution fidelity without breaking the canvas + +Status: Planning — replaces the prior draft. +Author: Opus 4.7 +Scope: `@flatbread/proof` (`packages/proof`) + +--- + +## GitHub issue tracking + +These follow-ups were created from the Cursor cloud-agent review of PR +[#199](https://github.com/FlatbreadLabs/flatbread/pull/199). The issue bodies +link back to this proposal, the adversarial review, and the judge artifact. + +- [#200 — Phase 0/1 follow-up hardening](https://github.com/FlatbreadLabs/flatbread/issues/200) +- [#201 — Phase 2: upstream prompt policy and budget preflight](https://github.com/FlatbreadLabs/flatbread/issues/201) +- [#202 — Phase 3: resume, supervisor, and disk ergonomics](https://github.com/FlatbreadLabs/flatbread/issues/202) +- [#203 — Phase 4: oracle evidence alignment](https://github.com/FlatbreadLabs/flatbread/issues/203) +- [#204 — Phase 5: documentation and skill refresh](https://github.com/FlatbreadLabs/flatbread/issues/204) + +--- + +## TL;DR + +Today, the runner in `packages/proof/src/run_dag.ts` stores each task's assistant output through a `BoundedTextBuffer(STREAM_CAP=4000)` that **drops the leading characters** as the stream grows past the cap.
That bounded string is the **only** copy used for: the canvas `STATE` literal, the parent context stitched into child prompts (capped a second time at `UPSTREAM_SNIPPET_CAP=2000` by `buildUpstreamContext` / `truncateUpstreamSnippet`), the `--findings-dir` JSON sidecar payload, the convergence `extraContext` re-injection, and the persisted `--state-path` snapshot. Only the per-task `${taskId}.md` artifact, written from a separate uncapped `fullStreamChunks` array in `runTask`, ever retains the complete stream — and only when artifacts are enabled. + +This is the "rank/task output truncation limitation" we are paying down. The fix is **not** "remove all caps". The fix is to **split the execution plane from the display plane** so that: + +- Decisions the runner makes on a user's behalf — what to put in a child's prompt, what counts as a `## Blockers` finding, what convergence re-runs see, what resume hands back to a relaunched process — read from an **execution-authoritative full transcript** that the runner persists for the duration of the run. +- The `.canvas.tsx` file the IDE hot-recompiles, the persisted state JSON the supervisor reloads, and the `extraContext` that lands inside a model prompt each consume **explicit, named excerpts** of that transcript with documented size policies and visible truncation banners. No layer is permitted to feed a downstream consumer a silently-truncated string and pretend the rest never existed. + +The work is staged across five phases. Phase 0 just nails the contracts. Phase 1 (the load-bearing one) rebuilds the runner's per-task storage and rewires the **four** existing consumers of `ts.resultText` so that "complete" sources stay complete and "bounded" sources are explicitly bounded with banners. Phase 2 handles upstream-prompt budgets honestly. Phase 3 covers resume/supervisor schema growth. Phase 4 aligns oracle evidence. Phase 5 refreshes docs. 
+ +--- + +## Ground truth (what the code actually does today) + +These are the load-bearing facts the rest of this document is built on. Every claim points to a file/symbol in the package. + +### 1. Two parallel buffers in `runTask`, only one is bounded + +`packages/proof/src/run_dag.ts` `runTask` (the `kind: 'task'` path) maintains both: + +- `const buffer = new BoundedTextBuffer(STREAM_CAP);` (`STREAM_CAP = 4000`). On `append`, when the cumulative chunk length exceeds the cap, `BoundedTextBuffer` does `this.data = this.data.slice(overflow)` and tracks `droppedChars`. `render()` returns either the raw data or `[...truncated ${droppedChars} earlier chars...]\n${data}`. This buffer feeds `ts.resultText` via `publishIfDue` (live) and the final assignments in the success and error branches of `runTask`. +- `const fullStreamChunks: string[] = [];` — every `block.text` from the assistant stream is appended verbatim. This array is only joined in the `finally` of `runTask` and written to `${taskId}.md` via `persistTaskMarkdownFile` when `options.fullOutputAbsoluteDir` is set. + +Consequence: `ts.resultText` is **always the tail** (with an explicit banner when the prefix was dropped). The complete stream exists only as in-memory chunks for the lifetime of `runTask`, then on disk in `${taskId}.md` (and only when artifacts are not suppressed by `--no-artifacts`). + +### 2. `buildUpstreamContext` reads the bounded buffer, then truncates it again + +`buildUpstreamContext` (same file) walks `task.depends_on`, fetches each parent's `TaskState` from `stateById`, and inlines `dep.resultText` after passing it through `truncateUpstreamSnippet(text, UPSTREAM_SNIPPET_CAP)` with `UPSTREAM_SNIPPET_CAP = 2000`. `truncateUpstreamSnippet` is section-aware: when the text has two or more `## ` headings, it drops sections in `SECTION_DROP_PRIORITY` order; otherwise it falls back to `truncate(text, cap)` which does `s.slice(0, n - 1) + '…'`. 
The section-aware path can also fall through to that final hard slice when no eligible section is droppable. + +Consequence: a child task's prompt is `framing + buildUpstreamContext(...) + extraContext + subtask_prompt`, where the upstream block is **a 2000-char view of the 4000-char tail of the parent's full stream**, glued together with **no visible banner** at the prompt level. The tail buffer's own `[...truncated N earlier chars...]` banner does travel into the upstream block when present, so the child does see a signal that the parent was capped — but the second 2000-char truncate that `truncateUpstreamSnippet` performs ends with `'…'` and no count, which a model will not reliably interpret as "the prompt above is itself truncated". + +### 3. `findings_sidecar.ts` parses the bounded buffer, not the full stream + +`writeFindingsSidecar(findingsDir, ts)` builds `sections: parseSections(ts.resultText ?? '')`. There is no path that reads `fullStreamChunks` or the artifact file. The header comment on `findings_sidecar.ts` correctly states "sidecar is captured at task completion", but "task completion" means after `BoundedTextBuffer` has already dropped the prefix. + +Consequence: the sidecar **cannot** repair `STREAM_CAP` prefix loss. It can only stabilize parsing against in-flight canvas updates: by writing once at `dispatchTask` completion, it avoids the race where `extractConvergenceFindings` reads `ts.resultText` mid-stream. The earlier draft's "sidecars compensate for truncation" framing was incorrect on this point and is dropped here. + +### 4. `runConvergenceLoop` reads the sidecar **for findings extraction only**, not for `extraContext` + +In `run_dag.ts`, `runConvergenceLoop` calls `readFindingsSidecarAsText(...)` and feeds the result (falling back to `convergeTs.resultText`) into `extractConvergenceFindings`. But the very next call, `buildConvergenceContext(convergeOn, iter, convergeTs.resultText)`, is unconditionally passed `convergeTs.resultText`. 
The resulting string is threaded through `dispatchTask(task, { extraContext: convergenceContext })` for every re-executed ancestor. + +Consequence: even with `--findings-dir` set, **ancestor re-runs still see only the bounded tail** of the reviewer's output as their "Convergence feedback from … (iteration N-1)" preamble. This is a second, independent truncation surface beyond the prompt-stitch issue in (2), and it is the precise bug the adversarial review identified. + +### 5. Canvas inlines the full `RunState` + +`canvas_writer.ts` `renderCanvasSource` builds the canvas with `const STATE: RunState = ${JSON.stringify(state, null, 2)};`. There is no compression, no externalization, no opt-out. Every `TaskState.resultText` value is embedded verbatim in the `.canvas.tsx` file the IDE recompiles. Today, this is tolerable specifically because `STREAM_CAP = 4000` caps each `resultText` value. Any plan that wants to make `ts.resultText` carry full streams must also redesign what goes into `STATE`, or the canvas will balloon to megabytes and stall the IDE. + +### 6. Resume serializes whatever is in `state.tasks[].resultText` + +`writePersistedRunState` (`self_hosting.ts`) does `JSON.stringify(payload, null, 2)` on `{ version: 1, writtenAt, reason, state }`. The state's task list includes `resultText`. `loadResumedRunState` (`run_dag.ts`) refreshes static metadata from the live DAG but leaves `resultText` untouched. So whatever the bounded buffer happens to hold at rank-boundary persistence — typically the tail of a finished task or the running tail of a `RUNNING` task that gets re-queued to `PENDING` — is what a relaunched process inherits. + +### 7. Oracle evidence is bounded by the same number + +`oracle_task.ts` declares `const ORACLE_TAIL_CAP = 4000;` and stamps `tail(outcome.stdout, ORACLE_TAIL_CAP)` / `tail(outcome.stderr, ORACLE_TAIL_CAP)` into the `## Stdout (tail):` / `## Stderr (tail):` sections of `ts.resultText`. 
There is no uncapped capture analog — full stdout/stderr exist only in the local strings inside `execShell` and are dropped at function exit. Unlike `kind: 'task'`, there is **no artifact path that preserves them**: `persistTaskMarkdownFile` writes `ts.resultText`, which is already tail-truncated for oracles. + +### 8. The docs already describe `STREAM_CAP` and `UPSTREAM_SNIPPET_CAP` + +`packages/proof/README.md` (Artifact Output, `dag.budget`, supervisor sections) and `.cursor/skills/proof/SKILL.md` ("Caveats": "Per-task streamed text is capped at `STREAM_CAP = 4000` chars to keep the canvas file modest. Upstream context passed to child tasks is capped at 2000 chars per parent, with section-aware truncation …") tell operators about both caps. They do **not** explain that those caps are reused as the execution-plane source of truth for sidecars, convergence `extraContext`, and resumed prompts. That gap is part of the limitation: callers who read the docs and reach for `--findings-dir` reasonably believe it is a backstop, when in fact it shares the same upstream loss. + +--- + +## The split: execution plane vs display plane + +The whole plan reduces to one rule: + +> A consumer that influences what the runner does next must never read from a buffer that another consumer is bounding for size or UX reasons. 
+ +Concretely, after this work lands, each consumer of per-task output has a single, documented source: + +| Consumer | Source after the project completes | Plane | +| ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | +| `buildUpstreamContext` (parent → child prompts) | A new execution-authoritative per-task transcript (in-memory `string` during the run, mirrored to disk when artifacts are enabled), explicitly trimmed by an **upstream prompt policy** with a visible, counted banner when trimmed. | Execution | +| `extractConvergenceFindings` (`## Blockers` parsing) | Same authoritative transcript. The `--findings-dir` sidecar continues to be written from the same source so external tooling has a stable JSON form, but it is no longer required for parser correctness inside the runner — it is required only when the **runner process** has restarted. | Execution | +| `buildConvergenceContext` (reviewer feedback into ancestors) | Same authoritative transcript, trimmed by the **same** upstream prompt policy as `buildUpstreamContext` (so a convergence iteration is governed by one policy, not two implicit caps). | Execution | +| `--findings-dir` JSON sidecar contents | Same authoritative transcript. Schema field `sectionsTruncated?: boolean` plus per-section length is added so consumers can detect when **policy** trimmed evidence, distinct from "no content". | Execution-mirrored-to-disk | +| `${taskId}.md` artifact | Authoritative transcript, identical bytes (modulo header). The artifact is the canonical on-disk form of the execution-plane truth for the run. 
| Execution-mirrored-to-disk | +| `--state-path` / `--resume-state` payload | Either a pointer into the artifact directory (when artifacts are enabled and the directory is stable across restarts — see Phase 3) or an inline-but-compressed body when artifacts are suppressed. Either way, the relaunched process reconstructs the same authoritative transcript before resuming. | Execution | +| Canvas `STATE.tasks[].resultText` | Display-only: a bounded tail (today's `STREAM_CAP` semantics, but renamed `CANVAS_DISPLAY_CAP`) with a visible `[...truncated N earlier chars...]` banner. The canvas may additionally surface a path/hash pointer to the full transcript so a user can open it from the IDE. | Display | +| Canvas `<pre>` block (today already `maxHeight: 320`)        | Same display string; UI continues to virtualize.                                                                                                                                                                                                                                                                  | Display                                        |
+| Oracle `## Stdout (tail) / Stderr (tail)` in `resultText`    | Bounded by `ORACLE_TAIL_CAP` for display **and** the inline sidecar value, but a separate full-evidence path (`${taskId}.stdout.log` / `${taskId}.stderr.log`) is written under the artifact dir for forensics. Convergence and downstream tasks that need oracle evidence pull from artifacts, not `resultText`. | Mixed (display bounded, full evidence on disk) |
+
+The six execution-plane rows (four "Execution", two "Execution-mirrored-to-disk") all trace back to one place: the authoritative transcript. No more drift.
+
+### Why not "just remove the caps"
+
+A naïve "stream everything into `ts.resultText` and inline it in the canvas" approach breaks four things observed today in the code:
+
+1. **Canvas reload UX**: `renderCanvasSource` writes the whole `RunState` JSON every debounce window (default 200ms). A multi-megabyte `resultText` per task × a 10-task DAG would push the file to tens of megabytes and re-trigger an IDE hot-recompile on every `publishIfDue` (default every 500ms). `debounce` and `stream-publish-ms` were tuned around a 4000-char ceiling.
+2. **Prompt overflow**: `buildUpstreamContext` glues every parent's text into the child's prompt. Removing `UPSTREAM_SNIPPET_CAP` without a model-context-aware policy causes silent SDK rejections at runtime whose error messages do not mention "your DAG outputs grew too large".
+3. **State file bloat**: `writePersistedRunState` writes one JSON file per rank boundary and per convergence iteration via `persistState(...)`. Resume reads the entire file synchronously. Long runs would dominate disk and slow restart.
+4. **Privacy**: the canvas lives under `~/.cursor/projects/<workspace>/canvases/` (per `.cursor/skills/proof/SKILL.md` Step 1 conventions). Casual sharing of a canvas TSX today exposes 4000 chars per task; uncapped, it could trivially exfiltrate secrets emitted by a misbehaving subagent (e.g. an oracle dumping env). The privacy posture is a function of "what's inlined in the canvas", not "what the runner saw".
+
+The plan therefore treats each of these as a first-class layer with its own policy, not a side effect of a single shared buffer.
+
+---
+
+## Constraint matrix (where each cap lives after the project)
+
+### Canvas safety (`canvas_writer.ts`)
+
+| Constraint                   | Today                                                                                           | After                                                                                                                                                                                                                                                                                                                 |
+| ---------------------------- | ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `.canvas.tsx` file size      | Bounded indirectly by `STREAM_CAP` × tasks.                                                     | Bounded directly by `CANVAS_DISPLAY_CAP` per task (default = 4000, same as today) plus an optional pointer field. No path emits a canvas larger than `CANVAS_DISPLAY_CAP × tasks + framing/header bytes`. A regression test asserts the size envelope on a "large output" fixture.                                    |
+| Streaming write churn        | Every assistant text block triggers a `publishIfDue`.                                           | Unchanged. Canvas writes continue to consume the display buffer, which is appended to in the streaming loop. The new execution buffer is appended in the same loop but never triggers a canvas write on its own.                                                                                                      |
+| Truncation banner visibility | `BoundedTextBuffer.render()` prepends `[...truncated N earlier chars...]`.                      | Preserved verbatim. The canvas template will be extended to optionally render a "View full transcript" affordance using a relative path inside the artifact dir (feasibility-gated — see Phase 1). If the IDE canvas runtime cannot fetch, the link degrades to a copy-able path. The plan never assumes fetch works. |
+| Privacy posture              | Canvas inlines up to 4000 chars/task of raw stream — already a leak risk for secrets-in-stdout. | Display cap is unchanged in size; banner unchanged. The new execution transcripts live alongside the existing `${taskId}.md` artifacts and inherit their `.gitignore` story (already covered by `.flatbread/artifacts/` convention). Docs gain a "what gets persisted where" section.                                 |
+
+### Prompt budget (`run_dag.ts`)
+
+| Constraint                                   | Today                                                                                                     | After                                                                                                                                                                                                                                                                                                                                         |
+| -------------------------------------------- | --------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Per-parent excerpt in `buildUpstreamContext` | Hard `UPSTREAM_SNIPPET_CAP=2000` with section-aware-then-slice truncation, silent at the prompt boundary. | Explicit **upstream prompt policy** with three modes: `full`, `summarize` (today's section-aware behavior, but with a counted banner emitted into the prompt itself), and `maxChars: N` (operator-controlled). Default remains conservative (the current 2000-char section-aware path) but is now named, surfaced in logs, and tested.        |
+| Convergence `extraContext`                   | `buildConvergenceContext(convergeOn, iter, convergeTs.resultText)` — bounded tail unconditionally.        | Reads the same authoritative transcript as `extractConvergenceFindings`. Trims via the same policy as `buildUpstreamContext`. The judge's finding #2 is the test case: a reviewer whose `## Blockers` lines appear past byte 4000 must produce ancestor prompts that contain those blockers.                                                  |
+| `dag.framing`                                | Prepended verbatim. Counts against model context but not against any Proof cap.                           | Unchanged. Documented as "part of the prompt budget — author it deliberately."                                                                                                                                                                                                                                                                |
+| Stitched prompt overflow handling            | None. The SDK may reject; the task ends as `ERROR` with whatever message comes back.                      | When `outputPolicy.upstream` is `full` and the policy estimator predicts a stitched prompt larger than `outputPolicy.maxPromptChars` (new), the runner marks the task `BUDGET-EXCEEDED` **before** dispatch with an actionable message naming the offending parent ids and char counts. This reuses the existing `BUDGET-EXCEEDED` exit path. |
+
+### Artifact storage (README, `--full-output-dir`, `--no-artifacts`)
+
+| Constraint                           | Today                                                                                   | After                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+| ------------------------------------ | --------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| Default location                     | `/.flatbread/artifacts/dag--/` (timestamped per run).                    | Unchanged.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+| `--no-artifacts`                     | Suppresses transcripts, `_index.md`, `_dag.json`. Findings sidecars are independent.    | Unchanged for the user-facing flag. Internally, when the runner is in execution-complete mode and artifacts are suppressed, the authoritative transcript is held in memory for the lifetime of the run with an **explicit RAM ceiling** (`--max-in-memory-output-bytes`, default 64 MiB across all tasks). Crossing the ceiling produces `BUDGET-EXCEEDED` on the next task to overflow, not silent loss. This resolves the judge's "policy decision deferred" finding by picking the policy up front. |
+| Supervisor + timestamped directories | Each child runner picks a new timestamp unless the supervisor pins `--full-output-dir`. | Same. New guidance in `README.md` Self-Hosting Mode: when self-hosting is enabled, **pin** `--full-output-dir` so artifact-backed resume reads find the same transcripts the prior process wrote. The supervisor and runner both refuse to start if `--restart-on-runner-change` is set without either a pinned `--full-output-dir` or a non-default `--max-in-memory-output-bytes`.                                                                                                                   |
+| Atomicity                            | Per-task `${taskId}.md` is written via `writeFile` once per terminal state.             | Unchanged for `${taskId}.md`. New per-task `${taskId}.stream.txt` (or `.bin` if we go content-addressed in a later phase) is written incrementally during the run via append-only writes from the stream loop, then closed in `runTask`'s `finally`. Failure to write the stream file does not abort the task; it is logged and the task is marked with a `streamPersisted: false` flag in state.                                                                                                      |
+
+### Privacy
+
+| Constraint                            | Today                                                            | After                                                                                                                                                                                                                                                                                                                                                                                             |
+| ------------------------------------- | ---------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Canvas as accidentally-shared surface | Up to 4000 chars/task of raw stream content.                     | Same display ceiling; the only new privacy surface is the pointer (relative path) to the transcript file. The pointer's path lives under `.flatbread/artifacts/`, which is already covered by Flatbread's convention to gitignore the artifacts root. The plan does **not** add new redaction hooks — that is deferred work, called out under Risks.                                              |
+| Oracle stdout/stderr                  | Already tail-bounded at 4000 chars in `resultText`.              | Display unchanged. Full stdout/stderr written to `${taskId}.stdout.log` and `${taskId}.stderr.log` under the artifact dir **only when artifacts are enabled**. With `--no-artifacts`, oracle full evidence stays in memory subject to the same `--max-in-memory-output-bytes` ceiling.                                                                                                            |
+| Persisted state JSON                  | Embeds `resultText` (bounded today; would be huge if unbounded). | Embeds the display string (small) plus a `transcript` discriminated union: either `{ kind: 'artifact'; path: string }` or `{ kind: 'inline'; gzippedBase64: string }`. The `inline` form is reserved for `--no-artifacts` runs and carries the new `streamCompression: 'gzip'` versioned field. Schema version bumps to `2` with a documented migration path from `1` (legacy `resultText`-only). |
+
+### Resume / supervisor (`self_hosting.ts`, `loadResumedRunState`)
+
+| Constraint                      | Today                                                                                                               | After                                                                                                                                                                                                                                                                                                                                      |
+| ------------------------------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `version: 1` schema             | Hard rejects anything else.                                                                                         | Adds `version: 2`. Reader accepts `1` and `2`; on `1`, the `resultText` value is promoted to both the display string and the inline transcript (because that's all the legacy run captured). Writer always emits `2`. The version bump is the migration.                                                                                   |
+| `RUNNING → PENDING` on resume   | Re-queues the task; existing behavior.                                                                              | Unchanged. The relaunched task gets a fresh stream and a fresh transcript file (the prior partial transcript is preserved at `${taskId}.iter${N}.partial.stream.txt` for forensics).                                                                                                                                                       |
+| Prompt parity across restart    | A relaunched run reconstructs `extraContext` and `buildUpstreamContext` from the bounded `resultText` it inherited. | A relaunched run reconstructs the **same stitched prompt string** that the original process would have produced for that rank, given the same DAG and the same authoritative transcripts. This is **input parity**, not output parity — LLM outputs are not deterministic, and the plan does not claim they are. Phase 3 test guards this. |
+| `RUNNER_RUNTIME_FILES` snapshot | Triggers `EXIT_RUNNER_RESTART` (75) when `run_dag.ts` / `canvas_writer.ts` / etc. change.                           | Unchanged set. The new module(s) added in Phase 1 join `RUNNER_RUNTIME_FILES`.                                                                                                                                                                                                                                                             |
+
+### Convergence semantics
+
+| Constraint                            | Today                                                                             | After                                                                                                                                                                                                                                           |
+| ------------------------------------- | --------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `extractConvergenceFindings` source   | `readFindingsSidecarAsText(...)` or `convergeTs.resultText`.                      | Authoritative transcript, falling back to sidecar (across-process boundary, e.g. resume), falling back to `resultText` (legacy resume). The fallback chain is documented; tests cover each leg.                                                 |
+| Reviewer payload size                 | Implicitly bounded by `STREAM_CAP`. Blockers past byte 4000 are invisible.        | Bounded only by `outputPolicy.maxPromptChars` for prompt-side stitching; bounded only by `--max-in-memory-output-bytes` or disk for storage. A reviewer can emit `## Blockers` near the end of a long evidence dump and the loop will see them. |
+| `## Blockers` placeholder semantics   | `extractConvergenceFindings.filterMeaningful` already drops `(none)`, `n/a`, etc. | Unchanged.                                                                                                                                                                                                                                      |
+| Mutual exclusion with `--converge-on` | Today's "no `DAG.loops` AND `--converge-on`" guard remains.                       | Unchanged.                                                                                                                                                                                                                                      |
+
+### Oracle evidence
+
+| Constraint                                 | Today                                    | After                                                                                                                                                                                                                                                                                                                                                   |
+| ------------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `ORACLE_TAIL_CAP = 4000`                   | Applied to stdout and stderr separately. | Renamed `ORACLE_DISPLAY_TAIL_CAP` (semantically still 4000 by default) for clarity. The display-bound tail continues to land in `resultText` so the canvas and sidecar render identically to today.                                                                                                                                                     |
+| Full oracle stdout/stderr persistence      | None.                                    | When artifacts are enabled, `${taskId}.stdout.log` and `${taskId}.stderr.log` capture full output. Sidecar gains optional `stdoutPath` / `stderrPath` fields when present so external tooling can resolve to the full evidence. `formatOracleResult` references these paths in a new `## Evidence` footer.                                              |
+| `extraContext` references to oracle output | Bounded tail.                            | If a downstream task lists an oracle in `depends_on`, `buildUpstreamContext` follows the same policy as for `kind: 'task'`. Long stderr dumps that today are silently dropped will now flow through to the child in full (`upstream: 'full'`), be trimmed with a counted banner (`upstream: 'summarize'`), or trigger `BUDGET-EXCEEDED` before dispatch when the `full` stitched prompt exceeds `maxPromptChars`. In every case, no silent drop. |
+
+---
+
+## Phased plan with acceptance criteria
+
+Each phase is independently shippable. Phases 1 and 2 together carry the "remove the silent truncation" promise; the rest harden it.
+
+### Phase 0 — Contracts, inventory, and feasibility spikes (no behavior change)
+
+Tracking issue: [#200](https://github.com/FlatbreadLabs/flatbread/issues/200).
+
+**Deliverables**
+
+- **Consumer inventory document** committed alongside this proposal (or inlined as a stable section of the README's "Internal layout" appendix) that explicitly names every read site of `TaskState.resultText` in `packages/proof/src/**`:
+  - `canvas_writer.ts` → canvas `STATE` literal.
+  - `run_dag.ts` `buildUpstreamContext` → child prompts.
+  - `run_dag.ts` `runConvergenceLoop` → `extractConvergenceFindings` source, `buildConvergenceContext` source.
+  - `findings_sidecar.ts` `writeFindingsSidecar` → sidecar `sections`.
+  - `self_hosting.ts` `writePersistedRunState` (indirectly via `state.tasks[].resultText`) → resume payload.
+  - `persistTaskMarkdownFile` is listed only to disambiguate: for `kind: 'task'` it reads `fullStreamChunks`, not `resultText`; for oracle tasks it still writes the tail-bounded `ts.resultText`.
+- **Decision document** picking the canonical execution-authoritative store shape: in-memory `Map<string, string>` (task id → transcript text) plus per-task append-only stream files under the artifact dir. (The plan does not pursue a content-addressed blob store in Phase 1 — that is called out in "Risks" as deferred work; per-task markdown remains the human-friendly form.)
+- **Feasibility spike for canvas pointer affordance.** Open question from the prior draft: can a `.canvas.tsx` invoke runtime fetch of a workspace-relative file? A short spike against `cursor/canvas` answers yes/no. If no, the canvas affordance degrades to a click-to-copy path string. The Phase 1 design does not block on the result — the pointer is always written; only the UI shape changes.
+- **Policy default decision.** `--no-artifacts` + execution-complete mode is resolved here: in-memory ceiling with `BUDGET-EXCEEDED` on overflow (see constraint matrix). Documented in this proposal already; Phase 0 lifts it into the package docs.
+
+**Acceptance criteria**
+
+- A `docs/proposals/proof-output-retention-plan.md` (this file) and a one-page consumer inventory appendix land. No code under `packages/proof/src/**` changes in Phase 0.
+- `pnpm verify` continues to pass (it should, because nothing changed).
+- A short feasibility note answers the canvas fetch question with either "feasible — example PR" or "not feasible — fall back to copy-path". The note is referenced from Phase 1's design.
+
+### Phase 1 — Execution-authoritative transcript and rewired consumers
+
+This is the load-bearing phase.
+
+**Deliverables**
+
+- New module `packages/proof/src/task_transcript.ts` (or equivalent — naming is implementation-detail) that owns the per-task authoritative store. Responsibilities:
+  - In-memory `string` per task id, appended chunk-by-chunk from the stream loop.
+  - Append-only mirror to `${fullOutputAbsoluteDir}/${taskId}.stream.txt` when artifacts are enabled (best-effort; errors logged but never abort the task).
+  - A `read(taskId)` accessor that returns the in-memory string when present, falls back to reading the stream file (used by resume), and finally falls back to the legacy `ts.resultText` for state that was persisted before Phase 1 shipped.
+- `runTask` updated so the stream loop pushes each `block.text` into both the existing `BoundedTextBuffer` (renamed semantically to "display buffer", same `CANVAS_DISPLAY_CAP = 4000`) **and** the new transcript store. `fullStreamChunks` is removed (the transcript store subsumes it). `persistTaskMarkdownFile` reads from the transcript store.
+- `buildUpstreamContext` rewritten to read from the transcript store for each `depends_on` parent. The result passes through a new `applyUpstreamPolicy(text, policy)` that today defaults to the legacy section-aware-then-slice behavior at 2000 chars but emits a prompt-visible counted banner (`[...upstream excerpt: kept last 2000 of N chars, sections dropped: X, Y...]`) instead of a bare `…`.
+- `runConvergenceLoop` updated so **both** `extractConvergenceFindings` and `buildConvergenceContext` read from the transcript store. The sidecar continues to be written, and continues to be used as a cross-process fallback (e.g. resumed runs), but is no longer the primary in-process source.
+- Canvas `STATE.tasks[].resultText` continues to be the bounded display string. A new optional `STATE.tasks[].transcriptPath` field carries the relative path to the stream file when one was written. The canvas template renders a "View full transcript" affordance per the Phase 0 feasibility result.
+- `findings_sidecar.ts` `writeFindingsSidecar` reads from the transcript store and emits `sections` keyed identically to today. A new sibling field `sectionsRaw?: Record<string, string>` is reserved for a future phase if downstream tooling asks for it; Phase 1 ships only `sections` to keep the on-disk schema stable.
+
+**Acceptance criteria**
+
+Each of the following is a named test that must pass; the test names below are mnemonic, not literal file paths.
+
+1. **Late-region prompt content (no `--findings-dir`).** Fixture: a synthetic, deterministic 12 000-char stream for parent task `a` containing the marker string `MARKER_LATE` at byte ~10 000. Child task `b` depends on `a`. After the runner executes, the stitched prompt for `b` (captured via a test-only hook on `applyUpstreamPolicy`) contains `MARKER_LATE` **when** `outputPolicy.upstream` is `full`, and **does not** when `summarize` (with a visible counted banner in either case when trimmed).
+
+2. **Convergence detects late blockers (no `--findings-dir`).** Fixture: reviewer task `r` emits a long preamble followed by `## Blockers\n- still broken` past byte 5000. The legacy runner would miss this because `STREAM_CAP=4000` drops the prefix and `## Blockers` lands at the cap boundary. The new runner detects it and schedules a re-run, regardless of `--findings-dir`.
+
+3. **Convergence `extraContext` carries late reviewer content (no `--findings-dir`).** Fixture as in (2). When the ancestor `a` is re-executed, the captured `extraContext` substring of `a`'s stitched prompt contains the `## Blockers` line. This is the judge's explicit recommendation — pair the parsing test with a stitched-prompt assertion.
+
+4. **Same with `--findings-dir` set.** Both (2) and (3) pass when `--findings-dir` is also set, proving that the sidecar pathway is consistent with the in-memory authoritative source (no drift between in-process and resumed extraction).
+
+5. **Artifact `${taskId}.md` matches the stream file.** For a non-trivial fixture, `${taskId}.md` bytes equal the transcript store bytes (modulo the meta header). This catches a regression where `fullStreamChunks` was retired but `persistTaskMarkdownFile` was missed.
+
+6. **Canvas size envelope.** For a fixture of 5 tasks × 12 000-char outputs, the generated `.canvas.tsx` file size remains below `5 * CANVAS_DISPLAY_CAP + 64 KiB` (the constant captures header, layout, types, and a generous slack). Today's behavior is preserved at the display layer.
+
+7. **Visible truncation banner in the prompt.** When `applyUpstreamPolicy` trims, the **stitched prompt string** (not just the upstream block) contains a banner with a real character count. A grep-for-`…` test fails the legacy silent-ellipsis path.
+
+8. **Existing bounded-loop suite.** `pnpm -F @flatbread/proof test` continues to pass. The plan does not invalidate `BoundedTextBuffer` tests — that helper still exists for the display buffer.
+
+### Phase 2 — Upstream prompt policy with honest budgets
+
+Tracking issue: [#201](https://github.com/FlatbreadLabs/flatbread/issues/201).
+
+**Deliverables**
+
+- `DAG.outputPolicy` (top-level, optional) added to `packages/proof/src/dag.ts` with `parseDAG` validation:
+  ```ts
+  outputPolicy?: {
+    upstream?: 'full' | 'summarize' | { maxChars: number };
+    maxPromptChars?: number;
+  }
+  ```
+  Defaults: `upstream: 'summarize'` with the existing 2000-char section-aware policy (preserves today's behavior); `maxPromptChars: 200_000` (well under typical model context windows but generous enough that benign DAGs do not trip).
+- `runTask` preflight: before `agent.send(stitched)`, compute the stitched prompt length and compare against `maxPromptChars`. On overflow, the task is marked `BUDGET-EXCEEDED` with a message listing the offending parents and their contribution.
+- New CLI knobs (kept narrow — these are mostly DAG-driven):
+  - `--output-policy-upstream <full|summarize>`: overrides `outputPolicy.upstream` for ad-hoc runs.
+  - `--max-prompt-chars <n>`: overrides `outputPolicy.maxPromptChars`.
+- `BUDGET-EXCEEDED` exit code path (`EXIT_BUDGET_EXCEEDED = 4`) is reused so wrapper scripts already keyed on it continue to work.
+
+**Acceptance criteria**
+
+1. **Default is byte-for-byte the legacy behavior.** A DAG without `outputPolicy` produces identical stitched prompts to today (excluding the new visible-banner change from Phase 1, which is in effect from Phase 1 onward).
+2. **`upstream: 'full'` passes the late marker through** (covered already by Phase 1 test (1) when the fixture sets `upstream: 'full'`).
+3. **`maxPromptChars` overflow surfaces `BUDGET-EXCEEDED` with an actionable message.** Fixture: two parents each contributing 60 000 chars with `upstream: 'full'` and `maxPromptChars: 80_000`. The child task ends `BUDGET-EXCEEDED` and the message names both parent ids and the total char count.
+4. **CLI/DAG precedence test.** `--max-prompt-chars` overrides `outputPolicy.maxPromptChars`; `outputPolicy` in the DAG overrides defaults; `--models-file`-style precedence is documented and tested.
+
+### Phase 3 — Resume, supervisor, and disk ergonomics
+
+Tracking issue: [#202](https://github.com/FlatbreadLabs/flatbread/issues/202).
+
+**Deliverables**
+
+- `PersistedRunState` schema bumped to `version: 2` in `self_hosting.ts`:
+  ```ts
+  state.tasks[].transcript: { kind: 'artifact'; path: string }
+                          | { kind: 'inline'; encoding: 'gzip-base64'; data: string }
+                          | { kind: 'legacy'; resultText: string };
+  ```
+  - `kind: 'artifact'` is used when `--full-output-dir` is set and the stream file exists.
+  - `kind: 'inline'` is used when `--no-artifacts` is set (or the stream file is missing); the body is gzipped to bound state size.
+  - `kind: 'legacy'` is what `version: 1` readers see and what writers emit for the migration grace period; documented as removable in a future release.
+- `loadResumedRunState` reconstructs the in-memory transcript store from `state.tasks[].transcript` before any rank executes, so subsequent ranks see the same `applyUpstreamPolicy` inputs the prior process would have produced.
+- Supervisor (`run_dag_supervisor.ts`, not modified in source by this proposal but referenced) gains an early validation: if `--restart-on-runner-change` is forwarded and the runner is configured for `--no-artifacts` without `--max-in-memory-output-bytes` overrides, the supervisor logs a clear warning that resumed runs will pay the gzip cost for every transcript. (This is documentation + a log line, not a refusal.)
+- README "Self-Hosting Mode" gains a "pin `--full-output-dir` for resumable runs" paragraph.
+
+**Acceptance criteria**
+
+1. **Input parity across restart.** Fixture: a 4-task DAG that finishes the first two ranks, persists state, exits via `EXIT_RUNNER_RESTART`, and resumes. The stitched prompt strings handed to the SDK in rank 3 are **bytewise identical** between (a) a non-restart full run and (b) the resumed second process, when the underlying assistant streams from rank 1–2 are deterministically replayed via fixtures. Output parity is **not** asserted (LLM determinism is out of scope; the test uses a fake `Agent.send` that replays canned chunks).
+2. **State file size stays bounded.** For the same fixture above, `state-path` JSON size after rank 2 stays below `O(tasks × CANVAS_DISPLAY_CAP)` when artifacts are enabled (because transcripts are pointers), and below `O(tasks × gzipped_transcript_size)` when artifacts are disabled. Both ceilings are asserted as soft thresholds in the test.
+3. **`version: 1` → `version: 2` migration.** A hand-crafted `version: 1` state file (i.e. legacy `resultText`-only) is loaded successfully and its `resultText` populates both the display buffer and `transcript: { kind: 'legacy', resultText: ... }`. The runner continues from there without error. Documented as a one-release grace period.
+4. **Supervisor pin advice.** Running the supervisor with `--restart-on-runner-change` and no pinned `--full-output-dir` emits the warning line; the test greps stdout/stderr for it.
+
+### Phase 4 — Oracle evidence alignment
+
+Tracking issue: [#203](https://github.com/FlatbreadLabs/flatbread/issues/203).
+
+**Deliverables**
+
+- `oracle_task.ts` writes `${taskId}.stdout.log` and `${taskId}.stderr.log` under the artifact directory when artifacts are enabled. These are written atomically at task completion (oracle commands are short-lived; we can capture into in-memory strings, as today, and flush once).
+- `formatOracleResult` adds an optional `## Evidence` footer listing the absolute paths when present; the canvas template renders them as "Open stdout / Open stderr" pointers analogous to the `kind: 'task'` "View full transcript" affordance.
+- The new `--max-in-memory-output-bytes` ceiling (introduced in Phase 1's `--no-artifacts` handling) applies uniformly to oracle full evidence in `--no-artifacts` mode. Overflow produces `BUDGET-EXCEEDED` on the oracle task with an actionable message; this is consistent with how Phase 2 treats prompt overflow.
+- `buildUpstreamContext` is unchanged for oracles in shape: if a child task depends on an oracle, the upstream excerpt follows `outputPolicy.upstream`. The `## Stdout (tail) / ## Stderr (tail)` headings in `resultText` survive untouched.
+
+**Acceptance criteria**
+
+1. **Oracle full evidence persists.** Fixture: an oracle command that emits 10 000 lines to stdout. After completion, `${taskId}.stdout.log` contains all 10 000 lines and `ts.resultText`'s `## Stdout (tail):` body matches today's tail-capped string.
+2. **Oracle in `--no-artifacts` mode**. With `--no-artifacts` and a 10 000-line oracle, the in-memory store holds the full output; if the run's cumulative held bytes exceed `--max-in-memory-output-bytes`, the oracle is marked `BUDGET-EXCEEDED` (a deterministic test fixture sets the ceiling low to trip this on a small payload).
+3. **Downstream task depending on oracle sees policy-bounded upstream.** With `upstream: 'full'` and a long oracle stderr, the child task's prompt contains stderr lines past the legacy 4000-char tail. With `upstream: 'summarize'`, the prompt sees the existing tail behavior and a visible banner.
+
+### Phase 5 — Documentation and skill refresh
+
+Tracking issue: [#204](https://github.com/FlatbreadLabs/flatbread/issues/204).
+
+**Deliverables**
+
+- `packages/proof/README.md` gains a new section "Where output lives" with a copy of the consumer-source table from this document, adapted for operator audience. The "Caveats" mentions of `STREAM_CAP = 4000` and "upstream context capped at 2000 chars" are rewritten to point at `outputPolicy` and `CANVAS_DISPLAY_CAP` / `ORACLE_DISPLAY_TAIL_CAP`.
+- `.cursor/skills/proof/SKILL.md` Caveats section is rewritten in the same way. The "DAG quality bar" section is unchanged.
+- The CLI options table in `SKILL.md` and `README.md` gains the new flags from Phase 2 (`--output-policy-upstream`, `--max-prompt-chars`) and Phase 1's `--max-in-memory-output-bytes`. The existing flag rows are unchanged.
+- A short "What changed" migration paragraph in the README's release notes for the version that ships Phase 1/2.
+
+**Acceptance criteria**
+
+- A grep across `README.md` and `.cursor/skills/proof/SKILL.md` for the literal strings `STREAM_CAP`, `4000`, and `2000` finds them only where they document the **display** caps `CANVAS_DISPLAY_CAP` and `ORACLE_DISPLAY_TAIL_CAP`, not as execution-plane behavior.
+- `pnpm verify` passes.
+
+---
+
+## Test strategy
+
+The proof package's existing test infrastructure is the AVA bounded-loop suite plus the ava+vitest matrix surfaced by root `pnpm test` (per `AGENTS.md`). The plan adds tests at three layers.
+
+### Unit-level (vitest under `packages/proof/__tests__/` or equivalent existing location)
+
+- **`task_transcript`** (new module): append behavior, read fallback chain (in-memory → stream file → legacy `resultText`), write-failure logging without abort.
+- **`applyUpstreamPolicy`**: full/summarize/maxChars modes; banner contents; section-aware drop order preserved when `summarize` is selected; counted banner replaces the bare `'…'`.
+- **`extractConvergenceFindings`**: regression coverage for placeholder detection (existing) and a new case with blockers past the legacy 4000-char boundary.
+- **`writeFindingsSidecar`** + **`readFindingsSidecarAsText`**: round-trip, including the case where the sidecar is written from the new authoritative transcript and consumed across a simulated process boundary.
+- **Schema migration**: `version: 1` → `version: 2` `PersistedRunState` reader. The migration is a pure function and easy to test.
+
+### Integration / golden-fixture (AVA bounded-loop suite, expanded)
+
+These tests do not call the live Cursor SDK. They drive `runTask` through a fake `Agent.create` whose `send` returns a `RunnerTaskRun` with a scripted async iterator. Fixtures are checked-in JSON files describing scripted chunks per task plus expected stitched-prompt substrings.
+
+- **Late marker prompt content** (Phase 1, criteria 1, 7).
+- **Convergence detects late blockers** (Phase 1, criterion 2).
+- **Convergence `extraContext` carries late blockers** (Phase 1, criterion 3) — explicitly asserts the substring of the captured stitched prompt for the re-executed ancestor task. This addresses the adversarial review's "tests should cover `buildConvergenceContext`, with and without `--findings-dir`" recommendation.
+- **`--findings-dir` ↔ in-memory parity** (Phase 1, criterion 4).
+- **Artifact / transcript byte equality** (Phase 1, criterion 5).
+- **Canvas size envelope** (Phase 1, criterion 6) — reads the generated `.canvas.tsx` on disk after a fixture run and asserts the file size and the presence of the new `transcriptPath` field per task.
+- **`maxPromptChars` overflow** (Phase 2, criterion 3) — uses the same scripted-chunks harness with deliberately large fixtures.
+- **Resume input parity** (Phase 3, criterion 1) — runs the fake-agent harness through `EXIT_RUNNER_RESTART`, persists state, re-instantiates a second runner with `--resume-state`, and asserts identical stitched prompts in rank 3 between the resumed second process and a non-restart full run with the same fixtures.
+- **Oracle full evidence persistence** (Phase 4, criterion 1) — `execShell` is exercised against `node -e 'for (let i=0;i<10000;i++) console.log(i)'` so the test does not depend on a system command beyond Node itself.
+
+### Privacy and operator-facing smoke
+
+- Canvas snapshot test: for a fixture that streams a synthetic "secret-shaped" token at byte 0, byte 3500, byte 4500, and byte 10 000, assert that the `.canvas.tsx` `STATE` contains only the occurrences that fall inside the trailing `CANVAS_DISPLAY_CAP` window (i.e. the display tail — for a stream that ends shortly after byte 10 000, only the last occurrence), with the explicit `[...truncated N earlier chars...]` banner. The transcript file contains all four. Future redaction work plugs in here.
+- A `pnpm verify` invocation continues to pass at every phase boundary. Failing `pnpm verify` blocks the phase.
+
+### CI commands (from `AGENTS.md`)
+
+- `pnpm -F @flatbread/proof test` — focused bounded-loop suite; should remain fast even with the added fixtures (no live SDK).
+- `pnpm verify` — full lint + typecheck + build + test before any phase is considered shipped.
+
+---
+
+## Risks and mitigations
+
+| Risk                                                                                              | Mitigation                                                                                                                                                                                                                                                                                                     |
+| ------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Canvas pointer affordance is not feasible inside `cursor/canvas` runtime.                         | Phase 0 spike. Degrade to a copy-able path string with no fetch. The plan's execution-plane fix does not depend on the canvas displaying full output; it depends on the runner reading from a separate store.                                                                                                  |
+| `outputPolicy: 'full'` users blow past the model context window and see SDK rejections.           | `maxPromptChars` preflight produces `BUDGET-EXCEEDED` with an actionable message before dispatch. Default policy remains `summarize`, so this only affects users who opt in.                                                                                                                                   |
+| Append-only stream files multiply small writes and stress slow filesystems.                       | Writes are best-effort and behind the existing artifact dir gate; failures are logged and the task is flagged but not aborted. The display path continues to work without the stream file. We can add a "write-every-N-chunks" coalesce later if profiling warrants.                                           |
+| State file growth even with pointers (the canvas state itself plus task metadata can still grow). | `--state-path` writes are already once per rank boundary plus per convergence iteration, not per chunk. The new `transcript` field is a tiny discriminated union when artifacts are enabled. Phase 3 acceptance test asserts a soft size ceiling.                                                              |
+| Privacy regression via persisted `${taskId}.stream.txt` files capturing secrets.                  | Files live under the same `.flatbread/artifacts/` directory as today's `${taskId}.md`; the same `.gitignore` and review hygiene apply. A redaction hook (e.g. opt-in regex-based stripping in the stream loop) is **out of scope** for this proposal and called out as future work.                            |
+| Resumed runs find a stale `${taskId}.stream.txt` written by a prior runner with different source. | The supervisor already restarts at rank boundaries — partial in-flight tasks are re-queued `PENDING` and get a new stream file (`${taskId}.iter${N}.partial.stream.txt` is preserved for forensics, new run writes `${taskId}.iter${N+1}.stream.txt`). The transcript store reads honor the iteration counter. |
+| Schema version churn breaks downstream tooling that consumed `--findings-dir` sidecars by shape.  | The sidecar `sections` shape is preserved. New fields (`stdoutPath`, `stderrPath`, `sectionsRaw`) are optional. Existing schemas keep working.                                                                                                                                                                 |
+| `--restart-on-runner-change` triggers mid-Phase-1 development as engineers edit the source.       | The new module(s) are added to `RUNNER_RUNTIME_FILES` only after their first stable commit. During development, contributors use ad-hoc CLI runs (without the supervisor) per `AGENTS.md`'s commands.                                                                                                          |
+| Bounded-loop test suite slows down with the added fixtures.                                       | Fixtures use scripted-chunk fake agents (no LLM calls). Each fixture is at most a few tens of milliseconds. The aggregate budget for the proof test suite is monitored in CI; this proposal's tests target +20 cases at well under 1 s each.                                                                   |
+
+---
+
+## Backout / partial-ship considerations
+
+Each phase is independently revertable:
+
+- Phase 1 is the most invasive. If the new transcript module is rolled back, `BoundedTextBuffer` and `fullStreamChunks` are restored exactly as today and the consumer rewires unwind. The schema bump and CLI flags from later phases do not depend on Phase 1 source files existing.
+- Phase 2 adds CLI flags and a parse step. Reverting them returns the defaults that match today's behavior.
+- Phase 3 schema bump must be backed out alongside any field additions to `PersistedRunState`; the `version: 1` reader is kept long enough that an older runner can still read newer files via the legacy fallback (though the converse is not guaranteed and is documented).
+- Phases 4 and 5 are nearly pure additions.
+
+---
+
+## Open questions deferred (and why)
+
+1. **Redaction hooks in the stream loop.** Important enough to call out, but too large to bundle into this proposal's scope. Tracked as a follow-up that consumes the same `task_transcript` write boundary.
+2. **Content-addressed blob storage.** Per-task markdown is human-friendly and gets reused by `_index.md`. A single content-addressed store (e.g. `<artifact-dir>/blobs/<sha256>.txt` plus a manifest) is a clean Phase 6 if disk duplication becomes painful, but premature today.
+3. **Lazy canvas fetch vs path-only pointers.** Phase 0 spike decides. Either answer is consistent with the rest of the plan.
+4. **Per-task summaries generated by a cheap model.** Some operators may want the canvas to show a 200-char LLM summary instead of a raw tail. That is purely a display-plane question and orthogonal to the execution-fidelity fix. Listed for future work.
+
+---
+
+## References (code, with line-ish landmarks)
+
+- `packages/proof/src/run_dag.ts`: `BoundedTextBuffer`, `STREAM_CAP`, `UPSTREAM_SNIPPET_CAP`, `buildUpstreamContext`, `truncateUpstreamSnippet`, `runTask` (display buffer + `fullStreamChunks` + `persistTaskMarkdownFile`), `runConvergenceLoop` (sidecar fallback + `buildConvergenceContext` call site), `persistState`, `loadResumedRunState`.
+- `packages/proof/src/canvas_writer.ts`: `renderCanvasSource`, `STATE` literal layout, the `<pre>` rendering for streaming output.
+- `packages/proof/src/findings_sidecar.ts`: `writeFindingsSidecar` (reads `ts.resultText`), `readFindingsSidecarAsText`, `parseSections`, `FindingsSidecar`.
+- `packages/proof/src/converge_loop.ts`: `extractConvergenceFindings`, `buildConvergenceContext`, `filterMeaningful`, `PLACEHOLDER_WORDS`.
+- `packages/proof/src/oracle_task.ts`: `ORACLE_TAIL_CAP`, `tail`, `formatOracleResult`, `execShell`.
+- `packages/proof/src/self_hosting.ts`: `PersistedRunState` (`version: 1`), `RUNNER_RUNTIME_FILES`, `EXIT_RUNNER_RESTART`.
+- `packages/proof/README.md`: artifact defaults, supervisor pinning notes, the existing `dag.budget` story.
+- `.cursor/skills/proof/SKILL.md`: operator caveats explicitly naming the 4000/2000 caps that this plan reframes.
diff --git a/docs/proposals/proof-output-retention-review.md b/docs/proposals/proof-output-retention-review.md
new file mode 100644
index 00000000..1f928bab
--- /dev/null
+++ b/docs/proposals/proof-output-retention-review.md
@@ -0,0 +1,131 @@
+# Adversarial review: Proof output retention — Phase 1 after latest fixes
+
+Author: Opus 4.7 (replaces the prior review draft).
+Scope: read-only audit of the currently uncommitted diff in `packages/proof/src/**`, `packages/proof/README.md`, and `.cursor/skills/proof/SKILL.md`, plus the new modules `task_transcript.ts` and `upstream_policy.ts` and the new test file `packages/proof/src/__tests__/output-retention-phase1.test.ts`, measured against [`docs/proposals/proof-output-retention-plan.md`](./proof-output-retention-plan.md) and [`docs/proposals/proof-output-retention-judge.md`](./proof-output-retention-judge.md).
+
+The latest fixes close the prior review's three high-severity items: `TaskTranscriptStore.get` now reads `${taskId}.stream.txt` from disk when the in-memory map is empty, `main()` registers existing mirror paths from `loadResumedRunState` for any task whose persisted `transcriptPath` is set, `runConvergenceLoop` threads `includeSidecar: false` into the `buildConvergenceContext` source, and the README / SKILL.md docs are now scoped to `kind: 'task'` with an explicit "resumed runs can reconstruct that transcript when the same `--full-output-dir` is reused" clause. This pass focuses on what those fixes still leave open and on residual structural issues.
+
+Follow-up issues:
+
+- [#200 — Phase 0/1 follow-up hardening](https://github.com/FlatbreadLabs/flatbread/issues/200) covers the residual medium/low findings in this review.
+- [#201 — Phase 2: upstream prompt policy and budget preflight](https://github.com/FlatbreadLabs/flatbread/issues/201) covers prompt budgets and the planned `{ maxChars }` policy shape.
+- [#202 — Phase 3: resume, supervisor, and disk ergonomics](https://github.com/FlatbreadLabs/flatbread/issues/202) covers versioned resume state and supervisor restart ergonomics.
+- [#203 — Phase 4: oracle evidence alignment](https://github.com/FlatbreadLabs/flatbread/issues/203) covers full oracle stdout/stderr evidence.
+- [#204 — Phase 5: documentation and skill refresh](https://github.com/FlatbreadLabs/flatbread/issues/204) covers the final operator-facing docs pass.
+
+---
+
+## Blockers
+
+None.
+
+Phase 1's in-process happy path is correct; the resumed-process happy path is now correct when `--full-output-dir` is pinned across restarts; the disk fallback in `TaskTranscriptStore.get` is wired into both `buildUpstreamContext` and `resolveConvergenceReviewerSource`; `BUDGET-EXCEEDED` propagation skips downstream children. Nothing below prevents merging Phase 1.
+
+---
+
+## High-severity findings
+
+None.
+
+The three high-severity items from the prior review are all addressed in source:
+
+1. **Disk fallback for resumed runs.** `task_transcript.ts:116-128` (`get`) reads `${taskId}.stream.txt` synchronously when the in-memory entry is missing/empty and a registered path exists, then caches the result. `run_dag.ts:678-688` wires this up by calling `transcriptStore.registerExistingMirror(...)` for every task whose persisted `transcriptPath` is non-empty before any rank runs. The fallback chain is the one the plan promised: in-memory → stream file → legacy `resultText`.
+2. **`buildConvergenceContext` no longer reads the sidecar.** `run_dag.ts:1828-1834` constructs the `buildConvergenceContext` source via `resolveConvergenceReviewerSource({ ..., includeSidecar: false })`. Only the findings-extraction call sites (`run_dag.ts:1777-1783`, `run_dag.ts:1883-1889`) pass `includeSidecar: true`, so the lossy sidecar shape (heading-only, body-trimmed) can no longer enter the stitched prompt. The judge's medium #2 fix landed.
+3. **Docs no longer overstate.** `packages/proof/README.md:155-160` scopes the execution-authoritative claim to `kind: "task"` and explicitly says "Resumed runs can reconstruct that transcript when the same `--full-output-dir` is reused and `transcriptPath` points at `${task-id}.stream.txt`; otherwise legacy bounded `resultText` remains the fallback." `.cursor/skills/proof/SKILL.md:260` carries the same scoping. The earlier unconditional claim is gone.
+
+---
+
+## Medium-severity findings
+
+1. **`TaskTranscriptStore.get` cannot distinguish "definitively empty" from "not yet read"; every empty-transcript access re-reads from disk.** `task_transcript.ts:116-128`: the early-return guard is `if (v !== undefined && v !== '') return v;`. After the disk fallback returns `''` (e.g., a task with no streamed text, or a freshly-truncated mirror file), the cached `text.set(taskId, '')` does not suppress the disk read on the next call — `v === ''` still falls through to the `readFileSync` branch. For a `kind: 'task'` invocation that legitimately emits no assistant text, every downstream `buildUpstreamContext` → `transcripts.getJoined(depId)` call performs a synchronous filesystem read. Functionally correct (returns `''` on every call), but it is the kind of hot-path footgun that the plan's "execution plane vs display plane" framing should have eliminated, not introduced. Add a `triedDisk: Set<string>` (or store `null` vs `''` semantically) so a definitively-empty result is cached.
+
+2. **`registerExistingMirror` does not verify the file exists; the disk fallback silently degrades when the supervisor uses a fresh timestamped directory.** `task_transcript.ts:65-71` just stores the absolute path; it does not `stat` the file. `run_dag.ts:678-688` calls `registerExistingMirror(taskId, fullOutputAbsoluteDir, taskState.transcriptPath)`, where `fullOutputAbsoluteDir` is the **current process's** artifact directory. When `proof-supervisor` relaunches without a pinned `--full-output-dir`, `defaultArtifactsDir(...)` (`run_dag.ts:450-459`) picks a new timestamp, so the registered path points into a directory that does not contain the prior process's stream files. `get`'s `try { readFileSync(...) } catch { return v; }` then silently falls back to bounded `resultText` for every consumer (upstream, convergence, sidecar). The README documents the pinning requirement (`packages/proof/README.md:204`), and `.cursor/skills/proof/SKILL.md:254` repeats it, but the runner neither warns at startup ("registered transcript path missing on disk") nor surfaces the degradation on the canvas. A single `console.warn` in `main()` (after the `registerExistingMirror` loop, comparing how many `transcriptPath`s pointed to extant files) and a one-line `runMessage` augmentation when any registered path is missing would convert a silent fidelity regression into a noisy diagnostic.
+
+3. **The new `task transcript store reads existing mirror files after resume` test exercises the store in isolation, not the runner consumers that depend on it.** `output-retention-phase1.test.ts:190-201` writes a fixture file, calls `store.registerExistingMirror(...)`, and asserts `store.getJoined(...)` returns the bytes. It does not assert that (a) `buildUpstreamContext` surfaces those bytes into the upstream-context block for a downstream task, nor (b) `resolveConvergenceReviewerSource` surfaces them into either `extractConvergenceFindings` or `buildConvergenceContext`. Both wirings are present at `run_dag.ts:2017` and `run_dag.ts:1706` respectively, but a regression that swapped `transcripts.getJoined(depId)` for `dep.resultText` in `buildUpstreamContext` would not fail any current test. The plan's Phase 1 criteria 1, 3, and 4 require end-to-end stitched-prompt assertions; the current diff still does not provide a fake `Agent.create` harness that would let those criteria be wired to runnable tests. Same family as prior review medium #2.
+
+4. **The post-loop `BUDGET-EXCEEDED` decision now sources from the transcript store, but no test asserts that.** `run_dag.ts:1875-1890` wires `extractConvergenceFindings(finalReviewerSource)` to `resolveConvergenceReviewerSource(..., { includeSidecar: true })`, which is the fix the judge's medium #1 asked for. There is still no fixture asserting that a final-iteration reviewer whose `## Blockers` lands past byte 4000 (i.e., outside the legacy display tail) drives `convergeTs.status = 'BUDGET-EXCEEDED'`. The closest test is `convergence extract sees late section beyond legacy STREAM cap window` (`output-retention-phase1.test.ts:100-105`), which exercises `extractConvergenceFindings` directly on a fabricated string — it does not go through `resolveConvergenceReviewerSource` or the loop's terminal branch. Add a fixture that seeds `transcriptStore` with late-region blockers, drives `runConvergenceLoop` with `maxIterations: 1`, and asserts `convergeTs.status === 'BUDGET-EXCEEDED'`. Same gap as prior review medium #7.
+
+5. **The `runOne skips children when upstream is BUDGET-EXCEEDED` test is still a source-string grep, not a behavioral assertion.** `output-retention-phase1.test.ts:243-250` reads `run_dag.ts` from disk, slices 450 chars after `failedDeps = task.depends_on.filter`, and asserts the snippet contains `'BUDGET-EXCEEDED'`. The runtime change at `run_dag.ts:832-838` (and the message update in `skipTask` at `run_dag.ts:1940-1942`) is correct, but a behavioral assertion — a `TaskState` for an upstream marked `'BUDGET-EXCEEDED'` drives the `failedDeps` path through `skipTask`, the child ends `'ERROR'` with the expected `errorMessage` substring — would cost a few lines and is the form the judge's recommendation #2 actually asked for. Carried forward from prior review medium #1.
+
+6. **`${taskId}.md` ↔ `${taskId}.stream.txt` byte parity is unasserted.** `run_dag.ts:1367-1377` calls `persistTaskMarkdownFile(..., options.transcriptStore.getJoined(task.id))` after the awaited final mirror flush, so the artifact body and the mirror file should agree modulo the meta header. Plan Phase 1 criterion 5 names this explicitly: _"For a non-trivial fixture, `${taskId}.md` bytes equal the transcript store bytes (modulo the meta header)."_ The diff has zero coverage. A regression that swapped `transcriptStore.getJoined(task.id)` for `ts.resultText` in the `persistTaskMarkdownFile` call would not fail any test. Carried forward from prior review medium #3.
+
+7. **`--findings-dir` ↔ in-memory parity claim is asserted in one direction only.** `output-retention-phase1.test.ts:114-134` proves the writer respects `parseSource` over `ts.resultText`. The reverse claim — that the convergence loop, with `--findings-dir` set, derives the same `extraContext` it derives without `--findings-dir` — is not exercised. After fix #2 (sidecar removed from `buildConvergenceContext`), the parity is structural in source; testing it would lock the invariant. Carried forward from prior review medium #4.
+
+8. **`beginMirroredAppend` truncates `${taskId}.stream.txt` on every entry, including convergence re-runs and `runTask`-re-queued tasks on resume.** `task_transcript.ts:37-63`: `await writeFile(absPath, '', 'utf8')` overwrites whatever the prior process / prior iteration wrote. On a convergence loop iteration the prior iteration's stream evidence on disk is gone; on a resumed `RUNNING` → `PENDING` task (`run_dag.ts:497-507`), the prior process's partial transcript is gone the moment the new process's `runTask` reaches `beginMirroredAppend`. The plan's Phase 3 risk-table notes the same forensic concern and proposes `${taskId}.iter${N}.partial.stream.txt` preservation; Phase 1 does not implement it. `${taskId}.md` is rewritten with the new content at the end of the new run so the artifact survives, but `.stream.txt` is the only authoritative real-time capture and it is destroyed in place. Carried forward from prior review medium #5.
+
+9. **Mirror flush is bounded to the canvas publish cadence; an unscheduled exit drops up to `streamPublishMs` of bytes from disk.** `run_dag.ts:1234-1244` (`publishIfDue`) returns early when `now - lastPublishAt < streamPublishMs`. The `void options.transcriptStore.flushStreamMirror(...)` call lives inside the non-early-return branch. With `--stream-publish-ms 500` (default), a task crashing 450ms after the last publish loses up to 450ms of streamed output: bytes still held in `m.pendingBuf` never reach the mirror file. The signal handlers (`run_dag.ts:736-744`) call `failAndExit` without flushing mirrors first; `uncaughtException` similarly skips the flush. Add an `await options.transcriptStore.flushStreamMirror(...)` (or a parallel `flushAllStreamMirrors`) to `failAndExit`'s try block before the canvas flush, and similar drainage inside `onSignal` / `onUncaughtException`. Carried forward from prior review medium #6.
+
+10. **`outputPolicy` validation does not anticipate the Phase 2 shape (`{ maxChars: N }`), and the rejection error does not name the reserved object form.** `dag.ts:287-310` rejects any `upstream` value not equal to `'full'` or `'summarize'`. The test in `output-retention-phase1.test.ts:44-61` only exercises the `'everything'` rejection. When Phase 2 lands and authors copy a `{ maxChars: 2000 }` shape from an internal doc or an LLM-generated DAG, the error will be the same generic string with no signal that the object form is reserved for a later release. An `{ maxChars: 2000 }` rejection test plus a clearer error message — "DAG.outputPolicy.upstream must be 'full' or 'summarize'; the object form `{ maxChars }` is reserved for a future Proof release" — would future-proof the contract. Carried forward from prior review medium #10.
+
+11. **`parseUpstreamSections`' synthetic-heading text is not namespaced and can collide with parent-authored sections.** `upstream_policy.ts:78-92`: when a parent transcript has pre-heading content, the parser injects a section with `heading: 'Upstream truncation notice'` (when the canvas truncation banner is present) or `'Upstream preamble'`. Neither name is unique to the synthetic context. A parent task that legitimately emits `## Upstream preamble` of its own would produce two sections with the same heading; `summarizeWithinCap`'s `findIndex((s, idx2) => idx2 > 0 && s.normalized === dropTarget)` (`upstream_policy.ts:146-154`) would treat them as candidates for the same drop key, and `renderUpstreamSections` would emit both side-by-side. Neither name appears in `SECTION_DROP_PRIORITY`, so the practical attack surface is small, but the collision is structural. Prefixing with a non-Markdown sentinel (e.g. `[proof] Upstream preamble`) — or attaching the preamble to the first authored section's `bodyLines` instead of synthesizing a new heading — closes this without adding state. Carried forward from prior review medium #8.
+
+12. **The README's "Execution vs canvas" bullet says "in-process convergence parsing" — but the disk fallback now serves cross-process resumed convergence too.** `packages/proof/README.md:157`: _"For `kind: "task"` only, stitched prompts, in-process convergence parsing (`--converge-on` / `DAG.loops`), `${task-id}.findings.json` payloads (`--findings-dir`), and `<task-id>.md` derive from an **execution-authoritative** transcript."_ The qualifier "in-process" is now imprecise: after the disk fallback fix, a resumed process's convergence loop reads the prior process's transcript from `${taskId}.stream.txt` via `TaskTranscriptStore.get` and then runs `extractConvergenceFindings` / `buildConvergenceContext` against that. The next sentence about "Resumed runs can reconstruct that transcript when the same `--full-output-dir` is reused" partly clarifies the picture, but the first sentence walks an operator into the wrong model of when the execution-authoritative source is consulted. Drop "in-process" or rewrite as: "stitched prompts, convergence parsing, findings sidecars, and `<task-id>.md` derive from an execution-authoritative transcript that is reconstructed from `${task-id}.stream.txt` after a runner restart when `--full-output-dir` is pinned."
+
+13. **`TaskTranscriptStore.get` uses `readFileSync`, which blocks the runner's event loop for large transcripts on resume.** `task_transcript.ts:116-128`: the sync read is called from `buildUpstreamContext` (`run_dag.ts:1995-2027`, synchronous), which itself is called from the async `runTask`. The synchronous-read decision was likely deliberate — `buildUpstreamContext` cannot today produce an `await` without restructuring — but a 5 MiB transcript file read synchronously is tens of ms of event-loop pause per resumed dispatch. The fix is non-trivial (thread `await` through `buildUpstreamContext` → `runTask` → `runOne`) but the cost should be measured before deferring. For Phase 1, a comment naming the constraint and a follow-up note would be sufficient.
+
+14. **`resolveConvergenceReviewerSource` falls through to `convergeTs.resultText` silently when the transcript store is empty and `includeSidecar: false`.** `run_dag.ts:1699-1716`: in the path used by `buildConvergenceContext` (line 1828, `includeSidecar: false`), if `transcriptStore.getJoined(convergeOn).trim().length === 0` — e.g., a resumed process where the convergence task's mirror file was lost per medium #2 — the function returns `opts.resultText ?? ''`. That is the bounded display string, exactly the silent loss the plan set out to retire. The structural fix landed for the common case; the residual gap is the edge case where the disk fallback also failed. Either log the fallback at the call site (`buildConvergenceContext` reading bounded `resultText` because no authoritative source was reachable), or attach a synthetic `[...convergence reviewer transcript unavailable; reading display tail...]` banner to the returned string so the model can interpret what it is seeing.
+
+---
+
+## Low-severity findings
+
+1. **Stray JSDoc fragment in `TaskTranscriptStore`.** `task_transcript.ts:24` carries `/** When false, omit mirror writes entirely (logged once per task via callback). */` directly above `mirrorEnabledForTask(taskId)`. The comment describes a boolean field that does not exist (the method consults the `mirrors` map and returns `true`/`false` based on presence). Either delete the orphan or rewrite as a proper `@returns` comment on the method.
+
+2. **`RUNNER_RUNTIME_FILES` ordering is no longer alphabetical.** `self_hosting.ts:23-35` appends `'task_transcript.ts'` and `'upstream_policy.ts'` after `'self_hosting.ts'`. The list feeds a hash-and-compare loop, so order does not matter functionally, but the surrounding entries are alphabetical. Minor consistency lapse.
+
+3. **`renderCanvasSource` is exported solely so the new test can call it.** `canvas_writer.ts:210` changed from `function renderCanvasSource` to `export function renderCanvasSource`. Fine for testability, but it broadens the public surface of `canvas_writer.ts`; consider adding a JSDoc `@internal` marker or moving the canvas-size test inside a separate `__internal__` import path.
+
+4. **Canvas-size envelope test slack is generous to the point of being load-insensitive.** `output-retention-phase1.test.ts:234`: `t.true(cappedLen < baselineLen + 5 * CANVAS_DISPLAY_CAP + 96000)`. The plan's Phase 1 criterion 6 specified `5 * CANVAS_DISPLAY_CAP + 64 KiB`. The test allows ~94 KiB of slack — a 5× canvas-render growth of arbitrary metadata would still pass. The complementary assertion `uncappedLen - cappedLen > 35000` does catch a "leaked uncapped resultText" regression, so the test is sufficient as a "cap is doing something" gate, but it is not the tight envelope the plan specified. Tighten the upper bound (e.g. `+ 32000`) so the test fails on 2× growth, not 5×.
+
+5. **`buildConvergenceContext` defaults `upstreamMode` to `'summarize'`.** `converge_loop.ts:164-168`. The runner always passes a value explicitly from `runConvergenceLoop` (`run_dag.ts:1834`), but the default is a footgun for downstream library callers: a tool that calls `buildConvergenceContext(reviewer, iter, text)` without thinking about policy silently gets the legacy 2000-char excerpt. Either require the policy parameter (`UpstreamPolicyMode` is exported, so callers can pass it) or document the default in the JSDoc so the contract is obvious to a downstream consumer.
+
+6. **The README's "Execution vs canvas" block is a four-bullet list now (good) but the bullets do not call out the silent-fallback edge case.** `packages/proof/README.md:155-160`. After the latest fix, the bullets describe the common cases but do not mention what happens when `--full-output-dir` is not reused on the supervisor: the runner does not warn, the canvas does not surface degradation, and downstream prompts silently read bounded `resultText`. A single sentence — "When the artifact directory is not preserved across restarts, the runner silently falls back to the bounded display string for previously-completed tasks; pin `--full-output-dir` on the supervisor to avoid this." — would close the operator-trust gap.
+
+7. **No consumer-inventory document landed.** The plan's Phase 0 deliverable was: _"Consumer inventory document committed alongside this proposal (or inlined as a stable section of the README's 'Internal layout' appendix) that explicitly names every read site of `TaskState.resultText` in `packages/proof/src/**`."_ The diff does not contain such a document, and the README's prose does not enumerate the consumers. Future contributors editing `run_dag.ts` will have to re-derive the list from scratch. Low-impact for Phase 1 itself, but a one-page appendix would pay back the next refactor.
+
+8. **`writeFindingsSidecar`'s call site in `dispatchTask` gates `parseSource` on `effectiveTaskKind(task) === 'task' && transcriptBody.length > 0`** (`run_dag.ts:930-937`). For a `kind: 'task'` that finished with no streamed output, `parseSource` is `undefined` and the sidecar falls back to `ts.resultText` — also empty. Fine in practice. For resumed runs where the prior process completed the task and the current process has an empty in-memory transcript but a non-empty disk transcript, the `transcriptBody.length > 0` check would now succeed because the disk fallback fills `transcriptStore.getJoined(task.id)` on demand — good. But the current runner does not actually call `dispatchTask` for previously-FINISHED tasks (the rank loop's `runnableRank` filter at `run_dag.ts:951-954` skips them), so the sidecar is never re-emitted on resume. That keeps the on-disk sidecar stale relative to the new process's view of the transcript. A comment naming the invariant would prevent a future "always re-emit sidecars on resume" change from regressing.
+
+9. **`fullStreamChunks` removal leaves a stale JSDoc comment in `runTask`.** `run_dag.ts:1230` carries `/** Uncapped execution transcript is accumulated in options.transcriptStore. */` directly above `let run: RunnerTaskRun | undefined;`. The comment attaches to the `let run` declaration, which has nothing to do with the transcript. Reflow onto `options.transcriptStore.append(task.id, block.text);` at line 1282, or delete.
+
+10. **`registerExistingMirror` is a void-return method, so a registration that points at a non-existent file is indistinguishable at the call site from a successful registration.** `task_transcript.ts:65-71`. Return `boolean` (or a discriminated union) so `main()` can count missing files and emit a single summary warning. Pairs naturally with medium #2.
+
+11. **`oracle_task.ts` still tail-bounds stdout/stderr at `ORACLE_TAIL_CAP = 4000` for the inline `resultText` and the sidecar, with no parallel transcript or evidence file.** Phase 4 territory and explicitly out of Phase 1 scope. The README's "Execution vs canvas" bullets correctly carve out "For `kind: "task"` only" so the docs do not promise oracle full-evidence. No action needed beyond keeping the carve-out intact in any future doc rewrite.
+
+---
+
+## Verification
+
+1. **Documents consulted end-to-end.** [`docs/proposals/proof-output-retention-plan.md`](./proof-output-retention-plan.md) (all five phases, the constraint matrix, and Phase 1 acceptance criteria 1–8) and [`docs/proposals/proof-output-retention-judge.md`](./proof-output-retention-judge.md) (all four high-severity findings plus the ten medium-severity items) re-read in full. Verified the prior review draft's items 1, 2, 3 (high-severity) are addressed in this diff and that mediums 1, 2, 3, 4, 5, 6, 7, 8, 10 from the prior review are still open.
+
+2. **Resumed-transcript fallback specifically.** Traced `loadResumedRunState` (`run_dag.ts:461-516`) → `main()` registration loop (`run_dag.ts:678-688`) → `transcriptStore.registerExistingMirror` (`task_transcript.ts:65-71`) → `transcriptStore.get` (`task_transcript.ts:116-128`, sync `readFileSync`) → `buildUpstreamContext` (`run_dag.ts:1995-2027`, calls `transcripts.getJoined(depId)`) → `resolveConvergenceReviewerSource` (`run_dag.ts:1699-1716`, calls `opts.transcriptStore.getJoined(opts.convergeOn)`). The chain works when `fullOutputAbsoluteDir` is the same path the prior process wrote to; it silently degrades to legacy `resultText` when the path differs (medium #2).
+
+3. **Sidecar use for findings vs convergence context specifically.** Confirmed the three `resolveConvergenceReviewerSource` call sites at `run_dag.ts:1777`, `run_dag.ts:1828`, and `run_dag.ts:1883`. The middle call (feeding `buildConvergenceContext`) passes `includeSidecar: false`, so the lossy sidecar reconstruction can no longer enter ancestor prompts. The other two calls feed `extractConvergenceFindings`, where the sidecar's heading-only shape is sufficient (`## Blockers` / `## High-severity findings` round-trip cleanly).
+
+4. **Canvas boundedness specifically.** `canvas_writer.ts:210-213` still embeds the full `RunState` JSON via `JSON.stringify(state, null, 2)`. Each `TaskState.resultText` is bounded at `CANVAS_DISPLAY_CAP=4000` chars in `runTask`'s `publishIfDue` (the rendered `BoundedTextBuffer` output is assigned). The new `transcriptPath` field is a short relative path string. `subtask_prompt` remains uncapped — the canvas-size test (`output-retention-phase1.test.ts:203-241`) implicitly accepts this by using `'prompt:'.repeat(200)` (1.4 KiB per task) as the fixture; a larger prompt fixture would expand the assertion's slack.
+
+5. **Budget-exceeded downstream skip specifically.** `run_dag.ts:832-838` (`failedDeps`) includes both `'ERROR'` and `'BUDGET-EXCEEDED'`. `skipTask` (`run_dag.ts:1926-1959`) writes `Skipped: upstream task(s) … blocked this task (upstream ERROR or BUDGET-EXCEEDED)` and marks the child `'ERROR'`. The only test (`output-retention-phase1.test.ts:243-250`) is the source-grep noted in medium #5.
+
+6. **`outputPolicy` validation specifically.** `dag.ts:287-310`: rejects `null`, non-objects, arrays, unknown top-level keys (offending key named in the error), and non-`'full' | 'summarize'` `upstream`. Returns `{}` when `upstream === undefined` so downstream `dag.outputPolicy?.upstream === 'full'` evaluations stay clean. The three tests at `output-retention-phase1.test.ts:28-80` cover the accept-`full`, reject-unknown-key, and reject-unknown-value paths; not the array, non-object, or `{ maxChars }` paths (medium #10).
+
+7. **Stream-mirror ordering specifically.** Re-traced `flushStreamMirror` (`task_transcript.ts:86-110`) against `append` (`task_transcript.ts:73-81`). The synchronous portion of each flush call captures `payload = m.pendingBuf` and clears `m.pendingBuf` before scheduling `m.flushing = m.flushing.then(...)`. Two interleaved `append` + `flush` calls always serialize their `appendFile` payloads in append order via the `flushing` chain. The fire-and-forget `void options.transcriptStore.flushStreamMirror(...)` in `publishIfDue` does not violate this because the synchronous capture still happens before `void` returns. The awaited final flush in `runTask`'s `finally` (`run_dag.ts:1363-1365`) drains all prior scheduled appends before `finalizeTaskMirrorsDone`. The corresponding test at `output-retention-phase1.test.ts:169-188` exercises this with two overlapping flushes and asserts `raw === 'ab'`. Sound.
+
+8. **Docs accuracy specifically.** `packages/proof/README.md:143-180` ("Artifact Output", "Execution vs canvas") now scopes the execution-authoritative claim to `kind: "task"` and names the `--full-output-dir` pinning requirement. `.cursor/skills/proof/SKILL.md:251-263` ("Caveats") matches. The "in-process" qualifier in the bullet at README:157 is now slightly imprecise after the disk fallback fix (medium #12). The Self-Hosting Mode block (`packages/proof/README.md:191-211`) explicitly tells operators to pin `--full-output-dir <dir>` on the supervisor invocation; matches the runtime behavior.
+
+9. **Tests specifically.** All 15 tests in `output-retention-phase1.test.ts` were read and mapped to plan acceptance criteria:
+
+   - Criterion 1 (late-region prompt content) — partially covered by helper-level `full upstream excerpt includes late marker past multi-kchar parents`. End-to-end stitched-prompt assertion still missing (medium #3).
+   - Criterion 2 (late blockers) — covered by `convergence extract sees late section beyond legacy STREAM cap window`.
+   - Criterion 3 (convergence extraContext) — covered by `convergence extraContext carries late blockers under full upstream excerpt mode` (helper-level).
+   - Criterion 4 (`--findings-dir` parity) — partially covered by `findings sidecar uses parseSource (full transcript) over bounded resultText` (writer direction only — medium #7).
+   - Criterion 5 (`.md` ↔ `.stream.txt`) — not covered (medium #6).
+   - Criterion 6 (canvas envelope) — covered, slack noted (low #4).
+   - Criterion 7 (counted banner) — covered by `summarize upstream attaches counted excerpt banner instead of omitting rationale`.
+   - Criterion 8 (existing bounded-loop suite) — out-of-scope for this diff.
+
+10. **Suggested next-step verification (not executed in this review).**
+    - `pnpm -F @flatbread/proof typecheck` — confirms TypeScript still compiles. The diff adds a `readFileSync` import to `task_transcript.ts`, exports `renderCanvasSource` and `taskStreamArtifactRelPath`, and threads `UpstreamPolicyMode` through new call sites; all should be statically clean.
+    - `pnpm -F @flatbread/proof test` — exercises both `loops.test.ts` and `output-retention-phase1.test.ts`.
+    - `pnpm verify` — Phase 1 acceptance gate.
+    - A manual fixture run with a >12 KiB parent transcript and `--restart-on-runner-change` triggered between rank 2 and the convergence loop, with `--full-output-dir` pinned, asserting late-region `## Blockers` survive into the convergence ancestor's stitched prompt across the restart. This is the missing acceptance test that pins down high-severity items #1 and #2 in their cross-process form.
+    - The same fixture run without `--full-output-dir` pinning, asserting the runner either warns (the recommendation in medium #2) or at minimum still completes without surfacing stale bounded `resultText` as if it were authoritative.
+    - A manual fixture comparing `${taskId}.stream.txt` bytes against the `## Agent output` body of `${taskId}.md` for a non-trivial streaming task (medium #6).
diff --git a/packages/proof/README.md b/packages/proof/README.md
index 73207474..ae077796 100644
--- a/packages/proof/README.md
+++ b/packages/proof/README.md
@@ -149,8 +149,16 @@ By default, every **full DAG run** writes per-task markdown transcripts to a tim
   _dag.json      # The original DAG definition
   _index.md      # Run summary: outcome, timings, and links to all transcripts
   <task-id>.md   # Full agent output for each task (kind: task, oracle, or pause)
+  <task-id>.stream.txt   # Append-only assistant transcript mirror (`kind: task` only)
 ```
 
+**Execution vs canvas:**
+
+- For `kind: "task"` only, stitched prompts, in-process convergence parsing (`--converge-on` / `DAG.loops`), `${task-id}.findings.json` payloads (`--findings-dir`), and `<task-id>.md` derive from an **execution-authoritative** transcript. Resumed runs can reconstruct that transcript when the same `--full-output-dir` is reused and `transcriptPath` points at `${task-id}.stream.txt`; otherwise legacy bounded `resultText` remains the fallback.
+- The inlined canvas payload snapshots only a **4000-character display tail** (`CANVAS_DISPLAY_CAP`) per task plus an optional **`transcriptPath`** when `${task-id}.stream.txt` is mirrored.
+- Author `DAG.outputPolicy.upstream` as `"full"` or `"summarize"` (default) to widen or keep the upstream excerpt policy; trims carry visible counted banners.
+- Downstream nodes are skipped with `ERROR` when any upstream is `ERROR` or `BUDGET-EXCEEDED`.
+
 Paths resolve from `--cwd` (defaults to the process working directory). The live canvas still defaults under `~/.cursor/projects/<project>/canvases/` when using `--canvas` without `--canvas-path`.
 
 Previously, transcripts only appeared when you passed `--full-output-dir`; now they land under `.flatbread/` by default. Use `--no-artifacts` for opt-out, or `--full-output-dir` to redirect elsewhere.
diff --git a/packages/proof/package.json b/packages/proof/package.json
index 43010745..975eff90 100644
--- a/packages/proof/package.json
+++ b/packages/proof/package.json
@@ -6,8 +6,8 @@
   "scripts": {
     "build": "tsup",
     "dev": "tsup --watch src",
-    "test": "pnpm --dir ../.. exec ava packages/proof/src/__tests__/loops.test.ts",
-    "test:watch": "pnpm --dir ../.. exec ava --watch packages/proof/src/__tests__/loops.test.ts",
+    "test": "pnpm --dir ../.. exec ava \"packages/proof/src/__tests__/**/*.test.ts\"",
+    "test:watch": "pnpm --dir ../.. exec ava --watch \"packages/proof/src/__tests__/**/*.test.ts\"",
     "typecheck": "tsc -p tsconfig.json --noEmit",
     "models:list": "tsx src/list_models.ts",
     "cursor:fetch-cloud-agent": "node scripts/fetch-cloud-agent-conversation.mjs"
diff --git a/packages/proof/src/__tests__/output-retention-phase1.test.ts b/packages/proof/src/__tests__/output-retention-phase1.test.ts
new file mode 100644
index 00000000..8c6c7894
--- /dev/null
+++ b/packages/proof/src/__tests__/output-retention-phase1.test.ts
@@ -0,0 +1,250 @@
+import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import test from 'ava';
+
+import { parseDAG } from '../dag.js';
+import {
+  buildConvergenceContext,
+  extractConvergenceFindings,
+} from '../converge_loop.js';
+import {
+  TaskTranscriptStore,
+  taskStreamArtifactRelPath,
+} from '../task_transcript.js';
+import type { TaskState } from '../canvas_writer.js';
+import {
+  CANVAS_DISPLAY_CAP,
+  excerptUpstreamForPrompt,
+  parseUpstreamSections,
+  renderUpstreamSections,
+  summarizeUpstreamForPrompt,
+  UPSTREAM_SNIPPET_CAP,
+} from '../upstream_policy.js';
+import { renderCanvasSource, initialRunState } from '../canvas_writer.js';
+import { writeFindingsSidecar } from '../findings_sidecar.js';
+
+test('parseDAG accepts DAG.outputPolicy.upstream', (t) => {
+  const dag = parseDAG({
+    title: 'pol',
+    outputPolicy: { upstream: 'full' },
+    tasks: [
+      {
+        id: 'a',
+        depends_on: [],
+        complexity: 'LOW',
+        subtask_prompt: 'do',
+      },
+    ],
+  });
+  t.is(dag.outputPolicy?.upstream, 'full');
+});
+
+test('parseDAG rejects invalid outputPolicy upstream value', (t) => {
+  t.throws(
+    () =>
+      parseDAG({
+        title: 'bad',
+        outputPolicy: { upstream: 'everything' },
+        tasks: [
+          {
+            id: 'a',
+            depends_on: [],
+            complexity: 'LOW',
+            subtask_prompt: 'do',
+          },
+        ],
+      }),
+    { message: /upstream must be/ }
+  );
+});
+
+test('parseDAG rejects unknown outputPolicy keys', (t) => {
+  t.throws(
+    () =>
+      parseDAG({
+        title: 'bad-key',
+        outputPolicy: { upstram: 'full' },
+        tasks: [
+          {
+            id: 'a',
+            depends_on: [],
+            complexity: 'LOW',
+            subtask_prompt: 'do',
+          },
+        ],
+      }),
+    { message: /DAG\.outputPolicy\.upstram is not supported/ }
+  );
+});
+
+test('summarize upstream attaches counted excerpt banner instead of omitting rationale', (t) => {
+  const filler = 'y'.repeat(5000);
+  const { excerpt } = summarizeUpstreamForPrompt(filler, UPSTREAM_SNIPPET_CAP);
+  t.true(
+    excerpt.includes('[...upstream excerpt:') &&
+      excerpt.includes('parent output was 5000 chars')
+  );
+  t.false(/^[^\n]+\u2026$/u.test(excerpt.trim().split(/\n/).pop() ?? ''));
+});
+
+test('full upstream excerpt includes late marker past multi-kchar parents', (t) => {
+  const preamble = 'z'.repeat(2800);
+  const tailMarker = `${'x'.repeat(9100)}MARKER_LATE`;
+  const blob = `${preamble}\n## Section one\nstuff\n## Blockers\n${tailMarker}`;
+  const full = excerptUpstreamForPrompt(blob, 'full');
+  t.true(full.includes('MARKER_LATE'));
+});
+
+test('convergence extract sees late section beyond legacy STREAM cap window', (t) => {
+  const long = `${'p'.repeat(6000)}\n## Blockers\n- late blocker\n`;
+  const f = extractConvergenceFindings(long);
+  t.true(f.hasIssues);
+  t.true(f.blockerLines.some((l) => l.includes('late blocker')));
+});
+
+test('convergence extraContext carries late blockers under full upstream excerpt mode', (t) => {
+  const long = `${'p'.repeat(6000)}\n## Blockers\n- still broken\n`;
+  const ctx = buildConvergenceContext('reviewer', 2, long, 'full');
+  t.true(ctx.includes('## Blockers'));
+  t.true(ctx.includes('still broken'));
+});
+
+test('findings sidecar uses parseSource (full transcript) over bounded resultText', async (t) => {
+  const dir = mkdtempSync(join(tmpdir(), 'proof-sidecar-'));
+  try {
+    const ts: TaskState = {
+      id: 'task-a',
+      depends_on: [],
+      complexity: 'LOW',
+      subtask_prompt: 'x',
+      status: 'FINISHED',
+      model: 'gpt-5.4',
+      resultText: '## Blockers\n(none)',
+    };
+    const longTruth = `${'z'.repeat(5000)}\n## Blockers\n- deep blocker line\n`;
+    await writeFindingsSidecar(dir, ts, { parseSource: longTruth });
+    const raw = readFileSync(join(dir, 'task-a.findings.json'), 'utf8');
+    const parsed = JSON.parse(raw) as { sections: Record<string, string> };
+    t.true(parsed.sections.Blockers?.includes('deep blocker line'));
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+test('upstream section parsing keeps canvas truncation banner before headings', (t) => {
+  const line = '[...truncated 9000 earlier chars...]';
+  const body = `${line}\n## Blockers\nhit\n`;
+  const sections = parseUpstreamSections(body);
+  t.true(sections.some((s) => s.heading === 'Upstream truncation notice'));
+  const rendered = renderUpstreamSections(sections);
+  t.true(rendered.includes(line));
+});
+
+test('upstream section parsing keeps freeform preamble before headings', (t) => {
+  const body = `Important preface before headings.\nStill preface.\n## Findings\nhit\n## Proposed contract\nkeep\n`;
+  const sections = parseUpstreamSections(body);
+  t.is(sections[0]?.heading, 'Upstream preamble');
+  const rendered = renderUpstreamSections(sections);
+  t.true(rendered.includes('Important preface before headings.'));
+});
+
+test('summarize upstream does not rewrite author-owned trailing ellipsis', (t) => {
+  const body = [
+    '## Summary',
+    'This sentence intentionally trails off…',
+    '',
+    '## Current contract',
+    'drop me '.repeat(500),
+    '',
+    '## Findings',
+    'keep this section',
+  ].join('\n');
+  const { excerpt } = summarizeUpstreamForPrompt(body, 500);
+  t.true(excerpt.includes('trails off…'));
+  t.false(excerpt.includes('[...truncated in excerpt body at char cap …]'));
+});
+
+test('task transcript mirror serializes overlapping flushes in append order', async (t) => {
+  const dir = mkdtempSync(join(tmpdir(), 'proof-stream-'));
+  const store = new TaskTranscriptStore();
+  try {
+    await store.beginMirroredAppend('task-a', dir);
+    store.append('task-a', 'a');
+    const first = store.flushStreamMirror('task-a');
+    store.append('task-a', 'b');
+    const second = store.flushStreamMirror('task-a');
+    await Promise.all([first, second]);
+    await store.flushStreamMirror('task-a');
+    const raw = readFileSync(
+      join(dir, taskStreamArtifactRelPath('task-a')),
+      'utf8'
+    );
+    t.is(raw, 'ab');
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+test('task transcript store reads existing mirror files after resume', (t) => {
+  const dir = mkdtempSync(join(tmpdir(), 'proof-stream-resume-'));
+  const store = new TaskTranscriptStore();
+  try {
+    const rel = taskStreamArtifactRelPath('task-a');
+    writeFileSync(join(dir, rel), 'full transcript from prior process', 'utf8');
+    store.registerExistingMirror('task-a', dir, rel);
+    t.is(store.getJoined('task-a'), 'full transcript from prior process');
+  } finally {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+test('canvas render growth stays bounded by display-sized tails versus megabyte dumps', (t) => {
+  const tasks = Array.from({ length: 5 }, (_, i) => ({
+    id: `t${i}`,
+    depends_on: [] as string[],
+    complexity: 'LOW' as const,
+    subtask_prompt: `${'prompt:'.repeat(200)}\n`,
+  }));
+  const dag = parseDAG({ title: 'canvas-env', tasks });
+  const fresh = (): ReturnType<typeof initialRunState> =>
+    initialRunState(dag, () => ({
+      id: 'gpt-5.4',
+    }));
+
+  const baselineLen = renderCanvasSource(fresh()).length;
+
+  const cappedState = fresh();
+  cappedState.tasks.forEach((st) => {
+    st.resultText = `[...truncated 800000 earlier chars...]\n${'a'.repeat(
+      CANVAS_DISPLAY_CAP
+    )}`;
+  });
+  const cappedLen = renderCanvasSource(cappedState).length;
+
+  const leakyState = fresh();
+  leakyState.tasks.forEach((st) => {
+    st.resultText = `[...truncated 800000 earlier chars...]\n${'b'.repeat(
+      12000
+    )}`;
+  });
+  const uncappedLen = renderCanvasSource(leakyState).length;
+
+  t.true(cappedLen < baselineLen + 5 * CANVAS_DISPLAY_CAP + 96000);
+
+  /** Longer fake transcripts should substantially grow the inlined JSON blob. */
+  t.true(
+    uncappedLen - cappedLen > 35000,
+    'expected materially larger stringify when payloads stay long'
+  );
+});
+
+test('runOne skips children when upstream is BUDGET-EXCEEDED (guard in run_dag)', (t) => {
+  const path = join(dirname(fileURLToPath(import.meta.url)), '../run_dag.ts');
+  const src = readFileSync(path, 'utf8');
+  const idx = src.indexOf('failedDeps = task.depends_on.filter');
+  t.not(idx, -1);
+  const snippet = src.slice(idx, idx + 450);
+  t.true(snippet.includes("'BUDGET-EXCEEDED'"));
+});
diff --git a/packages/proof/src/canvas_writer.ts b/packages/proof/src/canvas_writer.ts
index 1c3d6db6..ea35ad46 100644
--- a/packages/proof/src/canvas_writer.ts
+++ b/packages/proof/src/canvas_writer.ts
@@ -49,6 +49,12 @@ export interface TaskState {
   startedAt?: number;
   finishedAt?: number;
   resultText?: string;
+  /**
+   * Relative path (under the run artifact directory) to the append-only stream
+   * mirror for this task's full assistant transcript. Canvas shows bounded
+   * `resultText`; this pointer is for locating the authoritative stream file.
+   */
+  transcriptPath?: string;
   errorMessage?: string;
   inputTokens?: number;
   outputTokens?: number;
@@ -201,7 +207,7 @@ export class CanvasWriter {
   }
 }
 
-function renderCanvasSource(state: RunState): string {
+export function renderCanvasSource(state: RunState): string {
   const stateLiteral = JSON.stringify(state, null, 2);
   return `${HEADER}\n\nconst STATE: RunState = ${stateLiteral};\n\n${BODY}\n`;
 }
@@ -258,6 +264,11 @@ interface TaskState {
   startedAt?: number;
   finishedAt?: number;
   resultText?: string;
+  /**
+   * Relative path (artifact dir) for the authoritative stream transcript.
+   * Canvas shows bounded resultText strings; transcriptPath reveals the mirror file path.
+   */
+  transcriptPath?: string;
   errorMessage?: string;
   inputTokens?: number;
   outputTokens?: number;
@@ -793,6 +804,12 @@ function TaskList({
                       {t.resultText}
                       {t.status === 'RUNNING' ? '\u2588' : ''}
                     
+ {t.transcriptPath ? ( + + Full transcript file (relative to artifact dir):{' '} + {t.transcriptPath} + + ) : null} ) : t.status === 'RUNNING' ? ( diff --git a/packages/proof/src/converge_loop.ts b/packages/proof/src/converge_loop.ts index 046e229b..961b43e8 100644 --- a/packages/proof/src/converge_loop.ts +++ b/packages/proof/src/converge_loop.ts @@ -2,7 +2,7 @@ * --converge-on + --max-iterations loop helpers. * * The convergence task is expected to be a `flatbread-adversarial-reviewer` - * style node — its `resultText` follows the schema: + * style node — its bounded canvas `resultText` follows the schema: * * ## Blockers * … @@ -27,6 +27,10 @@ import { type DAG, type ResolvedConvergenceLoop, } from './dag.js'; +import { + type UpstreamPolicyMode, + excerptUpstreamForPrompt, +} from './upstream_policy.js'; export interface ConvergenceFindings { hasIssues: boolean; @@ -151,18 +155,29 @@ export function resolveLoopReexecuteIds( } /** - * Renders the convergence task's `resultText` into the standard "extra + * Renders the convergence task's reviewer transcript into the standard "extra * upstream context" preamble we stitch into ancestor prompts on re-run. The * iteration index lets re-runs distinguish their feedback from any future - * iterations. + * iterations. The body is excerpted via the same upstream policy as child + * `buildUpstreamContext` — never silently truncated mid-review. */ export function buildConvergenceContext( convergeTaskId: string, iteration: number, - resultText: string | undefined + reviewerTranscript: string | undefined, + upstreamMode: UpstreamPolicyMode = 'summarize' ): string { - const trimmed = (resultText ?? '').trim(); - const body = trimmed === '' ? '(empty result text)' : trimmed; + const trimmed = (reviewerTranscript ?? 
'').trim(); + if (trimmed === '') { + return [ + `Convergence feedback from "${convergeTaskId}" (iteration ${ + iteration - 1 + }):`, + '', + '(empty result text)', + ].join('\n'); + } + const body = excerptUpstreamForPrompt(trimmed, upstreamMode); return [ `Convergence feedback from "${convergeTaskId}" (iteration ${ iteration - 1 diff --git a/packages/proof/src/dag.ts b/packages/proof/src/dag.ts index ad12f049..d7c2f902 100644 --- a/packages/proof/src/dag.ts +++ b/packages/proof/src/dag.ts @@ -92,11 +92,23 @@ export interface RawTask { allowNonZeroExit?: boolean; } +/** + * Optional per-DAG policy for how parent task output is excerpted into child + * prompts and convergence `extraContext`. Phase 1 defaults match historical + * behavior (`summarize` with a 2000-char section-aware cap plus explicit banners). + */ +export interface DAGOutputPolicy { + /** When `full`, upstream snippets are not structurally capped (still subject to model context). */ + upstream?: 'full' | 'summarize'; +} + export interface DAG { title: string; models?: ModelMapOverride; framing?: string; budget?: DAGBudget; + /** How much of each parent transcript is stitched into downstream prompts. */ + outputPolicy?: DAGOutputPolicy; tasks: RawTask[]; /** * Optional first-class bounded convergence loops. Each entry generalizes @@ -254,10 +266,47 @@ export function parseDAG(raw: unknown): DAG { obj.framing === undefined ? undefined : validateFraming(obj.framing); const budget = obj.budget === undefined ? undefined : validateBudget(obj.budget); + const outputPolicy = + obj.outputPolicy === undefined + ? undefined + : validateOutputPolicy(obj.outputPolicy); const loops = obj.loops === undefined ? 
undefined : validateLoops(obj.loops, tasks); - return { title: obj.title, models, framing, budget, tasks, loops }; + return { + title: obj.title, + models, + framing, + budget, + outputPolicy, + tasks, + loops, + }; +} + +function validateOutputPolicy(raw: unknown): DAGOutputPolicy { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + throw new Error('DAG.outputPolicy must be a JSON object when set.'); + } + const obj = raw as Record; + const allowedKeys = new Set(['upstream']); + for (const key of Object.keys(obj)) { + if (!allowedKeys.has(key)) { + throw new Error( + `DAG.outputPolicy.${key} is not supported. Supported keys: upstream.` + ); + } + } + const upstream = obj.upstream; + if (upstream === undefined) { + return {}; + } + if (upstream !== 'full' && upstream !== 'summarize') { + throw new Error( + 'DAG.outputPolicy.upstream must be "full" or "summarize" when set.' + ); + } + return { upstream }; } /** diff --git a/packages/proof/src/findings_sidecar.ts b/packages/proof/src/findings_sidecar.ts index 17c14ab0..ac3938c1 100644 --- a/packages/proof/src/findings_sidecar.ts +++ b/packages/proof/src/findings_sidecar.ts @@ -2,7 +2,10 @@ * `--findings-dir ` JSON sidecar writer + reader. * * After every task finishes — including `kind: 'oracle'` — the runner parses - * the task's final `resultText` for `## Heading` blocks and writes a + * headings from an optional richer `parseSource` when callers pass one. The + * `kind: 'task'` runner path passes the execution-authoritative transcript; + * pause/oracle callers continue to rely on their bounded `TaskState.resultText`. + * Otherwise this writer falls back to `TaskState.resultText`. * `.findings.json` (or `.iter.findings.json` once the * convergence loop has bumped the task's iteration past the original run). 
* The schema is intentionally tiny so downstream tools can lift findings @@ -21,8 +24,10 @@ * } * * The convergence loop prefers reading these JSON files over re-parsing the - * live `resultText` when `--findings-dir` is set on the same run, because the - * sidecar is captured at task completion instead of mid-stream. + * live bounded `resultText` when `--findings-dir` is set on the same run, + * because the sidecar is captured at task completion instead of mid-stream. + * In-process, the runner still prefers the authoritative transcript map and + * uses the sidecar primarily as a cross-process fallback. * * Oracle tasks ride the same code path: their standardized `## Pass: ` * / `## Command: ` / `## Exit code: ` / `## Stdout (tail):` / @@ -96,15 +101,17 @@ export function parseSections(text: string): Record { export async function writeFindingsSidecar( findingsDir: string, - ts: TaskState + ts: TaskState, + opts?: { parseSource?: string } ): Promise { const iteration = ts.iteration ?? 0; + const source = opts?.parseSource ?? ts.resultText ?? ''; const sidecar: FindingsSidecar = { taskId: ts.id, iteration, status: ts.status, durationMs: ts.durationMs ?? null, - sections: parseSections(ts.resultText ?? 
''), + sections: parseSections(source), }; await mkdir(findingsDir, { recursive: true }); const path = join(findingsDir, findingsFileName(ts.id, iteration)); diff --git a/packages/proof/src/index.ts b/packages/proof/src/index.ts index 81c46646..6311f294 100644 --- a/packages/proof/src/index.ts +++ b/packages/proof/src/index.ts @@ -28,6 +28,7 @@ export type { DAG, DAGBudget, DAGConvergenceLoop, + DAGOutputPolicy, LoopReexecute, ModelCatalogItem, ModelMap, @@ -53,6 +54,21 @@ export { export { transitiveAncestorIds } from './dag.js'; export type { ConvergenceFindings } from './converge_loop.js'; +export { + CANVAS_DISPLAY_CAP, + DISPLAY_TRUNCATION_BANNER_RE, + UPSTREAM_SNIPPET_CAP, + excerptUpstreamForPrompt, + parseUpstreamSections, + renderUpstreamSections, + summarizeUpstreamForPrompt, +} from './upstream_policy.js'; +export type { + UpstreamPolicyMode, + UpstreamSection, + UpstreamSummarizeStats, +} from './upstream_policy.js'; + export { findingsFileName, parseSections, diff --git a/packages/proof/src/oracle_task.ts b/packages/proof/src/oracle_task.ts index f53d9a63..e159a119 100644 --- a/packages/proof/src/oracle_task.ts +++ b/packages/proof/src/oracle_task.ts @@ -48,7 +48,7 @@ export interface OracleTaskDeps { cloneState: (state: RunState) => RunState; } -/** Cap on per-stream tail captured into `resultText`. Mirrors `STREAM_CAP` in run_dag.ts to keep canvas payload bounded. */ +/** Cap on per-stream tail captured into `resultText`; matches `CANVAS_DISPLAY_CAP` in run_dag.ts so canvas payloads stay bounded. */ const ORACLE_TAIL_CAP = 4000; /** SIGTERM → SIGKILL escalation window for hung oracle commands. */ const KILL_GRACE_MS = 2000; diff --git a/packages/proof/src/run_dag.ts b/packages/proof/src/run_dag.ts index 767b5b74..75104573 100644 --- a/packages/proof/src/run_dag.ts +++ b/packages/proof/src/run_dag.ts @@ -47,8 +47,11 @@ * convergence re-runs). Schema: * `{ taskId, iteration, status, durationMs, * sections }`. 
When set, `--converge-on` - * reads sidecars instead of re-parsing live - * `resultText`. Relative paths resolve against + * reads sidecars as a fallback when only bounded + * `resultText` is available cross-process (the + * live runner prefers the authoritative in-memory + * transcript whenever the same process executes the + * loop). Relative paths resolve against * --cwd. Oracle tasks are included — their * standardized `## Pass` / `## Command` / * `## Exit code` / `## Stdout (tail)` / @@ -57,8 +60,10 @@ * --checkpoint-dir Directory for `kind: 'pause'` sentinel files * (default `.proof/` under --cwd). * --converge-on After the main DAG run, parse the named task's - * `resultText` for `## Blockers` / - * `## High-severity findings`. If non-empty, + * authoritative transcript for `## Blockers` / + * `## High-severity findings` (fallbacks to bounded + * `resultText` / `--findings-dir` after restarts). + * If non-empty, * re-execute the entire upstream ancestor * subtree with the convergence task's latest * result appended as context, then re-execute @@ -138,6 +143,15 @@ import { writePersistedRunState, type RunnerFileSnapshot, } from './self_hosting.js'; +import { + TaskTranscriptStore, + taskStreamArtifactRelPath, +} from './task_transcript.js'; +import { + CANVAS_DISPLAY_CAP, + excerptUpstreamForPrompt, + type UpstreamPolicyMode, +} from './upstream_policy.js'; const SCRIPTS_DIR = dirname(fileURLToPath(import.meta.url)); /** @@ -660,6 +674,20 @@ async function main(): Promise { const stateById = new Map( state.tasks.map((t) => [t.id, t]) ); + const transcriptStore = new TaskTranscriptStore(); + if (fullOutputAbsoluteDir) { + for (const taskState of state.tasks) { + if (taskState.transcriptPath) { + transcriptStore.registerExistingMirror( + taskState.id, + fullOutputAbsoluteDir, + taskState.transcriptPath + ); + } + } + } + const upstreamMode: UpstreamPolicyMode = + dag.outputPolicy?.upstream === 'full' ? 
'full' : 'summarize'; const runnerSnapshot = args.restartOnRunnerChange ? await snapshotRunnerRuntimeFiles(RUNNER_SOURCE_DIR) : undefined; @@ -793,6 +821,8 @@ async function main(): Promise { fullOutputAbsoluteDir, dagTitle: dag.title, framing: dag.framing, + upstreamMode, + transcriptStore, }; const runOne = async ( @@ -801,7 +831,10 @@ async function main(): Promise { ): Promise => { const failedDeps = task.depends_on.filter((depId) => { const dep = stateById.get(depId); - return dep !== undefined && dep.status === 'ERROR'; + return ( + dep !== undefined && + (dep.status === 'ERROR' || dep.status === 'BUDGET-EXCEEDED') + ); }); if (failedDeps.length > 0) { return skipTask( @@ -894,7 +927,14 @@ async function main(): Promise { const ts = stateById.get(task.id); if (ts) { try { - await writeFindingsSidecar(findingsAbsoluteDir, ts); + const transcriptBody = transcriptStore.getJoined(task.id); + const parseSource = + effectiveTaskKind(task) === 'task' && transcriptBody.length > 0 + ? transcriptBody + : undefined; + await writeFindingsSidecar(findingsAbsoluteDir, ts, { + parseSource, + }); } catch (err) { const msg = err instanceof Error ? 
err.message : String(err); console.error( @@ -950,6 +990,8 @@ async function main(): Promise { reExecIds, ranks, stateById, + transcriptStore, + upstreamMode, dispatchTask, writer, state, @@ -1138,7 +1180,12 @@ async function runTask( ts.startedAt = Date.now(); writer.schedule(structuredCloneState(state)); - const upstreamContext = buildUpstreamContext(task, stateById); + const upstreamContext = buildUpstreamContext( + task, + stateById, + options.transcriptStore, + options.upstreamMode + ); const promptParts: string[] = []; if (upstreamContext) promptParts.push(upstreamContext); if (options.extraContext && options.extraContext.trim() !== '') { @@ -1164,10 +1211,25 @@ async function runTask( local: { cwd }, }); + options.transcriptStore.resetTask(task.id); + ts.transcriptPath = undefined; + const artifactDir = options.fullOutputAbsoluteDir; + if (artifactDir) { + await options.transcriptStore.beginMirroredAppend( + task.id, + artifactDir, + (_, msg) => { + console.warn(msg); + } + ); + if (options.transcriptStore.mirrorEnabledForTask(task.id)) { + ts.transcriptPath = taskStreamArtifactRelPath(task.id); + } + } + + /** Uncapped execution transcript is accumulated in `options.transcriptStore`. */ let run: RunnerTaskRun | undefined; - const buffer = new BoundedTextBuffer(STREAM_CAP); - /** Uncapped capture for `--full-output-dir` (canvas still uses bounded buffer). 
*/ - const fullStreamChunks: string[] = []; + const buffer = new BoundedTextBuffer(CANVAS_DISPLAY_CAP); let lastPublishAt = 0; const publishIfDue = (force = false): void => { const now = Date.now(); @@ -1176,6 +1238,9 @@ async function runTask( if (text.trim()) ts.resultText = text; writer.schedule(structuredCloneState(state)); lastPublishAt = now; + void options.transcriptStore.flushStreamMirror(task.id, (_, msg) => { + console.warn(msg); + }); }; const deadline = Date.now() + taskTimeoutMs; @@ -1214,7 +1279,7 @@ async function runTask( for (const block of blocks) { if (block.type === 'text' && typeof block.text === 'string') { buffer.append(block.text); - fullStreamChunks.push(block.text); + options.transcriptStore.append(task.id, block.text); appended = true; } } @@ -1295,18 +1360,22 @@ async function runTask( await bestEffortCancel(run, task.id); } publishIfDue(true); + await options.transcriptStore.flushStreamMirror(task.id, (_, msg) => { + console.warn(msg); + }); const fullDir = options.fullOutputAbsoluteDir; if (fullDir) { await persistTaskMarkdownFile( fullDir, options.dagTitle ?? state.title, ts, - fullStreamChunks.join('') + options.transcriptStore.getJoined(task.id) ).catch((e: unknown) => { const msg = e instanceof Error ? e.message : String(e); console.warn(`[proof] artifact write failed for ${task.id}: ${msg}`); }); } + options.transcriptStore.finalizeTaskMirrorsDone(task.id); try { await (agent as unknown as AsyncDisposable)[Symbol.asyncDispose](); } catch { @@ -1340,6 +1409,10 @@ interface RunTaskOptions { * is called. Pause and oracle tasks ignore framing (they never run an LLM). */ framing?: string; + /** How parent transcripts are excerpted for this process. */ + upstreamMode: UpstreamPolicyMode; + /** Full streamed assistant transcripts for `kind: 'task'` invocations. */ + transcriptStore: TaskTranscriptStore; } /** Single source of truth for the "undefined kind === task" rule. 
*/ @@ -1347,8 +1420,6 @@ function effectiveTaskKind(task: RawTask): TaskKind { return task.kind ?? 'task'; } -/** Cap on per-task `resultText` size — applies to live streaming and final state. */ -const STREAM_CAP = 4000; /** Hard timeout per task to prevent stale RUNNING tasks. */ const DEFAULT_TASK_TIMEOUT_MS = 20 * 60 * 1000; /** Throttle live state writes to avoid excessive full-state cloning churn. */ @@ -1357,8 +1428,6 @@ const DEFAULT_STREAM_PUBLISH_MS = 500; const DEFAULT_STREAM_IDLE_TIMEOUT_MS = 5 * 60 * 1000; /** Avoid hanging indefinitely in wait() when stream is already done. */ const WAIT_AFTER_STREAM_GRACE_MS = 15 * 1000; -/** Chars of each parent's output included in the child prompt. */ -const UPSTREAM_SNIPPET_CAP = 2000; /** Raised listener ceiling to avoid false-positive AbortSignal warnings from SDK internals. */ const ABORT_SIGNAL_LISTENER_LIMIT = 100; /** Default cap on `--converge-on` re-execution attempts after the initial run. */ @@ -1598,6 +1667,10 @@ interface RunConvergenceLoopOptions { reExecIds: Set; ranks: RawTask[][]; stateById: Map; + /** Authoritative reviewer transcript store for in-process fidelity. */ + transcriptStore: TaskTranscriptStore; + /** Same excerpt policy enforced on child upstream blocks. */ + upstreamMode: UpstreamPolicyMode; dispatchTask: ( task: RawTask, overrides?: Partial @@ -1605,10 +1678,9 @@ interface RunConvergenceLoopOptions { writer: CanvasWriter; state: RunState; /** - * When set, the loop reads the convergence task's `findings-dir` JSON - * sidecar instead of re-parsing live `resultText`. Falls back to the live - * text on missing/malformed sidecars so a stale findings dir cannot wedge - * the loop. + * When set, the loop prefers the findings-dir JSON sidecar ONLY when no + * in-memory authoritative transcript exists (e.g. resumed runs). Same-process + * execution always parses the reviewer transcript backing store first. 
*/ findingsDir?: string; /** @@ -1624,17 +1696,38 @@ interface RunConvergenceLoopOptions { afterIteration?: (iteration: number) => Promise; } +function resolveConvergenceReviewerSource(opts: { + transcriptStore: TaskTranscriptStore; + convergeOn: string; + sidecarText: string | null; + resultText: string | undefined; + includeSidecar: boolean; +}): string { + const fromStore = opts.transcriptStore.getJoined(opts.convergeOn); + if (fromStore.trim().length > 0) return fromStore; + if ( + opts.includeSidecar && + opts.sidecarText !== null && + opts.sidecarText.trim().length > 0 + ) { + return opts.sidecarText; + } + return opts.resultText ?? ''; +} + /** * Implements the `--converge-on` re-execution loop. Iteration 0 happened in * the main rank loop. Each subsequent iteration: * - * 1. Parses the convergence task's current `resultText` for `## Blockers` - * and `## High-severity findings`. If both sections are empty, exit. + * 1. Parses the convergence task's authoritative reviewer transcript + * (in-memory store; sidecars/backed `resultText` are fallbacks after + * restarts) for `## Blockers` and `## High-severity findings`. If both + * sections are empty, exit. * 2. Resets the convergence task and every transitive ancestor back to * `PENDING` and bumps their `iteration` counter. * 3. Re-executes the affected subset of the DAG in the original - * topological order, threading the convergence task's previous - * `resultText` into ancestor prompts as `extraContext`. + * topological order, threading the excerpt-policied reviewer feedback + * into ancestor prompts as `extraContext`. * 4. Re-executes the convergence task itself. */ async function runConvergenceLoop( @@ -1647,6 +1740,8 @@ async function runConvergenceLoop( reExecIds, ranks, stateById, + transcriptStore, + upstreamMode, dispatchTask, writer, state, @@ -1671,10 +1766,6 @@ async function runConvergenceLoop( const startingIteration = (convergeTs.iteration ?? 
0) + 1; for (let iter = startingIteration; iter <= maxIterations; iter++) { - // Prefer the findings-dir JSON sidecar when one was written for the most - // recent run of the convergence task; the sidecar is captured at task - // completion, so it survives the streaming buffer churn that can occasionally - // truncate live `resultText` mid-section. Falls back to live text on miss. const sidecarText = findingsDir !== undefined ? await readFindingsSidecarAsText( @@ -1683,9 +1774,14 @@ async function runConvergenceLoop( convergeTs.iteration ?? 0 ) : null; - const findings = extractConvergenceFindings( - sidecarText ?? convergeTs.resultText - ); + const reviewerSource = resolveConvergenceReviewerSource({ + transcriptStore, + convergeOn, + sidecarText, + resultText: convergeTs.resultText, + includeSidecar: true, + }); + const findings = extractConvergenceFindings(reviewerSource); if (!findings.hasIssues) { console.log( `[proof] ${loopId} (converge-on ${convergeOn}): clean — no Blockers / High-severity findings after ${ @@ -1729,7 +1825,14 @@ async function runConvergenceLoop( const convergenceContext = buildConvergenceContext( convergeOn, iter, - convergeTs.resultText + resolveConvergenceReviewerSource({ + transcriptStore, + convergeOn, + sidecarText, + resultText: convergeTs.resultText, + includeSidecar: false, + }), + upstreamMode ); // Reset state on every re-executed task. We deliberately do not clear @@ -1777,9 +1880,14 @@ async function runConvergenceLoop( convergeTs.iteration ?? 0 ) : null; - const finalFindings = extractConvergenceFindings( - finalSidecarText ?? 
convergeTs.resultText - ); + const finalReviewerSource = resolveConvergenceReviewerSource({ + transcriptStore, + convergeOn, + sidecarText: finalSidecarText, + resultText: convergeTs.resultText, + includeSidecar: true, + }); + const finalFindings = extractConvergenceFindings(finalReviewerSource); if (finalFindings.hasIssues) { const now = Date.now(); convergeTs.status = 'BUDGET-EXCEEDED'; @@ -1793,7 +1901,11 @@ async function runConvergenceLoop( // authoritative. if (findingsDir !== undefined) { try { - await writeFindingsSidecar(findingsDir, convergeTs); + const reviewerJoined = transcriptStore.getJoined(convergeOn); + await writeFindingsSidecar(findingsDir, convergeTs, { + parseSource: + reviewerJoined.trim().length > 0 ? reviewerJoined : undefined, + }); } catch (err) { const msg = err instanceof Error ? err.message : String(err); console.error( @@ -1825,9 +1937,13 @@ async function skipTask( ts.status = 'ERROR'; ts.finishedAt = now; ts.durationMs = 0; - ts.errorMessage = `Skipped: upstream task(s) ${failedDeps.join(', ')} failed`; + ts.errorMessage = `Skipped: upstream task(s) ${failedDeps.join( + ', ' + )} blocked this task (upstream ERROR or BUDGET-EXCEEDED)`; console.log( - `[proof] skipping ${task.id} — upstream ${failedDeps.join(', ')} failed` + `[proof] skipping ${task.id} — upstream ${failedDeps.join( + ', ' + )} in ERROR/BUDGET-EXCEEDED` ); writer.schedule(structuredCloneState(state)); if (!fullOutputAbsoluteDir) return; @@ -1878,7 +1994,9 @@ async function markRunTerminated( function buildUpstreamContext( task: RawTask, - stateById: Map + stateById: Map, + transcripts: TaskTranscriptStore, + upstreamMode: UpstreamPolicyMode ): string { if (task.depends_on.length === 0) return ''; const lines: string[] = [ @@ -1889,11 +2007,19 @@ function buildUpstreamContext( const dep = stateById.get(depId); if (!dep) continue; const status = dep.status; - const snippet = dep.resultText - ? 
truncateUpstreamSnippet(dep.resultText, UPSTREAM_SNIPPET_CAP) - : dep.errorMessage - ? `(failed: ${dep.errorMessage})` - : '(no output)'; + let snippet: string; + if (dep.status === 'ERROR' || dep.status === 'BUDGET-EXCEEDED') { + snippet = dep.errorMessage + ? `(failed: ${dep.errorMessage})` + : `(${status.toLowerCase()})`; + } else { + const authoritative = + transcripts.getJoined(depId) || (dep.resultText ?? '').trim(); + snippet = + authoritative.length > 0 + ? excerptUpstreamForPrompt(authoritative, upstreamMode) + : '(no output)'; + } lines.push(`### ${depId} [${status}]`); lines.push(snippet); lines.push(''); @@ -1901,114 +2027,6 @@ function buildUpstreamContext( return lines.join('\n'); } -/** - * Section-aware truncation for upstream parent results stitched into a - * downstream prompt. - * - * The runner used to slice `resultText` at `UPSTREAM_SNIPPET_CAP`, which - * could decapitate the most actionable section (e.g. `## Proposed contract`) - * mid-sentence whenever earlier sections were verbose. This helper instead: - * - * 1. Parses the text into `## Heading` blocks (`### …` and below stay - * attached to their parent). - * 2. If fewer than 2 `## ` headings are present (unstructured output), - * falls back to the existing `slice(…)` truncate so we never make - * legacy / freeform output worse. - * 3. Otherwise drops whole sections in `SECTION_DROP_PRIORITY` order - * (`Current contract` first, `Proposed contract` last) until the - * rendered text fits within the cap. The leading section in the - * original document is *always* preserved — it is the parent task's - * primary output (`## Proposed contract`, `## Files changed`, etc.) - * and dropping it would defeat the whole context-stitching pattern. - * 4. If sections we are willing to drop are exhausted and the text still - * exceeds the cap, slices the remaining rendered text as a last - * resort. This guarantees we always return something that fits. 
- * - * Section name comparisons are case-insensitive and trim-tolerant; only - * the names listed in `SECTION_DROP_PRIORITY` are eligible to be dropped. - * Any other heading (e.g. agent-specific sections like `## Files changed`, - * `## Blockers`) is treated as preserve-by-default for safety. - */ -function truncateUpstreamSnippet(text: string, cap: number): string { - if (text.length <= cap) return text; - const sections = parseUpstreamSections(text); - if (sections.length < 2) return truncate(text, cap); - // Walk the drop priority list once. Each pass that succeeds rebuilds the - // rendered text and re-checks the cap; we stop as soon as we fit. - const kept = sections.slice(); - for (const dropTarget of SECTION_DROP_PRIORITY) { - if (renderUpstreamSections(kept).length <= cap) break; - // Preserve index 0 (the leading section) regardless of name match. - const idx = kept.findIndex((s, i) => i > 0 && s.normalized === dropTarget); - if (idx === -1) continue; - kept.splice(idx, 1); - } - const rendered = renderUpstreamSections(kept); - if (rendered.length <= cap) return rendered; - return truncate(rendered, cap); -} - -interface UpstreamSection { - /** Original heading text minus the leading `## `, trimmed. */ - heading: string; - /** Lower-cased trimmed heading for drop-priority comparisons. */ - normalized: string; - /** Body lines below the heading (sub-headings stay attached). */ - bodyLines: string[]; -} - -/** - * Drop priority used by `truncateUpstreamSnippet`. Last entry is the last - * one we will give up — i.e. `## Proposed contract` is the highest-value - * section and is preserved as long as anything else can be dropped first. - */ -const SECTION_DROP_PRIORITY: readonly string[] = [ - 'current contract', - 'validation plan', - 'human checkpoints', - 'migration impact', - 'proposed contract', -]; - -/** Mirrors converge_loop's heading regex: `## …` only, never `### …`. 
*/ -const UPSTREAM_HEADING_RE = /^##(?!#)\s*(.+?)\s*$/; - -function parseUpstreamSections(text: string): UpstreamSection[] { - const sections: UpstreamSection[] = []; - const lines = text.split(/\r?\n/); - let current: UpstreamSection | null = null; - for (const line of lines) { - const m = UPSTREAM_HEADING_RE.exec(line); - if (m) { - if (current) sections.push(current); - const heading = m[1].trim(); - current = { - heading, - normalized: heading.toLowerCase(), - bodyLines: [], - }; - } else if (current) { - current.bodyLines.push(line); - } - // Lines before the first `## ` heading are intentionally dropped — the - // section-aware truncate only applies to outputs that lead with a - // heading; freeform preludes fall through to `truncate()`. - } - if (current) sections.push(current); - return sections; -} - -function renderUpstreamSections(sections: UpstreamSection[]): string { - return sections - .map((s) => `## ${s.heading}\n${s.bodyLines.join('\n')}`.trimEnd()) - .join('\n\n'); -} - -function truncate(s: string, n: number): string { - if (s.length <= n) return s; - return s.slice(0, n - 1) + '…'; -} - function formatMs(ms: number): string { if (ms < 1000) return `${ms}ms`; const s = ms / 1000; diff --git a/packages/proof/src/self_hosting.ts b/packages/proof/src/self_hosting.ts index c5c5f0b2..f853d738 100644 --- a/packages/proof/src/self_hosting.ts +++ b/packages/proof/src/self_hosting.ts @@ -30,6 +30,8 @@ export const RUNNER_RUNTIME_FILES: readonly string[] = [ 'oracle_task.ts', 'pause_task.ts', 'self_hosting.ts', + 'task_transcript.ts', + 'upstream_policy.ts', ]; export async function writePersistedRunState( diff --git a/packages/proof/src/task_transcript.ts b/packages/proof/src/task_transcript.ts new file mode 100644 index 00000000..265c3789 --- /dev/null +++ b/packages/proof/src/task_transcript.ts @@ -0,0 +1,140 @@ +import { readFileSync } from 'node:fs'; +import { appendFile, mkdir, writeFile } from 'node:fs/promises'; +import { dirname, join } from 
'node:path'; + +/** File name suffix for mirrored raw assistant stream under the artifact dir. */ +export function taskStreamArtifactRelPath(taskId: string): string { + return `${taskId}.stream.txt`; +} + +interface StreamMirrorState { + absPath: string; + pendingBuf: string; + flushing: Promise; +} + +/** + * In-memory authoritative assistant transcript per task, with optional best-effort + * append-only `.stream.txt` mirror when artifacts are enabled. + */ +export class TaskTranscriptStore { + private readonly text = new Map(); + private readonly streamPaths = new Map(); + private mirrors = new Map(); + /** When false, omit mirror writes entirely (logged once per task via callback). */ + + mirrorEnabledForTask(taskId: string): boolean { + return this.mirrors.has(taskId); + } + + resetTask(taskId: string): void { + this.text.set(taskId, ''); + } + + /** + * Registers a mirrored stream path and truncates/creates it. Absolute path. + */ + async beginMirroredAppend( + taskId: string, + artifactAbsoluteDir: string, + logMirrorError?: (taskId: string, message: string) => void + ): Promise { + const absPath = join( + artifactAbsoluteDir, + taskStreamArtifactRelPath(taskId) + ); + this.streamPaths.set(taskId, absPath); + this.mirrors.set(taskId, { + absPath, + pendingBuf: '', + flushing: Promise.resolve(), + }); + try { + await mkdir(dirname(absPath), { recursive: true }); + await writeFile(absPath, '', 'utf8'); + } catch (e) { + this.mirrors.delete(taskId); + const msg = e instanceof Error ? e.message : String(e); + logMirrorError?.( + taskId, + `[proof] stream mirror init failed for ${taskId}: ${msg}` + ); + } + } + + registerExistingMirror( + taskId: string, + artifactAbsoluteDir: string, + relativePath: string + ): void { + this.streamPaths.set(taskId, join(artifactAbsoluteDir, relativePath)); + } + + append(taskId: string, chunk: string): void { + if (!chunk) return; + const cur = this.text.get(taskId) ?? 
''; + this.text.set(taskId, cur + chunk); + const m = this.mirrors.get(taskId); + if (m) { + m.pendingBuf += chunk; + } + } + + /** + * Flush pending mirrored bytes best-effort. Coalesced to match canvas publish throttle. + */ + async flushStreamMirror( + taskId: string, + logMirrorError?: (taskId: string, message: string) => void + ): Promise { + const m = this.mirrors.get(taskId); + if (!m) return; + if (m.pendingBuf.length === 0) { + await m.flushing; + return; + } + const payload = m.pendingBuf; + m.pendingBuf = ''; + m.flushing = m.flushing.then(async () => { + try { + await appendFile(m.absPath, payload, 'utf8'); + } catch (e) { + const msg = e instanceof Error ? e.message : String(e); + logMirrorError?.( + taskId, + `[proof] stream mirror append failed for ${taskId}: ${msg}` + ); + } + }); + await m.flushing; + } + + finalizeTaskMirrorsDone(taskId: string): void { + this.mirrors.delete(taskId); + } + + get(taskId: string): string | undefined { + const v = this.text.get(taskId); + if (v !== undefined && v !== '') return v; + const path = this.streamPaths.get(taskId); + if (!path) return v; + try { + const diskText = readFileSync(path, 'utf8'); + this.text.set(taskId, diskText); + return diskText; + } catch { + return v; + } + } + + /** Non-optional concatenation helper — empty string when missing. */ + getJoined(taskId: string): string { + return this.get(taskId) ?? ''; + } + + clearAll(): void { + this.text.clear(); + this.streamPaths.clear(); + this.mirrors.clear(); + } +} diff --git a/packages/proof/src/upstream_policy.ts b/packages/proof/src/upstream_policy.ts new file mode 100644 index 00000000..f9539221 --- /dev/null +++ b/packages/proof/src/upstream_policy.ts @@ -0,0 +1,260 @@ +/** + * Upstream prompt excerpt policy for child tasks and convergence `extraContext`. + * Full parent transcripts are trimmed here with explicit, counted banners — never silent `…`. + */ + +/** Default cap on upstream prose stitched into each child prompt (chars). 
/**
 * Upstream prompt excerpt policy for child tasks and convergence `extraContext`.
 * Full parent transcripts are trimmed here with explicit, counted banners — never silent `…`.
 */

/** Default cap on upstream prose stitched into each child prompt (chars). */
export const UPSTREAM_SNIPPET_CAP = 2000;

/** Canvas / `TaskState.resultText` streaming display bound (chars). */
export const CANVAS_DISPLAY_CAP = 4000;

/** `'full'` passes the parent text through; `'summarize'` applies the cap. */
export type UpstreamPolicyMode = 'full' | 'summarize';

/**
 * Matches the first line emitted by the canvas display buffer when it drops
 * earlier prefix characters.
 */
export const DISPLAY_TRUNCATION_BANNER_RE =
  /^\[\.\.\.truncated (\d+) earlier chars\.\.\.\]$/;

export interface UpstreamSection {
  /** Original heading text minus the leading `## `, trimmed. */
  heading: string;
  /** Lower-cased trimmed heading for drop-priority comparisons. */
  normalized: string;
  /** Body lines below the heading (sub-headings stay attached). */
  bodyLines: string[];
}

/**
 * Drop priority used by section-aware summarization. Last entry is the last
 * one we give up — i.e. `## Proposed contract` is preserved longest.
 */
const SECTION_DROP_PRIORITY: readonly string[] = [
  'current contract',
  'validation plan',
  'human checkpoints',
  'migration impact',
  'proposed contract',
];

/** Mirrors converge_loop's heading regex: `## …` only, never `### …`. */
const UPSTREAM_HEADING_RE = /^##(?!#)\s*(.+?)\s*$/;

export interface UpstreamSummarizeStats {
  /** Length of the text handed to the summarizer. */
  originalChars: number;
  /** Length of the excerpt body (the counted banner is not included). */
  excerptChars: number;
  /** Headings removed by the section-aware pass, in removal order. */
  droppedSectionTitles: string[];
  /** True when bytes were dropped only because of the numeric cap after section drops. */
  hardLimited: boolean;
}

/**
 * Cuts `s` to at most `n` UTF-16 code units, ending in `…` when it was cut.
 *
 * - A cap below 1 (or NaN) yields `''`: there is no room even for the
 *   ellipsis, and `slice(0, n - 1)` with a negative end would otherwise
 *   return almost the whole input.
 * - The cut never lands between the halves of a surrogate pair; the result
 *   is then one unit shorter than `n` instead of carrying a lone surrogate.
 */
function truncateWithEllipsisLegacy(s: string, n: number): string {
  if (s.length <= n) return s;
  if (!(n >= 1)) return '';
  let end = Math.floor(n - 1);
  if (end > 0) {
    const lastKept = s.charCodeAt(end - 1);
    // High surrogate at the edge: its low half would be sliced off.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  }
  return s.slice(0, end) + '…';
}

/**
 * Parses `## ` sections. Preserves pre-heading text as a synthetic first
 * section so section-aware drops cannot silently discard a parent preamble.
 * When that preamble is the display-buffer truncation banner, the synthetic
 * heading makes the warning durable inside child prompts.
 *
 * @param text - Parent output; lines are split on `\n` or `\r\n`.
 * @returns Sections in document order. A preamble made only of blank lines
 *   produces no synthetic section.
 */
export function parseUpstreamSections(text: string): UpstreamSection[] {
  const sections: UpstreamSection[] = [];
  const preamble: string[] = [];
  let current: UpstreamSection | undefined;

  for (const line of text.split(/\r?\n/)) {
    const match = UPSTREAM_HEADING_RE.exec(line);
    if (match) {
      const heading = (match[1] ?? '').trim();
      current = { heading, normalized: heading.toLowerCase(), bodyLines: [] };
      sections.push(current);
    } else if (current) {
      current.bodyLines.push(line);
    } else {
      // Everything before the first `## ` heading.
      preamble.push(line);
    }
  }

  if (preamble.some((ln) => ln.trim() !== '')) {
    const hasDisplayBanner = preamble.some((ln) =>
      DISPLAY_TRUNCATION_BANNER_RE.test(ln.trim())
    );
    const heading = hasDisplayBanner
      ? 'Upstream truncation notice'
      : 'Upstream preamble';
    sections.unshift({
      heading,
      normalized: heading.toLowerCase(),
      bodyLines: preamble,
    });
  }

  return sections;
}

/**
 * Renders sections back to markdown: `## heading`, the body lines, trailing
 * whitespace trimmed per section, sections separated by one blank line.
 */
export function renderUpstreamSections(sections: UpstreamSection[]): string {
  return sections
    .map((s) => `## ${s.heading}\n${s.bodyLines.join('\n')}`.trimEnd())
    .join('\n\n');
}

/**
 * Shrinks `text` towards `cap` characters.
 *
 * Text with fewer than two sections is hard-sliced. Otherwise sections are
 * removed in `SECTION_DROP_PRIORITY` order (the first section is never a
 * drop candidate) until the rendering fits; if it still does not fit, the
 * rendering is hard-sliced.
 */
function summarizeWithinCap(
  text: string,
  cap: number
): { excerpt: string; droppedSectionTitles: string[]; hardLimited: boolean } {
  if (text.length <= cap) {
    return { excerpt: text, droppedSectionTitles: [], hardLimited: false };
  }

  const parsed = parseUpstreamSections(text);
  if (parsed.length < 2) {
    return {
      excerpt: truncateWithEllipsisLegacy(text, cap),
      droppedSectionTitles: [],
      hardLimited: true,
    };
  }

  const kept = parsed.slice();
  const droppedSectionTitles: string[] = [];
  for (const dropTarget of SECTION_DROP_PRIORITY) {
    if (renderUpstreamSections(kept).length <= cap) break;
    const idx = kept.findIndex(
      (s, pos) => pos > 0 && s.normalized === dropTarget
    );
    if (idx === -1) continue;
    const [removed] = kept.splice(idx, 1);
    if (removed) droppedSectionTitles.push(removed.heading);
  }

  const rendered = renderUpstreamSections(kept);
  if (rendered.length <= cap) {
    return { excerpt: rendered, droppedSectionTitles, hardLimited: false };
  }
  return {
    excerpt: truncateWithEllipsisLegacy(rendered, cap),
    droppedSectionTitles,
    hardLimited: true,
  };
}

/** Builds the one-line counted banner that precedes a shortened excerpt. */
function formatUpstreamBanner(
  stats: UpstreamSummarizeStats,
  cap: number
): string {
  const parts: string[] = [
    `parent output was ${stats.originalChars} chars`,
    `excerpt is ${stats.excerptChars} chars`,
    `cap=${cap}`,
  ];
  if (stats.droppedSectionTitles.length > 0) {
    parts.push(`sections dropped: ${stats.droppedSectionTitles.join(', ')}`);
  }
  if (stats.hardLimited) {
    parts.push(
      `hard slice applied after structural trim (no silent ellipsis boundary in excerpt body)`
    );
  }
  return `[...upstream excerpt: ${parts.join('; ')}]`;
}

/** Replace trailing Unicode ellipsis from our hard slice with a visible sentence. */
function stripTrailingStructuralEllipsis(
  excerpt: string,
  hardLimited: boolean
): string {
  if (hardLimited && excerpt.endsWith('…')) {
    return (
      excerpt.slice(0, -1) + '[...truncated in excerpt body at char cap …]'
    );
  }
  return excerpt;
}

/**
 * Produces the excerpt of `fullText` for a prompt, plus statistics.
 *
 * When nothing was shortened the text is returned untouched. Otherwise the
 * excerpt is prefixed with a counted banner and a blank line; the banner and
 * the in-body truncation marker are not counted against `cap`.
 *
 * @param fullText - Parent output to excerpt.
 * @param cap - Character budget for the excerpt body.
 */
export function summarizeUpstreamForPrompt(
  fullText: string,
  cap: number
): {
  excerpt: string;
  stats: UpstreamSummarizeStats;
} {
  const originalChars = fullText.length;
  const inner = summarizeWithinCap(fullText, cap);
  const excerptCore = stripTrailingStructuralEllipsis(
    inner.excerpt,
    inner.hardLimited
  );

  const shortened =
    inner.droppedSectionTitles.length > 0 ||
    inner.hardLimited ||
    excerptCore.length < originalChars;

  if (!shortened) {
    return {
      excerpt: excerptCore,
      stats: {
        originalChars,
        excerptChars: excerptCore.length,
        droppedSectionTitles: [],
        hardLimited: false,
      },
    };
  }

  const stats: UpstreamSummarizeStats = {
    originalChars,
    excerptChars: excerptCore.length,
    droppedSectionTitles: inner.droppedSectionTitles,
    hardLimited: inner.hardLimited,
  };
  return {
    excerpt: `${formatUpstreamBanner(stats, cap)}\n\n${excerptCore}`,
    stats,
  };
}

/**
 * Applies the upstream excerpt policy used for stitch into child prompts and
 * inside convergence `extraContext`.
 *
 * @param fullText - Parent output; trailing whitespace is always trimmed.
 * @param mode - `'full'` returns the trimmed text as is; `'summarize'`
 *   applies `summarizeUpstreamForPrompt`.
 * @param snippetCap - Character budget in summarize mode.
 */
export function excerptUpstreamForPrompt(
  fullText: string,
  mode: UpstreamPolicyMode,
  snippetCap = UPSTREAM_SNIPPET_CAP
): string {
  const trimmed = fullText.trimEnd();
  if (mode === 'full') return trimmed;
  return summarizeUpstreamForPrompt(trimmed, snippetCap).excerpt;
}