EntityProcess · christso · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -231,10 +231,16 @@ jobs:
           GH_MODELS_TOKEN: ${{ secrets.GH_MODELS_TOKEN }}
           CONTRACT_EVAL_MODEL: ${{ vars.CONTRACT_EVAL_MODEL || 'openai/gpt-4.1-mini' }}
 
+      - name: Build and run example oracle eval gate
+        if: needs.prepare.outputs.publish_action == 'publish-next'
+        run: |
+          bun run build
+          bun run examples:oracle:gate -- --output-dir "${{ runner.temp }}/example-oracle-publish-next"
+
       - name: Publish to npm
         run: |
           if [ "${{ needs.prepare.outputs.publish_action }}" = "publish-next" ]; then
-            bun run publish:next
+            bun scripts/publish.ts next
           else
             bun run publish
           fi

diff --git a/examples/README.md b/examples/README.md
@@ -18,6 +18,22 @@ cd examples/features/execution-metrics
 bun install
 ```
 
+## Oracle Fixture Sweep
+
+Run the deterministic oracle sweep when you need to prove that agent- or LLM-backed example evals still parse, execute, and write artifacts without making live LLM calls:
+
+```bash
+bun run examples:oracle
+```
+
+The command discovers eval files under `examples/` and reads `examples/oracle-fixtures.yaml` for explicit exclusions. Evals whose targets are already oracle-capable are classified as already covered. For evals that otherwise require an agent or LLM target, the command generates replay target fixtures under `.agentv/tmp/example-oracle-fixtures/` and runs those evals with an oracle replay target plus an oracle CLI grader target. These fixtures are a contract oracle for example execution, not captured live-model golden transcripts; live provider dogfood remains a separate release-gate workflow. To inspect the inventory without running evals:
+
+```bash
+bun run examples:oracle -- --inventory
+```
+
+Use `--eval <path>` to run or inventory a single eval file.
+
 ## Directory Structure
 
 Examples are organized into two categories:

diff --git a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml
diff --git a/examples/oracle-fixtures.yaml b/examples/oracle-fixtures.yaml
@@ -0,0 +1,21 @@
+schema_version: agentv.example_oracle_manifest.v1
+description: >
+  Maintained inventory controls for the deterministic example oracle sweep.
+  The runner discovers example evals dynamically, generates replay target
+  fixtures from expected outputs or assertion-derived references only for
+  agent or LLM-backed targets, treats oracle-capable targets as their own
+  oracle, and skips only the exclusions listed here. These fixtures are
+  contract oracles for example execution, not captured live-model golden
+  transcripts.
+target_name: example_oracle
+source_target: example_oracle_source
+grader_target: example_oracle_grader
+exclusions:
+  - path: examples/features/docker-workspace/evals/docker-example.EVAL.yaml
+    reason: Requires Docker workspace runtime; replaying target output does not remove the Docker setup dependency.
+  - path: examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml
+    reason: Requires local Copilot session/workspace artifacts and .github/skills/agentv-bench/SKILL.md during before_all setup.
+  - path: examples/features/prompt-template-sdk/evals/dataset.eval.yaml
+    reason: Loader currently resolves zero runnable tests because the eval references a missing ../prompts/custom-grader.ts command file.
+  - path: examples/showcase/bug-fix-benchmark/evals/bug-fixes.eval.yaml
+    reason: Eval metadata is invalid today; experiment targets[0].use_target is empty, so the suite cannot be loaded.
diff --git a/examples/showcase/cross-repo-sync/scripts/run-ts.sh b/examples/showcase/cross-repo-sync/scripts/run-ts.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env bash
 # Cross-runtime TypeScript runner.
-# Bun's node shim runs .ts natively; real Node.js needs tsx.
+# Bun runs .ts files natively; real Node.js needs tsx.
 SCRIPT="$1"
 shift
-if node -e "process.exit(typeof Bun === 'undefined' ? 1 : 0)" 2>/dev/null; then
-  exec node "$SCRIPT" "$@"
+if command -v bun >/dev/null 2>&1; then
+  exec bun "$SCRIPT" "$@"
 else
   exec node --import tsx "$SCRIPT" "$@"
 fi
diff --git a/package.json b/package.json
@@ -21,6 +21,8 @@
     "beads:check": "bun scripts/check-beads-context.ts",
     "debug:pi-sdk-tools": "bun scripts/debug-pi-sdk-tools.ts",
     "validate:examples": "EVAL_CRITERIA=placeholder CUSTOM_SYSTEM_PROMPT=placeholder bun scripts/validate-example-evals.ts",
+    "examples:oracle": "bun scripts/run-example-oracle-fixtures.ts",
+    "examples:oracle:gate": "bun scripts/run-example-oracle-gate.ts",
     "eval:baseline-check": "bun scripts/check-eval-baselines.ts",
     "release": "bun scripts/release.ts",
     "release:next": "bun scripts/release.ts next",

diff --git a/scripts/example-oracle-grader.ts b/scripts/example-oracle-grader.ts
@@ -0,0 +1,106 @@
+#!/usr/bin/env bun
+import { readFile, writeFile } from 'node:fs/promises';
+import { parseArgs } from 'node:util';
+
+/** Pass/fail verdict for a single rubric criterion, as emitted in a `checks` array. */
+interface RubricCheck {
+  /** Criterion id taken from the grader prompt. */
+  readonly id: string;
+  /** Whether the criterion is reported as met. */
+  readonly satisfied: boolean;
+  /** Human-readable justification attached to the verdict. */
+  readonly reasoning: string;
+}
+
+/** Returns the distinct entries of `values`, keeping first-occurrence order. */
+function unique(values: readonly string[]): string[] {
+  const distinct = new Set<string>(values);
+  return Array.from(distinct);
+}
+
+/**
+ * Collects the rubric criterion ids mentioned in a grader prompt.
+ *
+ * Two notations are recognised: bullet lines that begin with `- [<id>]`, and
+ * JSON-style `"id": "<id>"` pairs. Bullet ids come first in the result, then
+ * JSON ids; duplicates are dropped.
+ *
+ * @param prompt - Full text of the grader prompt.
+ * @returns The distinct ids in discovery order.
+ */
+function extractRubricIds(prompt: string): string[] {
+  // The schema example embedded in prompts uses this literal as its id value;
+  // it is a placeholder, not a real criterion.
+  const placeholderId = 'string (criterion id)';
+
+  const bulletIds = [...prompt.matchAll(/^- \[([^\]]+)\]/gm)]
+    .map((hit) => hit[1])
+    .filter((id): id is string => Boolean(id));
+
+  const jsonIds = [...prompt.matchAll(/"id"\s*:\s*"([^"]+)"/g)]
+    .map((hit) => hit[1])
+    .filter((id): id is string => Boolean(id) && id !== placeholderId);
+
+  return unique([...bulletIds, ...jsonIds]);
+}
+
+/**
+ * Produces the canned, always-passing grader verdict for a prompt.
+ *
+ * - Prompts with rubric ids (or that mention `"checks"`) get one entry per id
+ *   in a `checks` array: `score: 10` when the prompt asks for score ranges,
+ *   otherwise `satisfied: true`.
+ * - Any other prompt gets a single passing assertion with `score: 1`.
+ *
+ * @param prompt - Full text of the grader prompt.
+ * @returns A plain object ready to be JSON-serialised.
+ */
+function buildEvaluation(prompt: string): unknown {
+  const criterionReasoning =
+    'Deterministic oracle grader marks the reference fixture as satisfying this criterion.';
+  const overallReasoning = 'Deterministic oracle grader response.';
+
+  const criterionIds = extractRubricIds(prompt);
+  const isRubricPrompt = criterionIds.length > 0 || prompt.includes('"checks"');
+
+  // Free-form grading: no rubric to walk, so report one blanket pass.
+  if (!isRubricPrompt) {
+    return {
+      score: 1,
+      assertions: [
+        {
+          text: 'Reference fixture was accepted by the deterministic oracle grader',
+          passed: true,
+          evidence:
+            'The oracle workflow validates execution and artifact compatibility without live LLM calls.',
+        },
+      ],
+      details: {
+        oracle_grader: true,
+      },
+    };
+  }
+
+  const wantsNumericScores =
+    prompt.includes('"score": integer') || prompt.includes('score ranges');
+
+  if (wantsNumericScores) {
+    const scoredChecks = criterionIds.map((id) => ({
+      id,
+      score: 10,
+      reasoning: criterionReasoning,
+    }));
+    return { checks: scoredChecks, overall_reasoning: overallReasoning };
+  }
+
+  const booleanChecks: RubricCheck[] = criterionIds.map((id) => ({
+    id,
+    satisfied: true,
+    reasoning: criterionReasoning,
+  }));
+  return { checks: booleanChecks, overall_reasoning: overallReasoning };
+}
+
+/**
+ * CLI entry point.
+ *
+ * Reads the prompt from `--prompt-file`, builds the deterministic evaluation
+ * for it, and writes a one-line JSON response to `--output`. The response
+ * carries the serialised evaluation in `text` alongside fixed zero usage/cost
+ * figures.
+ *
+ * @throws Error with a usage message when either option is missing.
+ */
+async function main() {
+  const parsed = parseArgs({
+    options: {
+      'prompt-file': { type: 'string' },
+      output: { type: 'string' },
+    },
+  });
+  const { 'prompt-file': promptFile, output: outputFile } = parsed.values;
+
+  if (!promptFile || !outputFile) {
+    throw new Error('Usage: example-oracle-grader.ts --prompt-file <path> --output <path>');
+  }
+
+  const promptText = await readFile(promptFile, 'utf8');
+  // The evaluation is embedded as a JSON string inside the outer JSON response.
+  const payload = JSON.stringify({
+    text: JSON.stringify(buildEvaluation(promptText)),
+    token_usage: { input: 0, output: 0 },
+    cost_usd: 0,
+    duration_ms: 1,
+  });
+
+  await writeFile(outputFile, `${payload}\n`, 'utf8');
+}
+
+// Run the CLI only when this file is the program entry point; on failure,
+// print the error message and exit with a non-zero status.
+if (import.meta.main) {
+  main().catch((failure) => {
+    const message = failure instanceof Error ? failure.message : String(failure);
+    console.error(message);
+    process.exit(1);
+  });
+}