Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,16 @@ jobs:
GH_MODELS_TOKEN: ${{ secrets.GH_MODELS_TOKEN }}
CONTRACT_EVAL_MODEL: ${{ vars.CONTRACT_EVAL_MODEL || 'openai/gpt-4.1-mini' }}

- name: Build and run example oracle eval gate
if: needs.prepare.outputs.publish_action == 'publish-next'
run: |
bun run build
bun run examples:oracle:gate -- --output-dir "${{ runner.temp }}/example-oracle-publish-next"

- name: Publish to npm
run: |
if [ "${{ needs.prepare.outputs.publish_action }}" = "publish-next" ]; then
bun run publish:next
bun scripts/publish.ts next
else
bun run publish
fi
Expand Down
16 changes: 16 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,22 @@ cd examples/features/execution-metrics
bun install
```

## Oracle Fixture Sweep

Run the deterministic oracle sweep when you need to prove that agent- or LLM-backed example evals still parse, execute, write artifacts, and avoid live LLM calls:

```bash
bun run examples:oracle
```

The command discovers eval files under `examples/` and reads `examples/oracle-fixtures.yaml` for explicit exclusions. Oracle-capable targets are classified as already covered. For evals that otherwise require an agent or LLM target, the command generates replay target fixtures under `.agentv/tmp/example-oracle-fixtures/` and runs those evals with an oracle replay target plus an oracle CLI grader target. These fixtures are a contract oracle for example execution, not captured live-model golden transcripts; live provider dogfood remains a separate release-gate workflow. To inspect the inventory without running evals:

```bash
bun run examples:oracle -- --inventory
```

Use `--eval <path>` to run or inventory a single eval file.

## Directory Structure

Examples are organized into two categories:
Expand Down

This file was deleted.

21 changes: 21 additions & 0 deletions examples/oracle-fixtures.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Manifest for the deterministic example oracle sweep; see `description` for
# how the runner uses it.
schema_version: agentv.example_oracle_manifest.v1
description: >
Maintained inventory controls for the deterministic example oracle sweep.
The runner discovers example evals dynamically, generates replay target
fixtures from expected outputs or assertion-derived references only for
agent or LLM-backed targets, treats oracle-capable targets as their own
oracle, and skips only the exclusions listed here. These fixtures are
contract oracles for example execution, not captured live-model golden
transcripts.
# Target names used by the sweep. NOTE(review): presumably the replay target,
# the target it is derived from, and the CLI grader target respectively;
# confirm against the sweep runner script.
target_name: example_oracle
source_target: example_oracle_source
grader_target: example_oracle_grader
# Evals the sweep skips. Each entry gives the eval file path and the reason it
# cannot run under the oracle today.
exclusions:
- path: examples/features/docker-workspace/evals/docker-example.EVAL.yaml
reason: Requires Docker workspace runtime; replaying target output does not remove the Docker setup dependency.
- path: examples/features/copilot-log-eval/evals/skill-trigger.EVAL.yaml
reason: Requires local Copilot session/workspace artifacts and .github/skills/agentv-bench/SKILL.md during before_all setup.
- path: examples/features/prompt-template-sdk/evals/dataset.eval.yaml
reason: Loader currently resolves zero runnable tests because the eval references a missing ../prompts/custom-grader.ts command file.
- path: examples/showcase/bug-fix-benchmark/evals/bug-fixes.eval.yaml
reason: Eval metadata is invalid today; experiment targets[0].use_target is empty, so the suite cannot be loaded.
6 changes: 3 additions & 3 deletions examples/showcase/cross-repo-sync/scripts/run-ts.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/usr/bin/env bash
# Cross-runtime TypeScript runner.
# Bun's node shim runs .ts natively; real Node.js needs tsx.
# Bun runs .ts files natively; real Node.js needs tsx.
SCRIPT="$1"
shift
if node -e "process.exit(typeof Bun === 'undefined' ? 1 : 0)" 2>/dev/null; then
exec node "$SCRIPT" "$@"
if command -v bun >/dev/null 2>&1; then
exec bun "$SCRIPT" "$@"
else
exec node --import tsx "$SCRIPT" "$@"
fi
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
"beads:check": "bun scripts/check-beads-context.ts",
"debug:pi-sdk-tools": "bun scripts/debug-pi-sdk-tools.ts",
"validate:examples": "EVAL_CRITERIA=placeholder CUSTOM_SYSTEM_PROMPT=placeholder bun scripts/validate-example-evals.ts",
"examples:oracle": "bun scripts/run-example-oracle-fixtures.ts",
"examples:oracle:gate": "bun scripts/run-example-oracle-gate.ts",
"eval:baseline-check": "bun scripts/check-eval-baselines.ts",
"release": "bun scripts/release.ts",
"release:next": "bun scripts/release.ts next",
Expand Down
106 changes: 106 additions & 0 deletions scripts/example-oracle-grader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env bun
import { readFile, writeFile } from 'node:fs/promises';
import { parseArgs } from 'node:util';

/**
 * One per-criterion verdict in the boolean (satisfied / not satisfied) rubric
 * response emitted by the oracle grader.
 */
type RubricCheck = Readonly<{
  /** Criterion id as it appeared in the grader prompt. */
  id: string;
  /** Whether the criterion is considered met. */
  satisfied: boolean;
  /** Free-text justification accompanying the verdict. */
  reasoning: string;
}>;

/**
 * Removes duplicate strings from `values`.
 *
 * @param values - Strings that may contain repeats; the input is not mutated.
 * @returns A new array with each distinct string once, in first-seen order.
 */
function unique(values: readonly string[]): string[] {
  const distinct = new Set(values);
  return Array.from(distinct);
}

/**
 * Collects the rubric criterion ids mentioned in a grader prompt.
 *
 * Two spellings are recognised:
 * - bullet lines that start with `- [id]`
 * - JSON-style `"id": "value"` pairs, ignoring the schema placeholder
 *   `string (criterion id)`
 *
 * @param prompt - Full text of the grader prompt.
 * @returns Distinct ids, bullet-style matches first, then JSON-style matches.
 */
function extractRubricIds(prompt: string): string[] {
  // Gather the first capture group of every match, skipping empty captures.
  const capture = (pattern: RegExp): string[] =>
    Array.from(prompt.matchAll(pattern)).flatMap((hit) => (hit[1] ? [hit[1]] : []));

  const bulletIds = capture(/^- \[([^\]]+)\]/gm);
  const jsonIds = capture(/"id"\s*:\s*"([^"]+)"/g).filter(
    (id) => id !== 'string (criterion id)',
  );

  return unique([...bulletIds, ...jsonIds]);
}

/**
 * Builds the canned, always-passing grader verdict for a prompt.
 *
 * The response shape is chosen from the prompt text:
 * - no rubric ids and no `"checks"` mention: a single passing assertion with
 *   `score: 1`
 * - rubric prompt mentioning `"score": integer` or `score ranges`: one check
 *   per id with `score: 10`
 * - any other rubric prompt: one check per id with `satisfied: true`
 *
 * @param prompt - Full text of the grader prompt.
 * @returns A plain object ready to be JSON-serialised as the grader output.
 */
function buildEvaluation(prompt: string): unknown {
  const criterionIds = extractRubricIds(prompt);
  const isRubricPrompt = criterionIds.length > 0 || prompt.includes('"checks"');

  if (!isRubricPrompt) {
    return {
      score: 1,
      assertions: [
        {
          text: 'Reference fixture was accepted by the deterministic oracle grader',
          passed: true,
          evidence:
            'The oracle workflow validates execution and artifact compatibility without live LLM calls.',
        },
      ],
      details: {
        oracle_grader: true,
      },
    };
  }

  const reasoning =
    'Deterministic oracle grader marks the reference fixture as satisfying this criterion.';
  const overallReasoning = 'Deterministic oracle grader response.';

  const wantsNumericScores =
    prompt.includes('"score": integer') || prompt.includes('score ranges');
  if (wantsNumericScores) {
    const scored = criterionIds.map((id) => ({ id, score: 10, reasoning }));
    return { checks: scored, overall_reasoning: overallReasoning };
  }

  const satisfied: RubricCheck[] = criterionIds.map((id) => ({
    id,
    satisfied: true,
    reasoning,
  }));
  return { checks: satisfied, overall_reasoning: overallReasoning };
}

/**
 * CLI entry point: reads the grader prompt named by `--prompt-file`, builds
 * the deterministic evaluation for it, and writes a one-line JSON response to
 * the path named by `--output`.
 *
 * The response wraps the evaluation as a JSON string in `text`, alongside
 * zeroed token usage and cost and a fixed `duration_ms` of 1.
 *
 * @throws Error with a usage message when either flag is missing.
 */
async function main(): Promise<void> {
  const parsed = parseArgs({
    options: {
      'prompt-file': { type: 'string' },
      output: { type: 'string' },
    },
  });

  const { 'prompt-file': promptFile, output: outputFile } = parsed.values;
  if (!(promptFile && outputFile)) {
    throw new Error('Usage: example-oracle-grader.ts --prompt-file <path> --output <path>');
  }

  const promptText = await readFile(promptFile, 'utf8');
  const payload = JSON.stringify({
    text: JSON.stringify(buildEvaluation(promptText)),
    token_usage: { input: 0, output: 0 },
    cost_usd: 0,
    duration_ms: 1,
  });

  // Trailing newline keeps the output a single newline-terminated JSON line.
  await writeFile(outputFile, `${payload}\n`, 'utf8');
}

// Run the CLI only when this module is the program entry point (Bun sets
// `import.meta.main` for the directly executed file). Any failure is reported
// on stderr as a bare message and the process exits with status 1.
if (import.meta.main) {
  main().catch((error: unknown) => {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exit(1);
  });
}
Loading
Loading