Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CONCEPTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Shared domain vocabulary for this project — entities, named processes, and sta

**Experiment** — A committed run variant that selects how evals are executed: target or target matrix, setup, scripts, eval filters, repeat counts, timeouts, workers, budgets, and related run knobs. Experiments make A/B setup differences explicit while pointing at stable eval tasks.

**Run manifest** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir` (previously named `artifact_dir`), `task_dir`, `summary_path`, and `grading_path`.

**Artifact sidecar** — A file beside or below a test-case artifact directory that provides evidence for a result, such as `summary.json`, `grading.json`, `result.json`, transcripts, logs, or outputs. Sidecars are evidence, not the primary discovery mechanism for a run.

Expand Down
4 changes: 2 additions & 2 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ export function buildIndexArtifactEntry(
result: EvaluationResult,
options: {
outputDir: string;
artifactDir?: string;
resultDir?: string;
gradingPath?: string;
timingPath?: string;
summaryPath?: string;
Expand All @@ -115,7 +115,7 @@ export function buildResultIndexArtifact(
result: EvaluationResult,
taskBundle?: MaterializedTaskBundlePaths,
): ResultIndexArtifact {
const artifactSubdir = (buildCoreResultIndexArtifact(result).artifact_dir ?? '').trim();
const artifactSubdir = (buildCoreResultIndexArtifact(result).result_dir ?? '').trim();
const extraIndexFields = taskBundle
? {
task_dir: path.posix.join(artifactSubdir, 'task'),
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/results/combine-run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ function resolveCombinedExperiment(
}

const MANIFEST_PATH_FIELDS = [
'artifact_dir',
'result_dir',
'summary_path',
'grading_path',
'timing_path',
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export interface ResultManifestRecord {
readonly artifact_pointers?: ResultArtifactPointersWire;
readonly external_trace?: ExternalTraceMetadataWire;
readonly response_path?: string;
readonly artifact_dir?: string;
readonly result_dir?: string;
readonly task_dir?: string;
readonly eval_path?: string;
readonly targets_path?: string;
Expand Down
4 changes: 2 additions & 2 deletions apps/cli/src/commands/results/projection-bundle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ export interface ProjectionBundleEntry {
export type ProjectionBundleArtifactRefs = Partial<
Pick<
IndexArtifactEntry,
| 'artifact_dir'
| 'result_dir'
| 'summary_path'
| 'grading_path'
| 'timing_path'
Expand Down Expand Up @@ -164,7 +164,7 @@ function artifactRefs(

return dropUndefined({
...metadataRefs,
artifact_dir: indexEntry.artifact_dir,
result_dir: indexEntry.result_dir,
summary_path: indexEntry.summary_path,
grading_path: indexEntry.grading_path,
input_path: indexEntry.input_path,
Expand Down
80 changes: 40 additions & 40 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -644,33 +644,33 @@ function normalizeArtifactRelativePath(relativePath: string): string | undefined
return segments.join('/');
}

/**
 * Reads the optional `result_dir` query parameter from the incoming request.
 *
 * @param c - Request context; only `c.req.query` is used here.
 * @returns `{}` when the parameter is missing or blank after trimming,
 *   `{ value }` with the normalized relative path when it is accepted, or
 *   `{ error: 'Invalid result_dir' }` when `normalizeArtifactRelativePath`
 *   returns a falsy result for it.
 */
function requestedResultDir(c: C): { value?: string; error?: string } {
  const raw = c.req.query('result_dir')?.trim();
  if (!raw) {
    // No filter requested: callers treat an absent value as "match any result directory".
    return {};
  }
  const normalized = normalizeArtifactRelativePath(raw);
  if (!normalized) {
    return { error: 'Invalid result_dir' };
  }
  return { value: normalized };
}

/**
 * Picks the first manifest record whose `test_id` equals `evalId`, optionally
 * narrowed to a single result directory.
 *
 * @param records - Parsed manifest rows, in manifest order.
 * @param evalId - Test id to look for.
 * @param resultDir - Already-normalized relative result directory. When omitted
 *   (or empty), the first record with a matching `test_id` wins.
 * @returns The matching record together with its position in `records`, or
 *   `undefined` when nothing matches.
 */
function manifestRecordSelection(
  records: readonly ResultManifestRecord[],
  evalId: string,
  resultDir?: string,
): { record: ResultManifestRecord; index: number } | undefined {
  return records
    .map((record, index) => ({ record, index }))
    .find(({ record }) => {
      if (record.test_id !== evalId) {
        return false;
      }
      if (!resultDir) {
        return true;
      }
      // Compare normalized forms so the manifest value and the requested value
      // go through the same normalization before the equality check.
      return normalizeArtifactRelativePath(record.result_dir ?? '') === resultDir;
    });
}

Expand Down Expand Up @@ -804,14 +804,14 @@ function addTrialRunCatalogEntries(
seen: Set<string>,
record: ResultManifestRecord,
): void {
const artifactDir = record.artifact_dir
? normalizeArtifactRelativePath(record.artifact_dir)
const resultDir = record.result_dir
? normalizeArtifactRelativePath(record.result_dir)
: undefined;
if (!artifactDir) return;
if (!resultDir) return;
for (const trial of record.trials ?? []) {
const runPath = trial.run_path ? normalizeArtifactRelativePath(trial.run_path) : undefined;
if (!runPath) continue;
const runDir = path.posix.join(artifactDir, runPath);
const runDir = path.posix.join(resultDir, runPath);
addDirectArtifactCatalogEntry(
entries,
seen,
Expand Down Expand Up @@ -897,9 +897,9 @@ function artifactTreeCommonDir(
const knownPaths = resultArtifactTreeRootPaths(record, catalog);
if (knownPaths.length === 0) return undefined;

const artifactDirs = knownPaths.map((p) => path.dirname(p));
let commonDir = artifactDirs[0];
for (const dir of artifactDirs) {
const resultDirs = knownPaths.map((p) => path.dirname(p));
let commonDir = resultDirs[0];
for (const dir of resultDirs) {
while (!dir.startsWith(commonDir)) {
const parent = path.dirname(commonDir);
if (parent === commonDir) break;
Expand Down Expand Up @@ -1135,31 +1135,31 @@ function objectField(
}

/**
 * Builds the run-bundle-relative path of one file inside a trial's run directory.
 *
 * @param resultDir - Relative result directory of the test case.
 * @param runPath - Trial run directory, relative to `resultDir`.
 * @param filePath - File name or sub-path inside the trial run directory.
 * @returns `resultDir/runPath/filePath` joined with POSIX separators, or
 *   `undefined` when either `resultDir` or `runPath` is missing or empty.
 */
function caseTrialArtifactPath(
  resultDir: string | undefined,
  runPath: string | undefined,
  filePath: string,
): string | undefined {
  if (!resultDir || !runPath) return undefined;
  // POSIX join keeps forward slashes regardless of the host platform.
  return path.posix.join(resultDir, runPath, filePath);
}

function buildRepeatTrialReadModels(
baseDir: string,
record: ResultManifestRecord,
): Array<Record<string, unknown>> | undefined {
if (!record.trials || record.trials.length === 0) return undefined;
const artifactDir = record.artifact_dir
? normalizeArtifactRelativePath(record.artifact_dir)
const resultDir = record.result_dir
? normalizeArtifactRelativePath(record.result_dir)
: undefined;

return record.trials.map((trial) => {
const runPath = trial.run_path ? normalizeArtifactRelativePath(trial.run_path) : undefined;
const metricsPath = caseTrialArtifactPath(artifactDir, runPath, 'metrics.json');
const timingPath = caseTrialArtifactPath(artifactDir, runPath, 'timing.json');
const gradingPath = caseTrialArtifactPath(artifactDir, runPath, 'grading.json');
const transcriptPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript.jsonl');
const transcriptRawPath = caseTrialArtifactPath(artifactDir, runPath, 'transcript-raw.jsonl');
const answerPath = caseTrialArtifactPath(artifactDir, runPath, 'outputs/answer.md');
const metricsPath = caseTrialArtifactPath(resultDir, runPath, 'metrics.json');
const timingPath = caseTrialArtifactPath(resultDir, runPath, 'timing.json');
const gradingPath = caseTrialArtifactPath(resultDir, runPath, 'grading.json');
const transcriptPath = caseTrialArtifactPath(resultDir, runPath, 'transcript.jsonl');
const transcriptRawPath = caseTrialArtifactPath(resultDir, runPath, 'transcript-raw.jsonl');
const answerPath = caseTrialArtifactPath(resultDir, runPath, 'outputs/answer.md');
const metrics = readArtifactJsonObject(baseDir, metricsPath);
const timing = readArtifactJsonObject(baseDir, timingPath);
const toolCalls = objectField(metrics, 'tool_calls');
Expand Down Expand Up @@ -1203,7 +1203,7 @@ function attachRunDetailReadModelFields<T extends Record<string, unknown>>(
return {
...result,
...(record.aggregation && { aggregation: record.aggregation }),
...(record.artifact_dir && { artifact_dir: record.artifact_dir }),
...(record.result_dir && { result_dir: record.result_dir }),
...(record.summary_path && { summary_path: record.summary_path }),
...(record.grading_path && { grading_path: record.grading_path }),
...(record.timing_path && { timing_path: record.timing_path }),
Expand Down Expand Up @@ -1842,14 +1842,14 @@ async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) {
const filename = c.req.param('filename') ?? '';
const evalId = c.req.param('evalId') ?? '';
if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
const artifactDir = requestedArtifactDir(c);
if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
const resultDir = requestedResultDir(c);
if (resultDir.error) return c.json({ error: resultDir.error }, 400);
const meta = await findRunById(searchDir, filename, projectId);
if (!meta) return c.json({ error: 'Run not found' }, 404);
try {
const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId);
const records = await parseManifestForMeta(searchDir, meta, projectId);
const selection = manifestRecordSelection(records, evalId, artifactDir.value);
const selection = manifestRecordSelection(records, evalId, resultDir.value);
const result = selection ? loaded[selection.index] : undefined;
if (!selection || !result) return c.json({ error: 'Eval not found' }, 404);
const baseDir = path.dirname(meta.path);
Expand All @@ -1868,13 +1868,13 @@ async function handleEvalFiles(c: C, { searchDir, projectId }: DataContext) {
const filename = c.req.param('filename') ?? '';
const evalId = c.req.param('evalId') ?? '';
if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
const artifactDir = requestedArtifactDir(c);
if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
const resultDir = requestedResultDir(c);
if (resultDir.error) return c.json({ error: resultDir.error }, 400);
const meta = await findRunById(searchDir, filename, projectId);
if (!meta) return c.json({ error: 'Run not found' }, 404);
try {
const records = await parseManifestForMeta(searchDir, meta, projectId);
const selection = manifestRecordSelection(records, evalId, artifactDir.value);
const selection = manifestRecordSelection(records, evalId, resultDir.value);
if (!selection) return c.json({ error: 'Eval not found' }, 404);
const { record } = selection;

Expand All @@ -1895,8 +1895,8 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext
const filename = c.req.param('filename') ?? '';
const evalId = c.req.param('evalId') ?? '';
if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
const artifactDir = requestedArtifactDir(c);
if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
const resultDir = requestedResultDir(c);
if (resultDir.error) return c.json({ error: resultDir.error }, 400);
const meta = await findRunById(searchDir, filename, projectId);
if (!meta) return c.json({ error: 'Run not found' }, 404);

Expand All @@ -1915,7 +1915,7 @@ async function handleEvalFileContent(c: C, { searchDir, projectId }: DataContext

await ensureRunReadable(searchDir, meta, projectId);
const records = parseResultManifest(readFileSync(meta.path, 'utf8'));
const selection = manifestRecordSelection(records, evalId, artifactDir.value);
const selection = manifestRecordSelection(records, evalId, resultDir.value);
if (!selection) return c.json({ error: 'Eval not found' }, 404);
const { record } = selection;
const catalog = buildResultArtifactCatalog(record, {
Expand All @@ -1942,14 +1942,14 @@ async function handleEvalTraceSession(c: C, { searchDir, projectId }: DataContex
const filename = c.req.param('filename') ?? '';
const evalId = c.req.param('evalId') ?? '';
if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
const artifactDir = requestedArtifactDir(c);
if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
const resultDir = requestedResultDir(c);
if (resultDir.error) return c.json({ error: resultDir.error }, 400);
const meta = await findRunById(searchDir, filename, projectId);
if (!meta) return c.json({ error: 'Run not found' }, 404);

try {
const records = await parseManifestForMeta(searchDir, meta, projectId);
const selection = manifestRecordSelection(records, evalId, artifactDir.value);
const selection = manifestRecordSelection(records, evalId, resultDir.value);
if (!selection) return c.json({ error: 'Eval not found' }, 404);
const { record } = selection;

Expand Down Expand Up @@ -2074,14 +2074,14 @@ async function handleEvalTranscript(c: C, { searchDir, projectId }: DataContext)
const filename = c.req.param('filename') ?? '';
const evalId = c.req.param('evalId') ?? '';
if (!evalId) return c.json({ error: 'Eval id is required' }, 400);
const artifactDir = requestedArtifactDir(c);
if (artifactDir.error) return c.json({ error: artifactDir.error }, 400);
const resultDir = requestedResultDir(c);
if (resultDir.error) return c.json({ error: resultDir.error }, 400);
const meta = await findRunById(searchDir, filename, projectId);
if (!meta) return c.json({ error: 'Run not found' }, 404);

try {
const records = await parseManifestForMeta(searchDir, meta, projectId);
const selection = manifestRecordSelection(records, evalId, artifactDir.value);
const selection = manifestRecordSelection(records, evalId, resultDir.value);
if (!selection) return c.json({ error: 'Eval not found' }, 404);
const { record } = selection;

Expand Down
10 changes: 5 additions & 5 deletions apps/cli/src/commands/results/validate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ interface IndexEntry {
readonly summary_path?: string;
readonly grading_path?: string;
readonly timing_path?: string;
readonly artifact_dir?: string;
readonly result_dir?: string;
readonly trials?: readonly { readonly run_path?: string }[];
readonly [key: string]: unknown;
}
Expand Down Expand Up @@ -237,22 +237,22 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[]
}

for (const trial of entry.trials ?? []) {
if (!entry.artifact_dir || !trial.run_path) {
if (!entry.result_dir || !trial.run_path) {
continue;
}
const runDirPath = path.join(runDir, entry.artifact_dir, trial.run_path);
const runDirPath = path.join(runDir, entry.result_dir, trial.run_path);
const resultPath = path.join(runDirPath, 'result.json');
const gradingPath = path.join(runDirPath, 'grading.json');
if (!existsSync(resultPath)) {
diagnostics.push({
severity: 'error',
message: `${testId}: result.json not found at '${path.posix.join(entry.artifact_dir, trial.run_path, 'result.json')}'`,
message: `${testId}: result.json not found at '${path.posix.join(entry.result_dir, trial.run_path, 'result.json')}'`,
});
}
if (!existsSync(gradingPath)) {
diagnostics.push({
severity: 'error',
message: `${testId}: grading.json not found at '${path.posix.join(entry.artifact_dir, trial.run_path, 'grading.json')}'`,
message: `${testId}: grading.json not found at '${path.posix.join(entry.result_dir, trial.run_path, 'grading.json')}'`,
});
}
}
Expand Down
17 changes: 7 additions & 10 deletions apps/cli/src/commands/runs/rerun.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ interface SelectedTaskBundle {
readonly record: ResultManifestRecord;
readonly testId: string;
readonly sourceTarget: string;
readonly artifactDir: string;
readonly resultDir: string;
readonly taskDir: string;
readonly evalPath: string;
readonly targetsPath: string;
Expand Down Expand Up @@ -254,10 +254,7 @@ function forbiddenOutputRoots(
): readonly string[] {
return [
path.resolve(sourceRunDir),
...selected.flatMap((bundle) => [
path.resolve(bundle.artifactDir),
path.resolve(bundle.taskDir),
]),
...selected.flatMap((bundle) => [path.resolve(bundle.resultDir), path.resolve(bundle.taskDir)]),
];
}

Expand Down Expand Up @@ -340,11 +337,11 @@ async function loadSelectedTaskBundles(options: {
const taskDir =
resolveRelativeRunPath(options.sourceRunDir, record.task_dir) ??
(evalPath ? path.dirname(evalPath) : undefined);
const artifactDir =
resolveRelativeRunPath(options.sourceRunDir, record.artifact_dir) ??
const resultDir =
resolveRelativeRunPath(options.sourceRunDir, record.result_dir) ??
(taskDir ? path.dirname(taskDir) : undefined);

if (!evalPath || !targetsPath || !taskDir || !artifactDir) {
if (!evalPath || !targetsPath || !taskDir || !resultDir) {
throw new Error(
`Selected result ${recordLabel} is missing task bundle paths. Re-run requires task/EVAL.yaml and task/targets.yaml.`,
);
Expand All @@ -357,7 +354,7 @@ async function loadSelectedTaskBundles(options: {
record,
testId,
sourceTarget,
artifactDir,
resultDir,
taskDir,
evalPath,
targetsPath,
Expand Down Expand Up @@ -386,7 +383,7 @@ function buildSourceMetadataByEvalFile(
mode: 'rerun',
sourceRunDir: path.resolve(sourceRunDir),
sourceIndexPath: path.resolve(indexPath),
sourceArtifactDir: path.resolve(bundle.artifactDir),
sourceResultDir: path.resolve(bundle.resultDir),
sourceTaskDir: path.resolve(bundle.taskDir),
sourceTestId: bundle.testId,
sourceTarget: bundle.sourceTarget,
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,7 @@ describe('writeArtifactsFromResults', () => {
ci95_upper: 1,
stddev: 0.53,
});
expect(indexEntry?.artifact_dir).toBe('repeat-case');
expect(indexEntry?.result_dir).toBe('repeat-case');
expect(indexEntry?.summary_path).toBe('repeat-case/summary.json');
expect(indexEntry?.task_dir).toBeUndefined();
expect(indexEntry?.input_path).toBeUndefined();
Expand Down Expand Up @@ -1808,7 +1808,7 @@ describe('writeArtifactsFromResults', () => {
.trim()
.split('\n')
.map(JSON.parse);
expect(indexLine.artifact_dir).toBe('imported-suite/shared-id');
expect(indexLine.result_dir).toBe('imported-suite/shared-id');
expect(indexLine.grading_path).toBe('imported-suite/shared-id/run-1/grading.json');
});

Expand Down Expand Up @@ -1937,7 +1937,7 @@ describe('writeArtifactsFromResults', () => {
const indexLine = JSON.parse((await readFile(paths.indexPath, 'utf8')).trim());

expect(indexLine).toMatchObject({
artifact_dir: 'trace-case',
result_dir: 'trace-case',
task_dir: 'trace-case/task',
eval_path: 'trace-case/task/EVAL.yaml',
targets_path: 'trace-case/task/targets.yaml',
Expand Down
Loading
Loading