diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 6d230755c..32efbaad5 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -19,6 +19,8 @@ import { buildAggregateGradingArtifact, buildIndexArtifactEntry as buildCoreIndexArtifactEntry, buildResultIndexArtifact as buildCoreResultIndexArtifact, + buildEvalTestTargetKey, + buildEvaluationResultTargetKey, buildGradingArtifact, buildRunSummaryArtifact, buildTestTargetKey, @@ -41,6 +43,8 @@ import { export { aggregateRunDir, buildAggregateGradingArtifact, + buildEvalTestTargetKey, + buildEvaluationResultTargetKey, buildRunSummaryArtifact, buildGradingArtifact, buildTestTargetKey, diff --git a/apps/cli/src/commands/eval/progress-display.ts b/apps/cli/src/commands/eval/progress-display.ts index f5db6a1b2..58805e0bd 100644 --- a/apps/cli/src/commands/eval/progress-display.ts +++ b/apps/cli/src/commands/eval/progress-display.ts @@ -65,8 +65,6 @@ export class ProgressDisplay { private readonly workers: Map = new Map(); private totalTests = 0; private completedTests = 0; - private readonly logPaths: string[] = []; - private readonly logPathSet = new Set(); private started = false; private finished = false; private readonly verbose: boolean; @@ -133,24 +131,7 @@ export class ProgressDisplay { } addLogPaths(paths: readonly string[]): void { - const newPaths: string[] = []; - for (const path of paths) { - if (this.logPathSet.has(path)) { - continue; - } - this.logPathSet.add(path); - newPaths.push(path); - } - - if (newPaths.length === 0) { - return; - } - - this.logPaths.push(...newPaths); - - for (const p of newPaths) { - console.log(`Provider log: ${p}`); - } + void paths; } finish(): void { diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts index dc02ec6d6..47e8d1fb6 100644 --- a/apps/cli/src/commands/eval/result-layout.ts +++ b/apps/cli/src/commands/eval/result-layout.ts @@ -1,4 +1,4 @@ -import { existsSync, statSync } from 'node:fs'; +import { type Dirent, existsSync, readdirSync, statSync } from 'node:fs'; import path from 'node:path'; export const RESULT_INDEX_FILENAME = 'index.jsonl'; @@ -76,6 +76,37 @@ export function resolveExistingRunPrimaryPath(runDir: string): string | undefine return undefined; } +export function discoverRunManifestPaths(runDir: string): readonly string[] { + const direct = resolveExistingRunPrimaryPath(runDir); + if (direct) { + return [direct]; + } + + const manifests: string[] = []; + function walk(currentDir: string): void { + const primary = resolveExistingRunPrimaryPath(currentDir); + if (primary) { + manifests.push(primary); + return; + } + + let entries: Dirent[]; + try { + entries = readdirSync(currentDir, { withFileTypes: true }); + } catch { + return; + } + for (const entry of entries) { + if (entry.isDirectory()) { + walk(path.join(currentDir, entry.name)); + } + } + } + + walk(runDir); + return manifests.sort(); +} + export function isDirectoryPath(filePath: string): boolean { try { return statSync(filePath).isDirectory(); @@ -90,11 +121,20 @@ export function resolveWorkspaceOrFilePath(filePath: string): string { } const existing = resolveExistingRunPrimaryPath(filePath); - if (!existing) { - throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`); + if (existing) { + return existing; } - return existing; + const nested = discoverRunManifestPaths(filePath); + if (nested.length === 1) { + return nested[0]; + } + if (nested.length > 1) { + throw new Error( + `Result workspace contains multiple ${RESULT_INDEX_FILENAME} manifests; pass one bundle directory or manifest: ${filePath}`, + ); + } + throw new Error(`Result workspace is missing ${RESULT_INDEX_FILENAME}: ${filePath}`); } export function resolveRunManifestPath(filePath: string): string { diff --git a/apps/cli/src/commands/eval/run-cache.ts b/apps/cli/src/commands/eval/run-cache.ts index d30c75536..14969e6be 100644 --- a/apps/cli/src/commands/eval/run-cache.ts +++ b/apps/cli/src/commands/eval/run-cache.ts @@ -4,6 +4,7 @@ import path from 'node:path'; import { RESULT_INDEX_FILENAME, + discoverRunManifestPaths, resolveExistingRunPrimaryPath, resolveRunIndexPath, } from './result-layout.js'; @@ -27,7 +28,11 @@ export interface RunCache { */ export function resolveRunCacheFile(cache: RunCache): string { if (cache.lastRunDir) { - return resolveExistingRunPrimaryPath(cache.lastRunDir) ?? resolveRunIndexPath(cache.lastRunDir); + const direct = resolveExistingRunPrimaryPath(cache.lastRunDir); + if (direct) { + return direct; + } + return discoverRunManifestPaths(cache.lastRunDir)[0] ?? resolveRunIndexPath(cache.lastRunDir); } return ''; } @@ -61,14 +66,12 @@ export async function resolveCachedRunDir(cwd: string): Promise { - if (path.basename(resultPath) !== RESULT_INDEX_FILENAME) { - return; - } - const dir = path.join(cwd, '.agentv'); + const lastRunDir = + path.basename(resultPath) === RESULT_INDEX_FILENAME ? path.dirname(resultPath) : resultPath; await mkdir(dir, { recursive: true }); const cache: RunCache = { - lastRunDir: path.dirname(resultPath), + lastRunDir, timestamp: new Date().toISOString(), }; await writeFile(cachePath(cwd), `${JSON.stringify(cache, null, 2)}\n`, 'utf-8'); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 919ccbb36..7490a6cd4 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -52,7 +52,8 @@ import { } from '../results/remote.js'; import { aggregateRunDir, - buildTestTargetKey, + buildEvalTestTargetKey, + buildEvaluationResultTargetKey, deduplicateByTestIdTarget, parseJsonlResults, writeArtifactsFromResults, @@ -65,6 +66,7 @@ import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-d import { buildDefaultRunDirFromName, createRunDirName, + discoverRunManifestPaths, normalizeExperimentName, } from './result-layout.js'; import { @@ -551,16 +553,6 @@ async function ensureFileExists(filePath: string, description: string): Promise< } } -function buildDefaultOutputPathForExperiment( - cwd: string, - resultGroup: string | undefined, - runDirName: string, -): string { - const runDir = buildDefaultRunDirFromName(cwd, resultGroup, runDirName); - mkdirSync(runDir, { recursive: true }); - return path.join(runDir, 'index.jsonl'); -} - function deriveEvalResultGroupName(evalFilePath: string | undefined): string { if (!evalFilePath) { return 'eval'; @@ -1013,6 +1005,94 @@ function applyVerboseOverride(selection: TargetSelection, cliVerbose: boolean): }; } +function safeRunPathSegment(value: string | undefined, fallback: string): string { + const trimmed = value?.trim(); + if (!trimmed) { + return fallback; + } + const segment = trimmed.replace(/[/\\:*?"<>|]/g, '_'); + return !segment || segment === '.' || segment === '..' ? fallback : segment; +} + +function targetVariantForSelection(selection: TargetSelection): string | undefined { + const target = selection.resolvedTarget; + if (target.kind === 'replay') { + return target.config.variant; + } + return undefined; +} + +function resultBundleKey(result: Pick): string { + return JSON.stringify({ + target: result.target ?? 'unknown', + variant: result.variant ?? null, + }); +} + +function resultBundleDir( + invocationDir: string, + result: Pick, +): string { + const targetDir = safeRunPathSegment(result.target, 'unknown-target'); + const variantDir = result.variant ? safeRunPathSegment(result.variant, 'variant') : undefined; + return variantDir + ? path.join(invocationDir, targetDir, variantDir) + : path.join(invocationDir, targetDir); +} + +class BundleOutputWriter implements OutputWriter { + private readonly writers = new Map< + string, + { readonly dir: string; readonly indexPath: string; readonly writer: OutputWriter } + >(); + + constructor( + private readonly invocationDir: string, + private readonly appendMode: boolean, + ) {} + + async append(result: EvaluationResult): Promise { + const writer = await this.writerForResult(result); + await writer.append(result); + } + + async close(): Promise { + await Promise.all([...this.writers.values()].map((entry) => entry.writer.close())); + } + + bundleDirs(): readonly string[] { + return [...this.writers.values()].map((entry) => entry.dir); + } + + bundleIndexPaths(): readonly string[] { + return [...this.writers.values()].map((entry) => entry.indexPath); + } + + private async writerForResult(result: EvaluationResult): Promise { + const key = resultBundleKey(result); + const existing = this.writers.get(key); + if (existing) { + return existing.writer; + } + const dir = resultBundleDir(this.invocationDir, result); + mkdirSync(dir, { recursive: true }); + const indexPath = path.join(dir, 'index.jsonl'); + const writer = await createOutputWriter(indexPath, { append: this.appendMode }); + this.writers.set(key, { dir, indexPath, writer }); + return writer; + } +} + +async function readExistingResultsFromRunDir(runDir: string): Promise { + const manifests = discoverRunManifestPaths(runDir); + const results: EvaluationResult[] = []; + for (const manifest of manifests) { + const content = await readFile(manifest, 'utf8'); + results.push(...parseJsonlResults(content)); + } + return results; +} + async function prepareFileMetadata(params: { readonly testFilePath: string; readonly repoRoot: string; @@ -1317,6 +1397,7 @@ async function runSingleEvalFile(params: { // CLI provider verbose logging should only be enabled when --verbose flag is passed const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose); + const explicitVariant = targetVariantForSelection(resolvedTargetSelection); const providerLabel = resolvedTargetSelection.resolvedTarget.kind; const targetMessage = options.verbose ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} ${buildTargetLabelSuffix(providerLabel, resolvedTargetSelection.resolvedTarget)} via ${resolvedTargetSelection.targetsFilePath}` @@ -1428,7 +1509,9 @@ async function runSingleEvalFile(params: { // Trim output messages for results JSONL based on --output-messages. // Each message is trimmed to { role, content } only (no toolCalls, startTime, etc.). // Full output with tool calls goes to OTel. - const resultWithMetadata = withSourceMetadata(result, testFilePath, options); + const resultWithVariant = + explicitVariant && !result.variant ? { ...result, variant: explicitVariant } : result; + const resultWithMetadata = withSourceMetadata(resultWithVariant, testFilePath, options); const trimmedResult = prepareResultForJsonl(resultWithMetadata, options); await outputWriter.append(trimmedResult); @@ -1482,7 +1565,15 @@ async function runSingleEvalFile(params: { }, }); - return { results: results.map((result) => withSourceMetadata(result, testFilePath, options)) }; + return { + results: results.map((result) => + withSourceMetadata( + explicitVariant && !result.variant ? { ...result, variant: explicitVariant } : result, + testFilePath, + options, + ), + ), + }; } export interface RunEvalResult { @@ -1647,14 +1738,14 @@ export async function runEvalCommand( if (options.resume && !options.retryErrors) { const explicitResumeDir = options.outputDir; if (explicitResumeDir) { - const resumeIndexPath = path.join(path.resolve(explicitResumeDir), 'index.jsonl'); - if (existsSync(resumeIndexPath)) { - const content = await readFile(resumeIndexPath, 'utf8'); - const existingResults = parseJsonlResults(content); + const resumeDir = path.resolve(explicitResumeDir); + const resumeIndexPaths = discoverRunManifestPaths(resumeDir); + if (resumeIndexPaths.length > 0) { + const existingResults = await readExistingResultsFromRunDir(resumeDir); resumeSkipKeys = new Set(); for (const r of existingResults) { if (shouldSkipExistingResultForResume(r, options.rerunFailed)) { - resumeSkipKeys.add(buildTestTargetKey(r.testId, r.target)); + resumeSkipKeys.add(buildEvaluationResultTargetKey(r)); } } isResumeAppend = true; @@ -1663,8 +1754,8 @@ export async function runEvalCommand( `${modeLabel}: found ${existingResults.length} existing result(s), skipping ${resumeSkipKeys.size} completed.`, ); } else { - // No existing index.jsonl — behave like a normal run - console.log('Resume: no existing index.jsonl found, starting fresh run.'); + // No existing bundle index.jsonl — behave like a normal run + console.log('Resume: no existing bundle index.jsonl found, starting fresh run.'); } } else { console.warn( @@ -1695,7 +1786,8 @@ export async function runEvalCommand( console.log(`Repository root: ${repoRoot}`); } - // Resolve artifact directory (runDir) and primary output path. + // Resolve artifact directory. The CLI run dir is an invocation root; each + // target/variant writes its own bundle index below it. // Precedence: --output > config output.dir > default const explicitDir = options.outputDir; let runDir: string; @@ -1705,11 +1797,12 @@ export async function runEvalCommand( if (explicitDir) { runDir = path.resolve(explicitDir); mkdirSync(runDir, { recursive: true }); - outputPath = path.join(runDir, 'index.jsonl'); + outputPath = runDir; } else { // Default: .agentv/results///. - outputPath = buildDefaultOutputPathForExperiment(cwd, resultGroupName, runDirName); - runDir = path.dirname(outputPath); + runDir = buildDefaultRunDirFromName(cwd, resultGroupName, runDirName); + mkdirSync(runDir, { recursive: true }); + outputPath = runDir; } if (!process.env.AGENTV_RUN_TIMESTAMP) { process.env.AGENTV_RUN_TIMESTAMP = path.basename(runDir); @@ -1782,8 +1875,6 @@ export async function runEvalCommand( } } - const primaryWritePath = outputPath; - console.log(`Artifact directory: ${runDir}`); // Log file export paths @@ -1896,10 +1987,9 @@ export async function runEvalCommand( throw new Error('--threshold must be between 0 and 1'); } - // Build the output writer. Primary output is always JSONL to the artifact directory. - const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, { - append: isResumeAppend, - }); + // Build the output writer. Each target/variant gets a separate bundle index + // below the invocation directory. + const outputWriter = new BundleOutputWriter(runDir, isResumeAppend); // Detect matrix mode: multiple targets for any file const isMatrixMode = Array.from(fileMetadata.values()).some((meta) => meta.selections.length > 1); @@ -1908,16 +1998,27 @@ export async function runEvalCommand( // When resuming, subtract tests that will be skipped let totalEvalCount = 0; let resumeSkippedCount = 0; + const plannedBundleCounts = new Map< + string, + { readonly target: string; readonly variant?: string; count: number } + >(); for (const meta of fileMetadata.values()) { - const suiteTargetNames = meta.selections.map((s) => s.selection.targetName); for (const test of meta.testCases) { - const effectiveTargets = suiteTargetNames.length > 0 ? suiteTargetNames : ['unknown']; - for (const tn of effectiveTargets) { - const key = `${test.id}::${tn}`; + for (const { selection } of meta.selections) { + const target = selection.targetName; + const variant = targetVariantForSelection(selection); + const key = buildEvalTestTargetKey(test, target, variant); if (resumeSkipKeys?.has(key)) { resumeSkippedCount++; } else { totalEvalCount++; + const bundleKey = resultBundleKey({ target, variant }); + const existing = plannedBundleCounts.get(bundleKey); + if (existing) { + existing.count += 1; + } else { + plannedBundleCounts.set(bundleKey, { target, variant, count: 1 }); + } } } } @@ -2039,21 +2140,23 @@ export async function runEvalCommand( ); } - // Write a stub summary.json before dispatching tests, carrying the planned - // execution count so an interrupted run can still surface as resumable in - // Dashboard (results.length < planned_test_count) even when every recorded row - // has execution_status: ok. The end-of-run write preserves this value via - // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults. + // Write a stub summary.json in each planned bundle before dispatching tests, + // carrying the planned execution count so an interrupted run can still + // surface as resumable in Dashboard. The end-of-run write preserves this + // value via readPlannedTestCount inside aggregateRunDir / + // writeArtifactsFromResults. // Skip on resume — we want to preserve the *original* planned count. if (!isResumeAppend && totalEvalCount > 0) { const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : ''; - await writeInitialRunSummaryArtifact(runDir, { - evalFile, - plannedTestCount: totalEvalCount, - experiment: normalizeExperimentName(options.experiment), - experimentMetadata: runExperimentMetadata, - runtimeSource: runtimeSourceMetadata, - }); + for (const bundle of plannedBundleCounts.values()) { + await writeInitialRunSummaryArtifact(resultBundleDir(runDir, bundle), { + evalFile, + plannedTestCount: bundle.count, + experiment: normalizeExperimentName(options.experiment), + experimentMetadata: runExperimentMetadata, + runtimeSource: runtimeSourceMetadata, + }); + } } // Periodic WIP checkpoint loop: push partial results to a unique non-default @@ -2099,6 +2202,7 @@ export async function runEvalCommand( const budgetMsg = `Run budget exceeded ($${fileBudgetTracker.currentCostUsd.toFixed(4)} / $${fileBudgetTracker.budgetCapUsd.toFixed(4)})`; console.log(`\n⚠ ${budgetMsg} — skipping ${path.basename(testFilePath)}`); for (const { selection } of targetPrep.selections) { + const explicitVariant = targetVariantForSelection(selection); const skippedResults: EvaluationResult[] = targetPrep.testCases.map((testCase) => ({ timestamp: new Date().toISOString(), testId: testCase.id, @@ -2121,6 +2225,7 @@ export async function runEvalCommand( failureReasonCode: 'budget_exceeded' as const, executionError: { message: budgetMsg, stage: 'setup' as const }, target: selection.targetName, + variant: explicitVariant, })); for (const r of skippedResults) { await outputWriter.append(withSourceMetadata(r, testFilePath, fileOptions)); @@ -2143,7 +2248,10 @@ export async function runEvalCommand( // --resume / --rerun-failed: skip tests that are already completed const filteredTestCases = resumeSkipKeys ? applicableTestCases.filter( - (test) => !resumeSkipKeys.has(buildTestTargetKey(test.id, targetName)), + (test) => + !resumeSkipKeys.has( + buildEvalTestTargetKey(test, targetName, targetVariantForSelection(selection)), + ), ) : applicableTestCases; @@ -2212,6 +2320,7 @@ export async function runEvalCommand( console.error( `\n[ERROR] ⚠ Eval file failed: ${path.basename(testFilePath)} — ${message}\n`, ); + const explicitVariant = targetVariantForSelection(selection); const errorResults: EvaluationResult[] = filteredTestCases.map((testCase) => withSourceMetadata( { @@ -2237,6 +2346,7 @@ export async function runEvalCommand( durationMs: 0, tokenUsage: { input: 0, output: 0 }, target: selection.targetName, + variant: explicitVariant, }, testFilePath, fileOptions, @@ -2270,12 +2380,9 @@ export async function runEvalCommand( // Flush the output writer so all results are on disk before we read back. await outputWriter.close().catch(() => undefined); - // When resuming, compute summary from ALL results (old + new, deduplicated) - let summaryResults = allResults; - if (isResumeAppend) { - const content = await readFile(outputPath, 'utf8'); - summaryResults = deduplicateByTestIdTarget(parseJsonlResults(content)); - } + // Compute summary from the persisted bundle indexes so resume includes old + // rows and normal runs reflect the same manifests Dashboard will read. + const summaryResults = deduplicateByTestIdTarget(await readExistingResultsFromRunDir(runDir)); const thresholdOpts = hasScopedRunPolicies || hasPerFileRuntimeThresholds @@ -2305,57 +2412,77 @@ export async function runEvalCommand( console.log(formatMatrixSummary(summaryResults)); } - // Write artifacts to the run directory (always, not conditional on flags) + // Write artifacts to target/variant bundle directories (always, not + // conditional on flags). The invocation root is only a container. if (allResults.length > 0) { const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : ''; const sourceTests = activeSourceTests; const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata); + const resultsByBundle = new Map(); + for (const result of allResults) { + const key = resultBundleKey(result); + const existing = resultsByBundle.get(key); + if (existing) { + existing.push(result); + } else { + resultsByBundle.set(key, [result]); + } + } if (isResumeAppend) { - // Resume mode: write per-test artifacts for newly-run tests, then aggregate - // from the full index.jsonl (old + new results with deduplication) + // Resume mode: write per-test artifacts for newly-run tests, then + // aggregate each bundle from its full index.jsonl (old + new results + // with deduplication). const { writePerTestArtifacts } = await import('./artifact-writer.js'); - await writePerTestArtifacts(allResults, runDir, { - experiment: normalizeExperimentName(options.experiment), - resultGroup: resultGroupName, - cwd, - repoRoot, - sourceTests, - taskBundleTargets, - runtimeSource: runtimeSourceMetadata, - }); - const { summaryPath } = await aggregateRunDir(runDir, { - evalFile, - experiment: normalizeExperimentName(options.experiment), - experimentMetadata: runExperimentMetadata, - runtimeSource: runtimeSourceMetadata, - }); - const indexPath = path.join(runDir, 'index.jsonl'); - console.log(`Artifact workspace updated: ${runDir}`); - console.log(` Index: ${indexPath}`); - console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`); - console.log(` Summary: ${summaryPath}`); - } else { - const { testArtifactDir, summaryPath, indexPath } = await writeArtifactsFromResults( - allResults, - runDir, - { - evalFile, + for (const bundleResults of resultsByBundle.values()) { + const bundleDir = resultBundleDir(runDir, bundleResults[0]); + await writePerTestArtifacts(bundleResults, bundleDir, { experiment: normalizeExperimentName(options.experiment), - experimentMetadata: runExperimentMetadata, resultGroup: resultGroupName, cwd, repoRoot, sourceTests, taskBundleTargets, runtimeSource: runtimeSourceMetadata, - }, - ); - console.log(`Artifact workspace written to: ${runDir}`); - console.log(` Index: ${indexPath}`); - console.log( - ` Per-test artifacts: ${testArtifactDir} (${allResults.length} test directories)`, - ); - console.log(` Summary: ${summaryPath}`); + }); + const { summaryPath } = await aggregateRunDir(bundleDir, { + evalFile, + experiment: normalizeExperimentName(options.experiment), + experimentMetadata: runExperimentMetadata, + runtimeSource: runtimeSourceMetadata, + }); + const indexPath = path.join(bundleDir, 'index.jsonl'); + console.log(`Artifact bundle updated: ${bundleDir}`); + console.log(` Index: ${indexPath}`); + console.log( + ` Per-test artifacts: ${bundleDir} (${bundleResults.length} new test directories)`, + ); + console.log(` Summary: ${summaryPath}`); + } + } else { + for (const bundleResults of resultsByBundle.values()) { + const bundleDir = resultBundleDir(runDir, bundleResults[0]); + const { testArtifactDir, summaryPath, indexPath } = await writeArtifactsFromResults( + bundleResults, + bundleDir, + { + evalFile, + experiment: normalizeExperimentName(options.experiment), + experimentMetadata: runExperimentMetadata, + resultGroup: resultGroupName, + cwd, + repoRoot, + sourceTests, + taskBundleTargets, + runtimeSource: runtimeSourceMetadata, + }, + ); + console.log(`Artifact bundle written to: ${bundleDir}`); + console.log(` Index: ${indexPath}`); + console.log( + ` Per-test artifacts: ${testArtifactDir} (${bundleResults.length} test directories)`, + ); + console.log(` Summary: ${summaryPath}`); + } } } @@ -2381,10 +2508,16 @@ export async function runEvalCommand( } if (allResults.length > 0) { + const writtenIndexes = outputWriter.bundleIndexPaths(); + outputPath = writtenIndexes[0] ?? outputPath; console.log(`\nResults written to: ${outputPath}`); + console.log(`\nResults written under: ${runDir}`); + for (const indexPath of writtenIndexes) { + console.log(` ${indexPath}`); + } // Persist last run path for `agentv results` commands - await saveRunCache(cwd, outputPath).catch(() => undefined); + await saveRunCache(cwd, runDir).catch(() => undefined); finalExportStatus = await maybeAutoExportRunArtifacts({ cwd, diff --git a/apps/cli/src/commands/pipeline/input.ts b/apps/cli/src/commands/pipeline/input.ts index c71f1fb82..616b09343 100644 --- a/apps/cli/src/commands/pipeline/input.ts +++ b/apps/cli/src/commands/pipeline/input.ts @@ -131,8 +131,7 @@ export const evalInputCommand = command({ } // Use tests[0].suite — loaders (yaml-parser, jsonl-parser) already apply the - // metadata.name → filename-basename → 'eval' fallback. This keeps subagent-mode - // artifact layout aligned with CLI mode (artifact-writer.ts:buildArtifactSubdir). + // metadata.name → filename-basename → 'eval' fallback for subagent-mode labels. const suiteName = tests[0]?.suite?.trim() ?? ''; const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index ff0992cb6..1f8c7fc0f 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -156,8 +156,7 @@ export const evalRunCommand = command({ } // Use tests[0].suite — loaders (yaml-parser, jsonl-parser) already apply the - // metadata.name → filename-basename → 'eval' fallback. This keeps subagent-mode - // artifact layout aligned with CLI mode (artifact-writer.ts:buildArtifactSubdir). + // metadata.name → filename-basename → 'eval' fallback for subagent-mode labels. const suiteName = tests[0]?.suite?.trim() ?? ''; const safeSuiteName = suiteName ? suiteName.replace(/[\/\\:*?"<>|]/g, '_') : ''; diff --git a/apps/cli/src/commands/results/combine-run.ts b/apps/cli/src/commands/results/combine-run.ts index ccced82b1..f8af0f8b4 100644 --- a/apps/cli/src/commands/results/combine-run.ts +++ b/apps/cli/src/commands/results/combine-run.ts @@ -156,7 +156,11 @@ function latestTimestamp(values: readonly (string | undefined)[]): string | unde } function resultKey(record: ResultManifestRecord, result: EvaluationResult): string { - return buildTestTargetKey(record.test_id ?? result.testId, record.target ?? result.target); + return buildTestTargetKey( + record.test_id ?? result.testId, + record.target ?? result.target, + record.variant ?? result.variant, + ); } function loadSources(sources: readonly CombineRunSource[]): LoadedSource[] { diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 0925a671c..e2b017eb8 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -29,6 +29,7 @@ export interface ResultManifestRecord { readonly category?: string; readonly experiment?: string; readonly target?: string; + readonly variant?: string; readonly score: number; readonly scores?: readonly Record[]; readonly trials?: readonly { @@ -232,6 +233,7 @@ function hydrateManifestRecord( suite: record.suite, category: record.category, target: record.target, + variant: record.variant, score: record.score, executionStatus: record.execution_status, error: record.error, @@ -310,6 +312,7 @@ export interface LightweightResultRecord { readonly suite?: string; readonly category?: string; readonly target?: string; + readonly variant?: string; readonly experiment?: string; readonly score: number; readonly scores?: readonly Record[]; @@ -329,6 +332,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec suite: record.suite, category: record.category, target: record.target, + variant: record.variant, experiment: record.experiment, score: record.score, scores: record.scores, diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 32c64335b..9f609e831 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -63,6 +63,7 @@ import { getProject, loadConfig, loadProjectRegistry, + normalizeCategoryPath, normalizeTraceArtifactToTraceSessionResponse, omitExternalTraceMetadataKeys, readGitResultArtifact, @@ -1883,30 +1884,7 @@ async function handleRunCategories(c: C, { searchDir, agentvDir, projectId }: Da try { const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); - const categoryMap = new Map }>(); - for (const r of loaded) { - const cat = r.category ?? DEFAULT_CATEGORY; - const entry = categoryMap.get(cat) ?? { - results: [], - suites: new Set(), - }; - entry.results.push(r); - entry.suites.add(r.suite ?? r.target ?? 'default'); - categoryMap.set(cat, entry); - } - const categories = [...categoryMap.entries()].map(([name, entry]) => { - const qualitySummary = summarizeQualityResults(entry.results, pass_threshold); - return { - name, - total: qualitySummary.totalCount, - passed: qualitySummary.passedCount, - failed: qualitySummary.qualityFailureCount, - avg_score: qualitySummary.avgScore, - execution_error_count: qualitySummary.executionErrorCount, - suite_count: entry.suites.size, - }; - }); - return c.json({ categories }); + return c.json(buildCategoryRollups(loaded, pass_threshold)); } catch { return c.json({ error: 'Failed to load categories' }, 500); } @@ -1920,7 +1898,10 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D try { const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId); const { threshold: pass_threshold } = loadStudioConfig(agentvDir); - const filtered = loaded.filter((r) => (r.category ?? DEFAULT_CATEGORY) === category); + const selectedCategory = normalizeCategoryPath(category); + const filtered = loaded.filter((r) => + isCategoryDescendant(categoryPathFromResult(r), selectedCategory), + ); const suiteMap = new Map(); for (const r of filtered) { const ds = r.suite ?? r.target ?? 'default'; @@ -1945,6 +1926,120 @@ async function handleCategorySuites(c: C, { searchDir, agentvDir, projectId }: D } } +interface CategoryRollupBucket { + readonly results: EvaluationResult[]; + readonly suites: Set; + readonly children: Set; +} + +interface CategoryRollupSummary { + readonly name: string; + readonly label: string; + readonly parent?: string; + readonly depth: number; + readonly total: number; + readonly passed: number; + readonly failed: number; + readonly avg_score: number; + readonly execution_error_count: number; + readonly suite_count: number; + readonly child_count: number; + readonly children?: CategoryRollupSummary[]; +} + +function categoryPathFromResult(result: EvaluationResult): string { + return normalizeCategoryPath(result.category ?? DEFAULT_CATEGORY); +} + +function categoryPrefixes(category: string): string[] { + const parts = category.split('/').filter((part) => part.length > 0); + if (parts.length === 0) return [DEFAULT_CATEGORY]; + return parts.map((_, index) => parts.slice(0, index + 1).join('/')); +} + +function categoryParent(category: string): string | undefined { + const parts = category.split('/'); + return parts.length > 1 ? parts.slice(0, -1).join('/') : undefined; +} + +function categoryLabel(category: string): string { + return category.split('/').at(-1) ?? category; +} + +function isCategoryDescendant(category: string, selectedCategory: string): boolean { + return category === selectedCategory || category.startsWith(`${selectedCategory}/`); +} + +function summarizeCategoryBucket( + name: string, + entry: CategoryRollupBucket, + passThreshold: number, +): CategoryRollupSummary { + const qualitySummary = summarizeQualityResults(entry.results, passThreshold); + const parent = categoryParent(name); + return { + name, + label: categoryLabel(name), + ...(parent && { parent }), + depth: name.split('/').filter(Boolean).length - 1, + total: qualitySummary.totalCount, + passed: qualitySummary.passedCount, + failed: qualitySummary.qualityFailureCount, + avg_score: qualitySummary.avgScore, + execution_error_count: qualitySummary.executionErrorCount, + suite_count: entry.suites.size, + child_count: entry.children.size, + }; +} + +function buildCategoryRollups( + results: readonly EvaluationResult[], + passThreshold: number, +): { categories: CategoryRollupSummary[]; category_tree: CategoryRollupSummary[] } { + const categoryMap = new Map(); + const ensureEntry = (name: string): CategoryRollupBucket => { + const existing = categoryMap.get(name); + if (existing) return existing; + const created = { results: [], suites: new Set(), children: new Set() }; + categoryMap.set(name, created); + return created; + }; + + for (const result of results) { + const category = categoryPathFromResult(result); + const suite = result.suite ?? result.target ?? 'default'; + const prefixes = categoryPrefixes(category); + for (const prefix of prefixes) { + const entry = ensureEntry(prefix); + entry.results.push(result); + entry.suites.add(suite); + } + for (let index = 1; index < prefixes.length; index++) { + ensureEntry(prefixes[index - 1]).children.add(prefixes[index]); + } + } + + const categories = [...categoryMap.entries()] + .map(([name, entry]) => summarizeCategoryBucket(name, entry, passThreshold)) + .sort((a, b) => a.name.localeCompare(b.name)); + + const summariesByName = new Map(categories.map((summary) => [summary.name, summary])); + const buildTreeNode = (summary: CategoryRollupSummary): CategoryRollupSummary => { + const children = [...(categoryMap.get(summary.name)?.children ?? [])] + .map((childName) => summariesByName.get(childName)) + .filter((child): child is CategoryRollupSummary => Boolean(child)) + .sort((a, b) => a.name.localeCompare(b.name)) + .map(buildTreeNode); + return children.length > 0 ? { ...summary, children } : summary; + }; + const categoryTree = categories + .filter((summary) => !summary.parent) + .sort((a, b) => a.name.localeCompare(b.name)) + .map(buildTreeNode); + + return { categories, category_tree: categoryTree }; +} + async function handleEvalDetail(c: C, { searchDir, projectId }: DataContext) { const filename = c.req.param('filename') ?? ''; const evalId = c.req.param('evalId') ?? ''; @@ -2449,7 +2544,7 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont } entry.tests.push({ test_id: r.testId, - ...(r.category && { category: r.category }), + ...(r.category && { category: normalizeCategoryPath(r.category) }), score: r.score, passed, execution_status: r.executionStatus, @@ -2459,7 +2554,7 @@ async function handleCompare(c: C, { searchDir, agentvDir, projectId }: DataCont // Per-run accumulation. Dedupe tests within the run by last-wins. runTestMap.set(r.testId, { test_id: r.testId, - ...(r.category && { category: r.category }), + ...(r.category && { category: normalizeCategoryPath(r.category) }), score: r.score, passed, execution_status: r.executionStatus, diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts index 9ef6034ca..734b400e5 100644 --- a/apps/cli/test/commands/eval/aggregate.test.ts +++ b/apps/cli/test/commands/eval/aggregate.test.ts @@ -1,5 +1,13 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; -import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + readdirSync, + rmSync, + writeFileSync, +} from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -45,12 +53,32 @@ function writeJsonlIndex(dir: string, results: Partial[]): str return indexPath; } +function readIndexRows(dir: string): Array<{ test_id: string; result_dir: string }> { + const indexPath = path.join(dir, 'index.jsonl'); + if (!existsSync(indexPath)) { + return readdirSync(dir) + .filter((entry) => /--[a-f0-9]{12}$/.test(entry)) + .map((entry) => ({ test_id: entry.replace(/--[a-f0-9]{12}$/, ''), result_dir: entry })); + } + return readFileSync(path.join(dir, 'index.jsonl'), 'utf8') + .trim() + .split('\n') + .filter(Boolean) + .map((line) => JSON.parse(line) as { test_id: string; result_dir: string }); +} + +function rowRunPath(dir: string, testId: string, ...segments: string[]): string { + const row = readIndexRows(dir).find((entry) => entry.test_id === testId); + expect(row?.result_dir).toMatch(new RegExp(`^${testId}--[a-f0-9]{12}$`)); + return path.join(dir, row?.result_dir ?? '', ...segments); +} + // --------------------------------------------------------------------------- // deduplicateByTestIdTarget // --------------------------------------------------------------------------- describe('deduplicateByTestIdTarget', () => { - it('keeps last entry per (testId, target) pair', () => { + it('keeps last entry per (testId, target, variant) tuple', () => { const results = [ makeResult({ testId: 'a', target: 'x', score: 0.1 }), makeResult({ testId: 'a', target: 'x', score: 0.9 }), @@ -72,6 +100,63 @@ describe('deduplicateByTestIdTarget', () => { expect(deduped).toHaveLength(2); }); + it('keeps entries with different variants for the same test and target', () => { + const results = [ + makeResult({ testId: 'a', target: 'x', variant: 'baseline', score: 0.3 }), + makeResult({ testId: 'a', target: 'x', variant: 'candidate', score: 0.7 }), + makeResult({ testId: 'a', target: 'x', variant: 'candidate', score: 0.9 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + expect(deduped.map((r) => [r.variant, r.score])).toEqual([ + ['baseline', 0.3], + ['candidate', 0.9], + ]); + }); + + it('keeps entries with different suites for the same test and target', () => { + const results = [ + makeResult({ suite: 'suite-a', testId: 'a', target: 'x', score: 0.3 }), + makeResult({ suite: 'suite-b', testId: 'a', target: 'x', score: 0.7 }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + expect(deduped.map((r) => r.suite)).toEqual(['suite-a', 'suite-b']); + }); + + it('keeps duplicate suite labels from different eval paths', () => { + const results = [ + makeResult({ + suite: 'duplicate-suite', + testId: 'a', + target: 'x', + source: { + evalFilePath: 'evals/a/cases.eval.yaml', + evalFileAbsolutePath: '/repo/evals/a/cases.eval.yaml', + testId: 'a', + testSnapshotYaml: 'id: a\n', + graderDefinitions: [], + references: [], + }, + }), + makeResult({ + suite: 'duplicate-suite', + testId: 'a', + target: 'x', + source: { + evalFilePath: 'evals/b/cases.eval.yaml', + evalFileAbsolutePath: '/repo/evals/b/cases.eval.yaml', + testId: 'a', + testSnapshotYaml: 'id: a\n', + graderDefinitions: [], + references: [], + }, + }), + ]; + const deduped = deduplicateByTestIdTarget(results); + expect(deduped).toHaveLength(2); + }); + it('handles empty input', () => { expect(deduplicateByTestIdTarget([])).toHaveLength(0); }); @@ -180,17 +265,17 @@ describe('writePerTestArtifacts', () => { await writePerTestArtifacts(results, tmpDir); const grading1 = JSON.parse( - readFileSync(path.join(tmpDir, 'test-1', 'run-1', 'grading.json'), 'utf8'), + readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'grading.json'), 'utf8'), ); expect(grading1.assertions).toHaveLength(1); const timing1 = JSON.parse( - readFileSync(path.join(tmpDir, 'test-1', 'run-1', 'timing.json'), 'utf8'), + readFileSync(rowRunPath(tmpDir, 'test-1', 'run-1', 'timing.json'), 'utf8'), ); expect(timing1.total_tokens).toBeGreaterThanOrEqual(0); const grading2 = JSON.parse( - readFileSync(path.join(tmpDir, 'test-2', 'run-1', 'grading.json'), 'utf8'), + readFileSync(rowRunPath(tmpDir, 'test-2', 'run-1', 'grading.json'), 'utf8'), ); expect(grading2.assertions).toHaveLength(1); }); @@ -201,7 +286,7 @@ describe('writePerTestArtifacts', () => { await writePerTestArtifacts(results, tmpDir); const answer = readFileSync( - path.join(tmpDir, 'test-1', 'run-1', 'outputs', 'answer.md'), + rowRunPath(tmpDir, 'test-1', 'run-1', 'outputs', 'answer.md'), 'utf8', ); expect(answer).toContain('hello'); diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index d248f474f..2ba014d14 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -10,6 +10,7 @@ import { type GraderResult, METRICS_SCHEMA_VERSION, MetricsArtifactWireSchema, + buildResultIndexArtifact, buildTraceFromMessages, parseYamlValue, } from '@agentv/core'; @@ -75,6 +76,33 @@ function makeEvaluatorResult(overrides: Partial = {}): GraderResul } as GraderResult; } +async function readIndexLines(indexPath: string): Promise { + const content = (await readFile(indexPath, 'utf8')).trim(); + if (!content) return []; + return content.split('\n').map((line) => JSON.parse(line) as IndexArtifactEntry); +} + +function escapeRegex(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function expectRowDir( + entry: Pick | undefined, + expectedPrefix = entry?.test_id ?? 'unknown', +): string { + expect(entry?.result_dir).toMatch(new RegExp(`^${escapeRegex(expectedPrefix)}--[a-f0-9]{12}$`)); + return entry?.result_dir ?? ''; +} + +function runArtifactPath( + rootDir: string, + entry: Pick | undefined, + ...segments: string[] +): string { + expect(entry?.result_dir).toBeTruthy(); + return path.join(rootDir, entry?.result_dir ?? '', ...segments); +} + // --------------------------------------------------------------------------- // Grading artifact // --------------------------------------------------------------------------- @@ -876,14 +904,25 @@ describe('writeArtifactsFromResults', () => { evalFile: 'my-eval.yaml', }); + const indexLines = await readIndexLines(paths.indexPath); + expect(indexLines).toHaveLength(2); + const alphaRowDir = expectRowDir(indexLines[0], 'alpha'); + const betaRowDir = expectRowDir(indexLines[1], 'beta'); + expect(alphaRowDir).not.toBe(betaRowDir); + // Check per-test artifact directories const artifactEntries = await readdir(paths.testArtifactDir); - expect(artifactEntries.sort()).toEqual(['alpha', 'beta', 'index.jsonl', 'summary.json']); + expect(artifactEntries.sort()).toEqual([ + alphaRowDir, + betaRowDir, + 'index.jsonl', + 'summary.json', + ]); - const alphaEntries = await readdir(path.join(paths.testArtifactDir, 'alpha')); + const alphaEntries = await readdir(path.join(paths.testArtifactDir, alphaRowDir)); expect(alphaEntries.sort()).toEqual(['run-1', 'summary.json']); - const alphaRunEntries = await readdir(path.join(paths.testArtifactDir, 'alpha', 'run-1')); + const alphaRunEntries = await readdir(path.join(paths.testArtifactDir, alphaRowDir, 'run-1')); expect(alphaRunEntries.sort()).toEqual([ 'grading.json', 'metrics.json', @@ -895,13 +934,16 @@ describe('writeArtifactsFromResults', () => { ]); const alphaGrading: GradingArtifact = JSON.parse( - await readFile(path.join(paths.testArtifactDir, 'alpha', 'run-1', 'grading.json'), 'utf8'), + await readFile( + path.join(paths.testArtifactDir, alphaRowDir, 'run-1', 'grading.json'), + 'utf8', + ), ); expect(alphaGrading.summary).toBeDefined(); expect(alphaGrading).not.toHaveProperty('execution_metrics'); const alphaTiming: TimingArtifact = JSON.parse( - await readFile(path.join(paths.testArtifactDir, 'alpha', 'run-1', 'timing.json'), 'utf8'), + await readFile(path.join(paths.testArtifactDir, alphaRowDir, 'run-1', 'timing.json'), 'utf8'), ); expect(alphaTiming.duration_ms).toBe(5000); @@ -910,15 +952,10 @@ describe('writeArtifactsFromResults', () => { expect(summary.metadata.tests_run.sort()).toEqual(['alpha', 'beta']); expect(summary.timing.duration_ms).toBe(13000); - const indexLines = (await readFile(paths.indexPath, 'utf8')) - .trim() - .split('\n') - .map((line) => JSON.parse(line) as IndexArtifactEntry); - expect(indexLines).toHaveLength(2); - expect(indexLines[0]?.summary_path).toBe('alpha/summary.json'); - expect(indexLines[0]?.grading_path).toBe('alpha/run-1/grading.json'); - expect(indexLines[0]?.timing_path).toBe('alpha/run-1/timing.json'); - expect(indexLines[0]?.metrics_path).toBe('alpha/run-1/metrics.json'); + expect(indexLines[0]?.summary_path).toBe(`${alphaRowDir}/summary.json`); + expect(indexLines[0]?.grading_path).toBe(`${alphaRowDir}/run-1/grading.json`); + expect(indexLines[0]?.timing_path).toBe(`${alphaRowDir}/run-1/timing.json`); + expect(indexLines[0]?.metrics_path).toBe(`${alphaRowDir}/run-1/metrics.json`); }); it('writes optional runtime source metadata to summary and index rows', async () => { @@ -1001,10 +1038,8 @@ describe('writeArtifactsFromResults', () => { const paths = await writeArtifactsFromResults(results, testDir, { sourceTests }); - const [indexEntry] = (await readFile(paths.indexPath, 'utf8')) - .trim() - .split('\n') - .map((line) => JSON.parse(line) as IndexArtifactEntry); + const [indexEntry] = await readIndexLines(paths.indexPath); + const repeatRowDir = expectRowDir(indexEntry, 'repeat-case'); expect(indexEntry?.trials).toEqual([ { attempt: 0, run_path: 'run-1', score: 0.25, verdict: 'fail' }, { attempt: 1, run_path: 'run-2', score: 1, verdict: 'pass' }, @@ -1016,19 +1051,19 @@ describe('writeArtifactsFromResults', () => { ci95_upper: 1, stddev: 0.53, }); - expect(indexEntry?.result_dir).toBe('repeat-case'); - expect(indexEntry?.summary_path).toBe('repeat-case/summary.json'); + expect(indexEntry?.result_dir).toBe(repeatRowDir); + expect(indexEntry?.summary_path).toBe(`${repeatRowDir}/summary.json`); expect(indexEntry?.task_dir).toBeUndefined(); expect(indexEntry?.input_path).toBeUndefined(); expect(indexEntry?.grading_path).toBeUndefined(); expect(indexEntry?.timing_path).toBeUndefined(); expect(indexEntry?.metrics_path).toBeUndefined(); - const repeatEntries = await readdir(path.join(paths.testArtifactDir, 'repeat-case')); + const repeatEntries = await readdir(path.join(paths.testArtifactDir, repeatRowDir)); expect(repeatEntries.sort()).toEqual(['run-1', 'run-2', 'summary.json']); const caseSummary = JSON.parse( - await readFile(path.join(paths.testArtifactDir, 'repeat-case', 'summary.json'), 'utf8'), + await readFile(path.join(paths.testArtifactDir, repeatRowDir, 'summary.json'), 'utf8'), ) as Record; expect(caseSummary).toMatchObject({ total_runs: 2, @@ -1060,11 +1095,11 @@ describe('writeArtifactsFromResults', () => { expect(typeof caseSummary.fingerprint).toBe('string'); await expect( - readFile(path.join(paths.testArtifactDir, 'repeat-case', 'grading.json'), 'utf8'), + readFile(path.join(paths.testArtifactDir, repeatRowDir, 'grading.json'), 'utf8'), ).rejects.toThrow(); for (const runDir of ['run-1', 'run-2']) { - const runEntries = await readdir(path.join(paths.testArtifactDir, 'repeat-case', runDir)); + const runEntries = await readdir(path.join(paths.testArtifactDir, repeatRowDir, runDir)); expect(runEntries.sort()).toEqual([ 'grading.json', 'metrics.json', @@ -1078,7 +1113,7 @@ describe('writeArtifactsFromResults', () => { const runOneResult = JSON.parse( await readFile( - path.join(paths.testArtifactDir, 'repeat-case', 'run-1', 'result.json'), + path.join(paths.testArtifactDir, repeatRowDir, 'run-1', 'result.json'), 'utf8', ), ) as Record; @@ -1098,14 +1133,14 @@ describe('writeArtifactsFromResults', () => { }); const runTwoAnswer = await readFile( - path.join(paths.testArtifactDir, 'repeat-case', 'run-2', 'outputs', 'answer.md'), + path.join(paths.testArtifactDir, repeatRowDir, 'run-2', 'outputs', 'answer.md'), 'utf8', ); expect(runTwoAnswer).toBe('second attempt'); const runTwoResult = JSON.parse( await readFile( - path.join(paths.testArtifactDir, 'repeat-case', 'run-2', 'result.json'), + path.join(paths.testArtifactDir, repeatRowDir, 'run-2', 'result.json'), 'utf8', ), ) as Record; @@ -1147,16 +1182,19 @@ describe('writeArtifactsFromResults', () => { }), ]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); + const indexLines = await readIndexLines(paths.indexPath); + const testOne = indexLines.find((line) => line.test_id === 'test-1'); + const testTwo = indexLines.find((line) => line.test_id === 'test-2'); const gradingOne: GradingArtifact = JSON.parse( - await readFile(path.join(testDir, 'test-1', 'run-1', 'grading.json'), 'utf8'), + await readFile(runArtifactPath(testDir, testOne, 'run-1', 'grading.json'), 'utf8'), ); const gradingTwo: GradingArtifact = JSON.parse( - await readFile(path.join(testDir, 'test-2', 'run-1', 'grading.json'), 'utf8'), + await readFile(runArtifactPath(testDir, testTwo, 'run-1', 'grading.json'), 'utf8'), ); const timingOne: TimingArtifact = JSON.parse( - await readFile(path.join(testDir, 'test-1', 'run-1', 'timing.json'), 'utf8'), + await readFile(runArtifactPath(testDir, testOne, 'run-1', 'timing.json'), 'utf8'), ); expect(gradingOne.summary.total).toBe(1); @@ -1215,16 +1253,18 @@ describe('writeArtifactsFromResults', () => { }), ]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); + const [indexLine] = await readIndexLines(paths.indexPath); + const rowDir = expectRowDir(indexLine, 'transcript-case'); - const transcriptPath = path.join(testDir, 'transcript-case', 'run-1', 'transcript.jsonl'); + const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'); const transcriptLines = (await readFile(transcriptPath, 'utf8')) .trim() .split('\n') .map((line) => JSON.parse(line)); const rawTranscriptLines = ( - await readFile(path.join(testDir, 'transcript-case', 'run-1', 'transcript-raw.jsonl'), 'utf8') + await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl'), 'utf8') ) .trim() .split('\n') @@ -1275,20 +1315,15 @@ describe('writeArtifactsFromResults', () => { message_index: 0, role: 'user', }); + await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow(); await expect( - readFile(path.join(testDir, 'transcript-case', 'transcript.json'), 'utf8'), - ).rejects.toThrow(); - await expect( - readFile(path.join(testDir, 'transcript-case', 'run-1', 'trace.json'), 'utf8'), + readFile(runArtifactPath(testDir, indexLine, 'run-1', 'trace.json'), 'utf8'), ).rejects.toThrow(); - const indexLine = JSON.parse( - (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), - ); expect(indexLine).not.toHaveProperty('trace_path'); - expect(indexLine.transcript_path).toBe('transcript-case/run-1/transcript.jsonl'); - expect(indexLine.transcript_raw_path).toBe('transcript-case/run-1/transcript-raw.jsonl'); - expect(indexLine.metrics_path).toBe('transcript-case/run-1/metrics.json'); + expect(indexLine?.transcript_path).toBe(`${rowDir}/run-1/transcript.jsonl`); + expect(indexLine?.transcript_raw_path).toBe(`${rowDir}/run-1/transcript-raw.jsonl`); + expect(indexLine?.metrics_path).toBe(`${rowDir}/run-1/metrics.json`); expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true); expect(indexLine.artifact_pointers).toBeUndefined(); @@ -1383,16 +1418,15 @@ describe('writeArtifactsFromResults', () => { }), ]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); + const [indexLine] = await readIndexLines(paths.indexPath); + const rowDir = expectRowDir(indexLine, 'summary-case'); - const indexLine = JSON.parse( - (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), - ); - expect(indexLine.metrics_path).toBe('summary-case/run-1/metrics.json'); + expect(indexLine?.metrics_path).toBe(`${rowDir}/run-1/metrics.json`); const summary = MetricsArtifactWireSchema.parse( JSON.parse( - await readFile(path.join(testDir, 'summary-case', 'run-1', 'metrics.json'), 'utf8'), + await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'metrics.json'), 'utf8'), ), ); @@ -1410,7 +1444,7 @@ describe('writeArtifactsFromResults', () => { }); expect(summary.source_artifacts).not.toHaveProperty('trace_path'); await expect( - readFile(path.join(testDir, 'summary-case', 'run-1', 'trace.json'), 'utf8'), + readFile(runArtifactPath(testDir, indexLine, 'run-1', 'trace.json'), 'utf8'), ).rejects.toThrow(); expect(summary.metrics.total_turns).toBe(2); expect(summary.metrics.total_tool_calls).toBe(4); @@ -1483,7 +1517,7 @@ describe('writeArtifactsFromResults', () => { expect(summary).not.toHaveProperty('usage_summary'); const timing = JSON.parse( - await readFile(path.join(testDir, 'summary-case', 'run-1', 'timing.json'), 'utf8'), + await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'timing.json'), 'utf8'), ); expect(timing).toMatchObject({ total_tokens: 140, @@ -1535,24 +1569,27 @@ describe('writeArtifactsFromResults', () => { }), ]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); + const indexLines = await readIndexLines(paths.indexPath); + const aggregateRow = indexLines.find((line) => line.test_id === 'aggregate-usage'); + const estimatedRow = indexLines.find((line) => line.test_id === 'estimated-usage'); const aggregateTiming = JSON.parse( - await readFile(path.join(testDir, 'aggregate-usage', 'run-1', 'timing.json'), 'utf8'), + await readFile(runArtifactPath(testDir, aggregateRow, 'run-1', 'timing.json'), 'utf8'), ); const estimatedTiming = JSON.parse( - await readFile(path.join(testDir, 'estimated-usage', 'run-1', 'timing.json'), 'utf8'), + await readFile(runArtifactPath(testDir, estimatedRow, 'run-1', 'timing.json'), 'utf8'), ); const runSummary = JSON.parse(await readFile(path.join(testDir, 'summary.json'), 'utf8')); MetricsArtifactWireSchema.parse( JSON.parse( - await readFile(path.join(testDir, 'aggregate-usage', 'run-1', 'metrics.json'), 'utf8'), + await readFile(runArtifactPath(testDir, aggregateRow, 'run-1', 'metrics.json'), 'utf8'), ), ); MetricsArtifactWireSchema.parse( JSON.parse( - await readFile(path.join(testDir, 'estimated-usage', 'run-1', 'metrics.json'), 'utf8'), + await readFile(runArtifactPath(testDir, estimatedRow, 'run-1', 'metrics.json'), 'utf8'), ), ); @@ -1609,19 +1646,20 @@ describe('writeArtifactsFromResults', () => { }), ]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); + const [indexLine] = await readIndexLines(paths.indexPath); + const rowDir = expectRowDir(indexLine, 'raw-log-case'); - const copiedRawLogPath = path.join(testDir, 'raw-log-case', 'run-1', 'provider.log'); + const copiedRawLogPath = runArtifactPath(testDir, indexLine, 'run-1', 'provider.log'); await expect(readFile(copiedRawLogPath, 'utf8')).rejects.toThrow(); - const transcriptPath = path.join(testDir, 'raw-log-case', 'run-1', 'transcript-raw.jsonl'); + const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog); - await expect( - readFile(path.join(testDir, 'raw-log-case', 'transcript.json'), 'utf8'), - ).rejects.toThrow(); + await expect(readFile(rawLogPath, 'utf8')).resolves.toBe(rawLog); + await expect(readFile(path.join(testDir, rowDir, 'transcript.json'), 'utf8')).rejects.toThrow(); const transcriptLines = ( - await readFile(path.join(testDir, 'raw-log-case', 'run-1', 'transcript.jsonl'), 'utf8') + await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'), 'utf8') ) .trim() .split('\n') @@ -1633,12 +1671,9 @@ describe('writeArtifactsFromResults', () => { content: [{ type: 'text', text: 'Raw log copied' }], }); - const indexLine = JSON.parse( - (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), - ); expect(indexLine.raw_provider_log_path).toBeUndefined(); - expect(indexLine.transcript_path).toBe('raw-log-case/run-1/transcript.jsonl'); - expect(indexLine.transcript_raw_path).toBe('raw-log-case/run-1/transcript-raw.jsonl'); + expect(indexLine.transcript_path).toBe(`${rowDir}/run-1/transcript.jsonl`); + expect(indexLine.transcript_raw_path).toBe(`${rowDir}/run-1/transcript-raw.jsonl`); expect(indexLine).not.toHaveProperty('transcript_json_path'); }); @@ -1663,11 +1698,9 @@ describe('writeArtifactsFromResults', () => { }), ]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); - const indexLine = JSON.parse( - (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), - ); + const [indexLine] = await readIndexLines(paths.indexPath); expect(indexLine.external_trace).toEqual({ provider: 'phoenix', source: 'codex', @@ -1685,7 +1718,7 @@ describe('writeArtifactsFromResults', () => { expect(JSON.stringify(indexLine)).not.toContain('api_key'); const transcriptJson = await readFile( - path.join(testDir, 'external-trace-case', 'run-1', 'transcript.jsonl'), + runArtifactPath(testDir, indexLine, 'run-1', 'transcript.jsonl'), 'utf8', ); expect(transcriptJson).not.toContain('secret'); @@ -1701,33 +1734,29 @@ describe('writeArtifactsFromResults', () => { }), ]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); + const [indexLine] = await readIndexLines(paths.indexPath); - const transcriptPath = path.join( - testDir, - 'no-transcript-case', - 'run-1', - 'transcript-raw.jsonl', - ); + const transcriptPath = runArtifactPath(testDir, indexLine, 'run-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).rejects.toThrow(); - const indexLine = JSON.parse( - (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), - ); expect(indexLine).not.toHaveProperty('transcript_path'); - expect(indexLine.metrics_path).toBe('no-transcript-case/run-1/metrics.json'); + expect(indexLine.metrics_path).toBe( + `${expectRowDir(indexLine, 'no-transcript-case')}/run-1/metrics.json`, + ); expect(indexLine.artifact_pointers).toBeUndefined(); }); it('sanitizes test IDs for directory names', async () => { const results = [makeResult({ testId: 'path/to:test*1' })]; - await writeArtifactsFromResults(results, testDir); + const paths = await writeArtifactsFromResults(results, testDir); + const [indexLine] = await readIndexLines(paths.indexPath); const artifactEntries = await readdir(testDir); - expect(artifactEntries).toContain('path_to_test_1'); + expect(artifactEntries).toContain(expectRowDir(indexLine, 'path_to_test_1')); }); - it('writes artifacts without target subdirectory (one run = one target)', async () => { + it('writes artifacts in a deterministic row id directory without target hierarchy', async () => { const results = [ makeResult({ testId: 'shared-id', @@ -1739,83 +1768,82 @@ describe('writeArtifactsFromResults', () => { ]; const paths = await writeArtifactsFromResults(results, testDir); - const indexLines = (await readFile(paths.indexPath, 'utf8')).trim().split('\n').map(JSON.parse); + const [indexLine] = await readIndexLines(paths.indexPath); + const rowDir = expectRowDir(indexLine, 'shared-id'); - expect(indexLines[0].grading_path).toBe('shared-id/run-1/grading.json'); + expect(indexLine.grading_path).toBe(`${rowDir}/run-1/grading.json`); + expect(rowDir).not.toContain('/'); const grading: GradingArtifact = JSON.parse( - await readFile(path.join(testDir, 'shared-id', 'run-1', 'grading.json'), 'utf8'), + await readFile(runArtifactPath(testDir, indexLine, 'run-1', 'grading.json'), 'utf8'), ); expect(grading.assertions[0].text).toBe('baseline-check'); }); - it('prefixes artifact paths with suite when present', async () => { + it('uses distinct row ids for the same test id across targets', async () => { const paths = await writeArtifactsFromResults( - [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], + [ + makeResult({ testId: 'shared-id', target: 'mock-alpha', output: 'alpha answer' }), + makeResult({ testId: 'shared-id', target: 'mock-beta', output: 'beta answer' }), + ], testDir, ); - const [indexLine] = (await readFile(paths.indexPath, 'utf8')) - .trim() - .split('\n') - .map(JSON.parse); - expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json'); + const indexLines = await readIndexLines(paths.indexPath); + const rowDirs = indexLines.map((line) => expectRowDir(line, 'shared-id')); + expect(new Set(rowDirs).size).toBe(2); + expect(indexLines.map((line) => line.grading_path)).toEqual( + rowDirs.map((rowDir) => `${rowDir}/run-1/grading.json`), + ); + const answers = await Promise.all( + indexLines.map((line) => + readFile(runArtifactPath(testDir, line, 'run-1', 'outputs', 'answer.md'), 'utf8'), + ), + ); + expect(answers.sort()).toEqual(['alpha answer', 'beta answer']); }); - it('does not prefix artifact paths with suite when it matches the result group', async () => { + it('uses distinct row ids for the same test id across suites', async () => { const paths = await writeArtifactsFromResults( - [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], + [ + makeResult({ suite: 'suite-a', testId: 'shared-id', target: 'baseline' }), + makeResult({ suite: 'suite-b', testId: 'shared-id', target: 'baseline' }), + ], testDir, - { resultGroup: 'eval-top-months-chart' }, ); - const [indexLine] = (await readFile(paths.indexPath, 'utf8')) - .trim() - .split('\n') - .map(JSON.parse); - expect(indexLine.suite).toBe('eval-top-months-chart'); - expect(indexLine.grading_path).toBe('shared-id/run-1/grading.json'); + const indexLines = await readIndexLines(paths.indexPath); + const rowDirs = indexLines.map((line) => expectRowDir(line, 'shared-id')); + expect(indexLines.map((line) => line.suite).sort()).toEqual(['suite-a', 'suite-b']); + expect(new Set(rowDirs).size).toBe(2); + expect(rowDirs.every((rowDir) => !rowDir.includes('/'))).toBe(true); }); - it('prefixes imported suite artifacts even when the suite matches the result group', async () => { + it('uses distinct row ids for duplicate suite labels from different eval paths', async () => { const sourceTests = [ { id: 'shared-id', - suite: 'eval-top-months-chart', + suite: 'duplicate-suite', source: { - evalFilePath: 'evals/imported.eval.yaml', - evalFileAbsolutePath: path.join(testDir, 'evals/imported.eval.yaml'), - importedSuiteName: 'eval-top-months-chart', + evalFilePath: 'evals/one.eval.yaml', + evalFileAbsolutePath: path.join(testDir, 'evals/one.eval.yaml'), + evalFileRepoPath: 'evals/one.eval.yaml', + importedSuiteName: 'duplicate-suite', testId: 'shared-id', testSnapshotYaml: 'id: shared-id', graderDefinitions: [], references: [], }, } as EvalTest, - ]; - const paths = await writeArtifactsFromResults( - [makeResult({ suite: 'eval-top-months-chart', testId: 'shared-id', target: 'baseline' })], - testDir, - { resultGroup: 'eval-top-months-chart', sourceTests }, - ); - - const [indexLine] = (await readFile(paths.indexPath, 'utf8')) - .trim() - .split('\n') - .map(JSON.parse); - expect(indexLine.grading_path).toBe('eval-top-months-chart/shared-id/run-1/grading.json'); - }); - - it('uses the imported suite name for wrapper suite artifact paths', async () => { - const sourceTests = [ { id: 'shared-id', - suite: 'wrapper-suite', + suite: 'duplicate-suite', source: { - evalFilePath: 'evals/imported.eval.yaml', - evalFileAbsolutePath: path.join(testDir, 'evals/imported.eval.yaml'), - importedSuiteName: 'imported-suite', + evalFilePath: 'evals/two.eval.yaml', + evalFileAbsolutePath: path.join(testDir, 'evals/two.eval.yaml'), + evalFileRepoPath: 'evals/two.eval.yaml', + importedSuiteName: 'duplicate-suite', testId: 'shared-id', testSnapshotYaml: 'id: shared-id', graderDefinitions: [], @@ -1824,17 +1852,82 @@ describe('writeArtifactsFromResults', () => { } as EvalTest, ]; const paths = await writeArtifactsFromResults( - [makeResult({ suite: 'wrapper-suite', testId: 'shared-id', target: 'baseline' })], + [ + makeResult({ + suite: 'duplicate-suite', + testId: 'shared-id', + target: 'baseline', + source: sourceTests[0].source, + }), + makeResult({ + suite: 'duplicate-suite', + testId: 'shared-id', + target: 'baseline', + source: sourceTests[1].source, + }), + ], testDir, - { resultGroup: 'wrapper-suite', sourceTests }, + { sourceTests }, ); - const [indexLine] = (await readFile(paths.indexPath, 'utf8')) - .trim() - .split('\n') - .map(JSON.parse); - expect(indexLine.result_dir).toBe('imported-suite/shared-id'); - expect(indexLine.grading_path).toBe('imported-suite/shared-id/run-1/grading.json'); + const indexLines = await readIndexLines(paths.indexPath); + const rowDirs = indexLines.map((line) => expectRowDir(line, 'shared-id')); + expect(new Set(rowDirs).size).toBe(2); + expect(indexLines.map((line) => line.projection_identity?.dimensions.eval_path).sort()).toEqual( + ['evals/one.eval.yaml', 'evals/two.eval.yaml'], + ); + }); + + it('includes variant in deterministic row id hashing when projection identity exposes it', () => { + const base = makeResult({ suite: 'variant-suite', testId: 'shared-id', target: 'replay' }); + const alpha = buildResultIndexArtifact(base, undefined, { + projectionIdentity: { + schemaVersion: 'agentv.projection_identity.v1', + id: 'alpha', + key: 'alpha', + dimensions: { + runId: 'run-1', + suite: 'variant-suite', + evalPath: 'evals/variant.eval.yaml', + testId: 'shared-id', + target: 'replay', + sourceTarget: 'codex', + attempt: 0, + variant: 'alpha', + envelopeId: 'envelope-alpha', + traceId: 'trace-alpha', + rootSpanId: 'root-alpha', + projectionFormat: 'execution_trace', + projectionVersion: 'agentv.execution_trace.v1', + }, + }, + }); + const beta = buildResultIndexArtifact(base, undefined, { + projectionIdentity: { + schemaVersion: 'agentv.projection_identity.v1', + id: 'beta', + key: 'beta', + dimensions: { + runId: 'run-1', + suite: 'variant-suite', + evalPath: 'evals/variant.eval.yaml', + testId: 'shared-id', + target: 'replay', + sourceTarget: 'codex', + attempt: 0, + variant: 'beta', + envelopeId: 'envelope-beta', + traceId: 'trace-beta', + rootSpanId: 'root-beta', + projectionFormat: 'execution_trace', + projectionVersion: 'agentv.execution_trace.v1', + }, + }, + }); + + expectRowDir(alpha, 'shared-id'); + expectRowDir(beta, 'shared-id'); + expect(alpha.result_dir).not.toBe(beta.result_dir); }); it('writes task bundle artifacts with local source paths when source metadata is provided', async () => { @@ -1954,20 +2047,21 @@ describe('writeArtifactsFromResults', () => { }, ); - const taskDir = path.join(outputDir, 'trace-case', 'task'); + const [indexLine] = await readIndexLines(paths.indexPath); + const rowDir = expectRowDir(indexLine, 'trace-case'); + const taskDir = path.join(outputDir, rowDir, 'task'); const evalPath = path.join(taskDir, 'EVAL.yaml'); const targetsPath = path.join(taskDir, 'targets.yaml'); const taskEval = await readFile(evalPath, 'utf8'); const taskTargets = await readFile(targetsPath, 'utf8'); - const indexLine = JSON.parse((await readFile(paths.indexPath, 'utf8')).trim()); expect(indexLine).toMatchObject({ - result_dir: 'trace-case', - task_dir: 'trace-case/task', - eval_path: 'trace-case/task/EVAL.yaml', - targets_path: 'trace-case/task/targets.yaml', - files_path: 'trace-case/task/files', - graders_path: 'trace-case/task/graders', + result_dir: rowDir, + task_dir: `${rowDir}/task`, + eval_path: `${rowDir}/task/EVAL.yaml`, + targets_path: `${rowDir}/task/targets.yaml`, + files_path: `${rowDir}/task/files`, + graders_path: `${rowDir}/task/graders`, }); expect(await readFile(path.join(taskDir, 'files', 'src', 'input.txt'), 'utf8')).toBe( 'input fixture\n', @@ -1998,9 +2092,7 @@ describe('writeArtifactsFromResults', () => { expect(taskTargets).toContain('api_key: "[redacted]"'); expect(taskEval).not.toContain('literal-secret'); expect(taskTargets).not.toContain('literal-secret'); - await expect( - readdir(path.join(outputDir, 'trace-case', '.agentv', 'results')), - ).rejects.toThrow(); + await expect(readdir(path.join(outputDir, rowDir, '.agentv', 'results'))).rejects.toThrow(); await expect(readdir(path.join(taskDir, '.agentv', 'results'))).rejects.toThrow(); }); @@ -2049,13 +2141,17 @@ describe('writeArtifactsFromResults', () => { }, ); - const indexLines = (await readFile(paths.indexPath, 'utf8')) - .trim() - .split('\n') - .map((line) => JSON.parse(line) as IndexArtifactEntry); - expect(indexLines.map((line) => line.task_dir)).toEqual(['alpha/task', 'beta/task']); - expect(await readdir(path.join(testDir, 'multi-out', 'alpha', 'task'))).toContain('EVAL.yaml'); - expect(await readdir(path.join(testDir, 'multi-out', 'beta', 'task'))).toContain('EVAL.yaml'); + const indexLines = await readIndexLines(paths.indexPath); + const rowDirs = indexLines.map((line) => expectRowDir(line, line.test_id)); + expect(indexLines.map((line, index) => line.task_dir)).toEqual( + rowDirs.map((rowDir) => `${rowDir}/task`), + ); + expect(await readdir(path.join(testDir, 'multi-out', rowDirs[0] ?? '', 'task'))).toContain( + 'EVAL.yaml', + ); + expect(await readdir(path.join(testDir, 'multi-out', rowDirs[1] ?? '', 'task'))).toContain( + 'EVAL.yaml', + ); }); it('matches task bundle targets by resolved result target while preserving selected target name', async () => { @@ -2097,11 +2193,12 @@ describe('writeArtifactsFromResults', () => { }, ); - const indexLine = JSON.parse((await readFile(paths.indexPath, 'utf8')).trim()); - expect(indexLine.task_dir).toBe('alias-case/task'); + const [indexLine] = await readIndexLines(paths.indexPath); + const rowDir = expectRowDir(indexLine, 'alias-case'); + expect(indexLine.task_dir).toBe(`${rowDir}/task`); const taskEval = await readFile( - path.join(testDir, 'resolved-target-out', 'alias-case', 'task', 'EVAL.yaml'), + path.join(testDir, 'resolved-target-out', rowDir, 'task', 'EVAL.yaml'), 'utf8', ); const parsedEval = parseYamlValue(taskEval) as Record; @@ -2141,7 +2238,8 @@ describe('writeArtifacts (from JSONL file)', () => { const paths = await writeArtifacts(jsonlPath, outputDir); const artifactEntries = await readdir(paths.testArtifactDir); - expect(artifactEntries).toContain('from-file'); + const [indexLine] = await readIndexLines(paths.indexPath); + expect(artifactEntries).toContain(expectRowDir(indexLine, 'from-file')); expect(artifactEntries).toContain('index.jsonl'); const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); diff --git a/apps/cli/test/commands/eval/bundle.test.ts b/apps/cli/test/commands/eval/bundle.test.ts index c205e8f41..2e6f3e8fa 100644 --- a/apps/cli/test/commands/eval/bundle.test.ts +++ b/apps/cli/test/commands/eval/bundle.test.ts @@ -166,7 +166,7 @@ tests: ../data/cases.yaml expect(run.exitCode).toBe(0); expect(run.stdout).toContain('RESULT: PASS'); - await expectFileExists(path.join(bundleDir, 'run', 'index.jsonl')); + await expectFileExists(path.join(bundleDir, 'run', 'inherited', 'index.jsonl')); }, 60_000); it('reports unbundleable workspace references with their eval location', async () => { diff --git a/apps/cli/test/commands/eval/progress-display.test.ts b/apps/cli/test/commands/eval/progress-display.test.ts index 5b505791c..9f82b9571 100644 --- a/apps/cli/test/commands/eval/progress-display.test.ts +++ b/apps/cli/test/commands/eval/progress-display.test.ts @@ -105,4 +105,24 @@ describe('ProgressDisplay', () => { expect(logs).toEqual(['1/1 ✅ test-01-biosecurity | wtalms-stg | 98% PASS']); }); + + it('does not print provider staging log paths', () => { + const display = new ProgressDisplay(1); + const logs: string[] = []; + const logSpy = mock((message?: unknown) => { + logs.push(String(message ?? '')); + }); + const originalLog = console.log; + console.log = logSpy as typeof console.log; + + try { + display.addLogPaths([ + '/tmp/agentv-provider-streams/run-001/case/logs/codex/codex-stream.log', + ]); + } finally { + console.log = originalLog; + } + + expect(logs).toEqual([]); + }); }); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index 46f1a2c58..4b2b46d7d 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -12,6 +12,7 @@ import path from 'node:path'; import type { GradingArtifact, + IndexArtifactEntry, RunSummaryArtifact, TimingArtifact, } from '../../../src/commands/eval/artifact-writer.js'; @@ -210,13 +211,33 @@ function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } -function artifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { - const testId = record.test_id ?? 'unknown'; - return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId); +function readIndex(outputDir: string): IndexArtifactEntry[] { + return readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8') + .trim() + .split('\n') + .filter(Boolean) + .map((line) => JSON.parse(line) as IndexArtifactEntry); } -function runArtifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { - return path.join(artifactDir(outputDir, record), 'run-1'); +function findIndexEntry( + outputDir: string, + record: { suite?: string; target?: string; test_id?: string }, +): IndexArtifactEntry { + const entry = readIndex(outputDir).find( + (candidate) => + candidate.test_id === (record.test_id ?? 'unknown') && + candidate.target === (record.target ?? 'unknown') && + candidate.suite === record.suite, + ); + expect(entry?.result_dir).toMatch(/^[^/]+--[a-f0-9]{12}$/); + return entry as IndexArtifactEntry; +} + +function runArtifactDir( + outputDir: string, + record: { suite?: string; target?: string; test_id?: string }, +): string { + return path.join(outputDir, findIndexEntry(outputDir, record).result_dir, 'run-1'); } describe('export e2e — multi-provider metrics verification', () => { diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index c512806a0..a9ad11237 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -163,15 +163,6 @@ function toJsonl(...records: object[]): string { return `${records.map((r) => JSON.stringify(r)).join('\n')}\n`; } -function artifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { - const testId = record.test_id ?? 'unknown'; - return path.join(outputDir, ...(record.suite ? [record.suite] : []), testId); -} - -function runArtifactDir(outputDir: string, record: { suite?: string; test_id?: string }): string { - return path.join(artifactDir(outputDir, record), 'run-1'); -} - function readIndex(outputDir: string): IndexArtifactEntry[] { return readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8') .trim() @@ -180,7 +171,38 @@ function readIndex(outputDir: string): IndexArtifactEntry[] { .map((line) => JSON.parse(line) as IndexArtifactEntry); } -function readAnswer(outputDir: string, record: { suite?: string; test_id?: string }): string { +function findIndexEntry( + outputDir: string, + record: { suite?: string; target?: string; test_id?: string }, +): IndexArtifactEntry { + const entry = readIndex(outputDir).find( + (candidate) => + candidate.test_id === (record.test_id ?? 'unknown') && + candidate.target === (record.target ?? 'unknown') && + candidate.suite === record.suite, + ); + expect(entry?.result_dir).toMatch(/^[^/]+--[a-f0-9]{12}$/); + return entry as IndexArtifactEntry; +} + +function artifactDir( + outputDir: string, + record: { suite?: string; target?: string; test_id?: string }, +): string { + return path.join(outputDir, findIndexEntry(outputDir, record).result_dir); +} + +function runArtifactDir( + outputDir: string, + record: { suite?: string; target?: string; test_id?: string }, +): string { + return path.join(artifactDir(outputDir, record), 'run-1'); +} + +function readAnswer( + outputDir: string, + record: { suite?: string; target?: string; test_id?: string }, +): string { return readFileSync(path.join(runArtifactDir(outputDir, record), 'outputs', 'answer.md'), 'utf8'); } @@ -276,7 +298,7 @@ describe('results export', () => { }); expect(first.entries[0].artifact_refs).toMatchObject({ status: 'planned_export', - timing_path: 'privacy/test-private/run-1/timing.json', + timing_path: expect.stringMatching(/^test-private--[a-f0-9]{12}\/run-1\/timing\.json$/), }); expect(first.entries[0].artifact_refs).not.toHaveProperty('input_path'); expect(first.entries[0].artifact_refs).not.toHaveProperty('output_path'); @@ -351,24 +373,26 @@ describe('results export', () => { content: 'full', redaction_level: 'none', }); + const resultDir = bundle.entries[0].artifact_refs.result_dir; + expect(resultDir).toMatch(/^test-private--[a-f0-9]{12}$/); expect(bundle.entries[0].artifact_refs).toMatchObject({ status: 'planned_export', - result_dir: 'privacy/test-private', - summary_path: 'privacy/test-private/summary.json', - grading_path: 'privacy/test-private/run-1/grading.json', - timing_path: 'privacy/test-private/run-1/timing.json', - metrics_path: 'privacy/test-private/run-1/metrics.json', - output_path: 'privacy/test-private/run-1/outputs/answer.md', - answer_path: 'privacy/test-private/run-1/outputs/answer.md', - transcript_path: 'privacy/test-private/run-1/transcript.jsonl', - transcript_raw_path: 'privacy/test-private/run-1/transcript-raw.jsonl', + result_dir: resultDir, + summary_path: `${resultDir}/summary.json`, + grading_path: `${resultDir}/run-1/grading.json`, + timing_path: `${resultDir}/run-1/timing.json`, + metrics_path: `${resultDir}/run-1/metrics.json`, + output_path: `${resultDir}/run-1/outputs/answer.md`, + answer_path: `${resultDir}/run-1/outputs/answer.md`, + transcript_path: `${resultDir}/run-1/transcript.jsonl`, + transcript_raw_path: `${resultDir}/run-1/transcript-raw.jsonl`, }); expect(bundle.entries[0].artifact_refs).not.toHaveProperty('trace_path'); expect(bundle.entries[0].artifact_refs).not.toHaveProperty('input_path'); expect(bundle.entries[0].trace).not.toHaveProperty('envelope_ref'); expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined(); expect(bundle.entries[0].trace_envelope.artifacts).not.toHaveProperty('trace_path'); - expect(bundle.entries[0].feedback.grading_path).toBe('privacy/test-private/run-1/grading.json'); + expect(bundle.entries[0].feedback.grading_path).toBe(`${resultDir}/run-1/grading.json`); expect(bundle.entries[0].raw_content).toBeDefined(); expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence'); expect(serialized).toContain('SECRET_PROMPT_TEXT'); @@ -420,19 +444,21 @@ describe('results export', () => { .map((line) => JSON.parse(line) as IndexArtifactEntry); expect(entries).toHaveLength(1); + const rowDir = entries[0].result_dir; + expect(rowDir).toMatch(/^test-greeting--[a-f0-9]{12}$/); expect(entries[0]).toMatchObject({ test_id: 'test-greeting', target: 'gpt-4o', execution_status: 'ok', - result_dir: 'demo/test-greeting', - summary_path: 'demo/test-greeting/summary.json', - grading_path: 'demo/test-greeting/run-1/grading.json', - timing_path: 'demo/test-greeting/run-1/timing.json', - metrics_path: 'demo/test-greeting/run-1/metrics.json', - output_path: 'demo/test-greeting/run-1/outputs/answer.md', - answer_path: 'demo/test-greeting/run-1/outputs/answer.md', - transcript_path: 'demo/test-greeting/run-1/transcript.jsonl', - transcript_raw_path: 'demo/test-greeting/run-1/transcript-raw.jsonl', + result_dir: rowDir, + summary_path: `${rowDir}/summary.json`, + grading_path: `${rowDir}/run-1/grading.json`, + timing_path: `${rowDir}/run-1/timing.json`, + metrics_path: `${rowDir}/run-1/metrics.json`, + output_path: `${rowDir}/run-1/outputs/answer.md`, + answer_path: `${rowDir}/run-1/outputs/answer.md`, + transcript_path: `${rowDir}/run-1/transcript.jsonl`, + transcript_raw_path: `${rowDir}/run-1/transcript-raw.jsonl`, }); expect(entries[0]).not.toHaveProperty('input_path'); expect(entries[0].projection_identity).toMatchObject({ diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 8cf3b9767..3e0295fcd 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -1457,17 +1457,113 @@ describe('serve app', () => { suite_count: number; }>; }; - expect(categoriesData.categories).toEqual([ - { - name: 'runtime', - total: 3, - passed: 1, - failed: 1, - avg_score: 0.75, - execution_error_count: 1, - suite_count: 1, - }, - ]); + expect(categoriesData.categories).toHaveLength(1); + expect(categoriesData.categories[0]).toMatchObject({ + name: 'runtime', + total: 3, + passed: 1, + failed: 1, + avg_score: 0.75, + execution_error_count: 1, + suite_count: 1, + }); + }); + + it('returns hierarchical category rollups and descendant category drilldown', async () => { + const runsDir = localResultsExperimentDir(tempDir); + mkdirSync(runsDir, { recursive: true }); + const filename = '2026-03-25T10-30-00-000Z'; + const runDir = path.join(runsDir, filename); + mkdirSync(runDir, { recursive: true }); + writeFileSync( + path.join(runDir, 'index.jsonl'), + toJsonl( + { + ...RESULT_A, + test_id: 'network-pass', + suite: 'network-suite', + category: 'security/network', + score: 1, + }, + { + ...RESULT_B, + test_id: 'security-fail', + suite: 'root-suite', + category: 'security', + score: 0, + }, + { + ...RESULT_A, + test_id: 'flat-pass', + suite: 'legacy-suite', + category: 'legacy-flat', + score: 1, + }, + ), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + + const categoriesRes = await app.request(`/api/runs/${filename}/categories`); + expect(categoriesRes.status).toBe(200); + const categoriesData = (await categoriesRes.json()) as { + categories: Array<{ + name: string; + parent?: string; + total: number; + passed: number; + failed: number; + child_count?: number; + }>; + category_tree?: Array<{ name: string; children?: Array<{ name: string }> }>; + }; + + expect(categoriesData.categories).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + name: 'security', + total: 2, + passed: 1, + failed: 1, + child_count: 1, + }), + expect.objectContaining({ + name: 'security/network', + parent: 'security', + total: 1, + passed: 1, + failed: 0, + }), + expect.objectContaining({ + name: 'legacy-flat', + total: 1, + passed: 1, + failed: 0, + }), + ]), + ); + expect(categoriesData.category_tree).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + name: 'security', + children: [expect.objectContaining({ name: 'security/network' })], + }), + ]), + ); + + const suitesRes = await app.request( + `/api/runs/${filename}/categories/${encodeURIComponent('security')}/suites`, + ); + expect(suitesRes.status).toBe(200); + const suitesData = (await suitesRes.json()) as { + suites: Array<{ name: string; total: number }>; + }; + expect(suitesData.suites).toEqual( + expect.arrayContaining([ + expect.objectContaining({ name: 'network-suite', total: 1 }), + expect.objectContaining({ name: 'root-suite', total: 1 }), + ]), + ); }); it('infers the experiment name from the run id when live results have not written it yet', async () => { @@ -3850,6 +3946,61 @@ describe('serve app', () => { }); describe('GET /api/runs/:filename/evals/:evalId/files/*', () => { + it('discovers nested bundle indexes and loads the requested row sidecar by manifest metadata', async () => { + const runsDir = localResultsExperimentDir(tempDir, 'multi-target'); + const timestampDir = path.join(runsDir, '2026-03-25T10-00-00-000Z'); + const alphaDir = 'case-one--111111111111'; + const betaDir = 'case-one--222222222222'; + const alphaBundleDir = path.join(timestampDir, 'storage-alpha'); + const betaBundleDir = path.join(timestampDir, 'storage-beta'); + const alphaAnswer = path.join(alphaBundleDir, alphaDir, 'run-1', 'outputs', 'answer.md'); + const betaAnswer = path.join(betaBundleDir, betaDir, 'run-1', 'outputs', 'answer.md'); + + mkdirSync(path.dirname(alphaAnswer), { recursive: true }); + mkdirSync(path.dirname(betaAnswer), { recursive: true }); + writeFileSync(alphaAnswer, 'alpha answer'); + writeFileSync(betaAnswer, 'beta answer'); + writeFileSync( + path.join(alphaBundleDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'multi-target', + test_id: 'case-one', + target: 'mock-alpha', + result_dir: alphaDir, + answer_path: `${alphaDir}/run-1/outputs/answer.md`, + }), + ); + writeFileSync( + path.join(betaBundleDir, 'index.jsonl'), + toJsonl({ + ...RESULT_A, + experiment: 'multi-target', + test_id: 'case-one', + target: 'mock-beta', + result_dir: betaDir, + answer_path: `${betaDir}/run-1/outputs/answer.md`, + }), + ); + + const app = createApp([], tempDir, tempDir, undefined, { studioDir }); + const listRes = await app.request('/api/runs'); + expect(listRes.status).toBe(200); + const listData = (await listRes.json()) as { + runs: Array<{ filename: string; target?: string }>; + }; + const betaRun = listData.runs.find((run) => run.target === 'mock-beta'); + expect(betaRun?.filename).toBeTruthy(); + + const res = await app.request( + `/api/runs/${encodeURIComponent(betaRun?.filename ?? '')}/evals/case-one/files/${betaDir}/run-1/outputs/answer.md?result_dir=${encodeURIComponent(betaDir)}`, + ); + + expect(res.status).toBe(200); + const data = (await res.json()) as { content: string }; + expect(data.content).toBe('beta answer'); + }); + it('loads file content for experiment-scoped run ids', async () => { const runsDir = localResultsExperimentDir(tempDir, 'with-skills'); const runId = 'with-skills::2026-03-25T10-00-00-000Z'; diff --git a/apps/cli/test/commands/runs/rerun.test.ts b/apps/cli/test/commands/runs/rerun.test.ts index 0e90b7318..28016c0ba 100644 --- a/apps/cli/test/commands/runs/rerun.test.ts +++ b/apps/cli/test/commands/runs/rerun.test.ts @@ -1,5 +1,5 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; -import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; @@ -145,6 +145,28 @@ async function readJsonLines(filePath: string): Promise JSON.parse(line) as Record); } +async function discoverIndexPaths(dir: string): Promise { + const entries = await readdir(dir, { withFileTypes: true }); + if (entries.some((entry) => entry.isFile() && entry.name === 'index.jsonl')) { + return [path.join(dir, 'index.jsonl')]; + } + const discovered: string[] = []; + for (const entry of entries) { + if (entry.isDirectory()) { + discovered.push(...(await discoverIndexPaths(path.join(dir, entry.name)))); + } + } + return discovered.sort(); +} + +async function readOutputBundle( + outputDir: string, +): Promise<{ readonly indexPath: string; readonly rows: readonly Record[] }> { + const [indexPath] = await discoverIndexPaths(outputDir); + expect(indexPath).toBeTruthy(); + return { indexPath, rows: await readJsonLines(indexPath ?? '') }; +} + function extractRerunOutputDir(stdout: string): string { const line = stdout.split(/\r?\n/).find((entry) => entry.startsWith('Rerun output directory:')); if (!line) { @@ -186,7 +208,7 @@ describe('agentv runs rerun', () => { expect(result.exitCode).toBe(0); expect(result.stdout).toContain('Rerunning 2 captured task bundle(s)'); - const rows = await readJsonLines(path.join(created.outputDir, 'index.jsonl')); + const { indexPath, rows } = await readOutputBundle(created.outputDir); expect(rows.map((row) => row.test_id)).toEqual(['case-alpha', 'case-beta']); expect(rows.every((row) => row.target === 'captured')).toBe(true); expect(rows[0].metadata).toMatchObject({ @@ -197,7 +219,7 @@ describe('agentv runs rerun', () => { }, }); - const answerPath = path.join(created.outputDir, String(rows[0].answer_path)); + const answerPath = path.join(path.dirname(indexPath), String(rows[0].answer_path)); const answer = await readFile(answerPath, 'utf8'); expect(answer).toContain('Alpha answer'); expect(answer).not.toContain('Captured answer'); @@ -274,7 +296,7 @@ describe('agentv runs rerun', () => { ]); expect(result.exitCode).toBe(0); - const rows = await readJsonLines(path.join(created.outputDir, 'index.jsonl')); + const { rows } = await readOutputBundle(created.outputDir); expect(rows.map((row) => row.test_id)).toEqual(['case-alpha']); }, 30_000); @@ -291,7 +313,7 @@ describe('agentv runs rerun', () => { expect(result.exitCode).toBe(0); const outputDir = extractRerunOutputDir(result.stdout); expect(path.relative(taskDir, outputDir).startsWith('..')).toBe(true); - const rows = await readJsonLines(path.join(outputDir, 'index.jsonl')); + const { rows } = await readOutputBundle(outputDir); expect(rows.map((row) => row.test_id)).toEqual(['case-alpha']); }, 30_000); @@ -356,7 +378,7 @@ describe('agentv runs rerun', () => { ]); expect(result.exitCode).toBe(0); - const rows = await readJsonLines(path.join(created.outputDir, 'index.jsonl')); + const { rows } = await readOutputBundle(created.outputDir); expect(rows.every((row) => row.target === 'local')).toBe(true); }, 30_000); }); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index cc3ed6c4c..77896ddc5 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -338,16 +338,20 @@ describe('agentv eval CLI', () => { ]); expect(exitCode).toBe(0); - expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl')); + const indexPath = path.join(outputDir, 'file-target', 'index.jsonl'); + expect(extractOutputPath(stdout)).toBe(indexPath); expect(stdout).toContain(`Artifact directory: ${outputDir}`); - const results = await readJsonLines(path.join(outputDir, 'index.jsonl')); + const results = await readJsonLines(indexPath); expect(results).toHaveLength(2); - await expectFileExists(path.join(outputDir, 'summary.json')); - await expectFileExists(path.join(outputDir, 'case-alpha', 'summary.json')); - await expectFileExists(path.join(outputDir, 'case-alpha', 'run-1', 'grading.json')); - await expectFileExists(path.join(outputDir, 'case-beta', 'summary.json')); - await expectFileExists(path.join(outputDir, 'case-beta', 'run-1', 'grading.json')); + await expectFileExists(path.join(outputDir, 'file-target', 'summary.json')); + for (const row of results as Array>) { + const resultDir = row.result_dir as string; + await expectFileExists(path.join(outputDir, 'file-target', resultDir, 'summary.json')); + await expectFileExists( + path.join(outputDir, 'file-target', resultDir, 'run-1', 'grading.json'), + ); + } } finally { await rm(fixture.baseDir, { recursive: true, force: true }); } @@ -362,11 +366,17 @@ describe('agentv eval CLI', () => { const outputDir = path.join(fixture.suiteDir, 'configured-results'); expect(exitCode).toBe(0); - expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl')); - await expectFileExists(path.join(outputDir, 'index.jsonl')); - await expectFileExists(path.join(outputDir, 'summary.json')); - await expectFileExists(path.join(outputDir, 'case-alpha', 'summary.json')); - await expectFileExists(path.join(outputDir, 'case-alpha', 'run-1', 'grading.json')); + const indexPath = path.join(outputDir, 'file-target', 'index.jsonl'); + expect(extractOutputPath(stdout)).toBe(indexPath); + await expectFileExists(indexPath); + await expectFileExists(path.join(outputDir, 'file-target', 'summary.json')); + const [firstRow] = (await readJsonLines(indexPath)) as Array>; + await expectFileExists( + path.join(outputDir, 'file-target', firstRow.result_dir as string, 'summary.json'), + ); + await expectFileExists( + path.join(outputDir, 'file-target', firstRow.result_dir as string, 'run-1', 'grading.json'), + ); } finally { await rm(fixture.baseDir, { recursive: true, force: true }); } @@ -400,17 +410,20 @@ describe('agentv eval CLI', () => { ]); expect(exitCode).toBe(1); - expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl')); + const indexPath = path.join(outputDir, 'file-target', 'index.jsonl'); + expect(extractOutputPath(stdout)).toBe(indexPath); expect(stdout).not.toContain('Export files:'); - const canonicalResults = await readJsonLines(path.join(outputDir, 'index.jsonl')); + const canonicalResults = await readJsonLines(indexPath); expect(canonicalResults).toHaveLength(2); - await expectFileExists(path.join(outputDir, 'summary.json')); + await expectFileExists(path.join(outputDir, 'file-target', 'summary.json')); for (const row of canonicalResults) { expect(row.transcript_path).toMatch(/run-1\/transcript\.jsonl$/); - await expectFileExists(path.join(outputDir, row.transcript_path as string)); + await expectFileExists(path.join(outputDir, 'file-target', row.transcript_path as string)); expect(row.transcript_raw_path).toMatch(/run-1\/transcript-raw\.jsonl$/); - await expectFileExists(path.join(outputDir, row.transcript_raw_path as string)); + await expectFileExists( + path.join(outputDir, 'file-target', row.transcript_raw_path as string), + ); } } finally { await rm(fixture.baseDir, { recursive: true, force: true }); diff --git a/apps/dashboard/src/components/RunDetail.tsx b/apps/dashboard/src/components/RunDetail.tsx index 1f20c2950..944f7a8cb 100644 --- a/apps/dashboard/src/components/RunDetail.tsx +++ b/apps/dashboard/src/components/RunDetail.tsx @@ -22,9 +22,9 @@ import { Link } from '@tanstack/react-router'; import type { EvalResult } from '~/lib/types'; import { useRunLog, useStudioConfig } from '~/lib/api'; +import { type CategoryTreeNode, buildCategoryTree } from '~/lib/category-tree'; import { findPhoenixExternalTraceUrl } from '~/lib/external-trace-link'; import { summarizeQuality } from '~/lib/result-summary'; -import { formatCategoryDisplay } from '~/lib/run-detail-context'; import { PassRatePill } from './PassRatePill'; import { ResultTable } from './ResultTable'; @@ -36,91 +36,21 @@ interface RunDetailProps { projectId?: string; } -interface SuiteStats { - name: string; - passed: number; - failed: number; - executionErrors: number; - total: number; - avgScore: number; -} - -interface CategoryGroup { - name: string; - displayName: string; - mutedDisplayName?: string; - suites: SuiteStats[]; - total: number; - passed: number; - failed: number; - executionErrors: number; - avgScore: number; -} - -function buildCategoryGroups(results: EvalResult[], passThreshold: number): CategoryGroup[] { - const categoryMap = new Map>(); - - for (const r of results) { - const cat = r.category ?? 'Uncategorized'; - const ds = r.suite ?? 'Uncategorized'; - if (!categoryMap.has(cat)) categoryMap.set(cat, new Map()); - // biome-ignore lint/style/noNonNullAssertion: map entry guaranteed by line above - const dsMap = categoryMap.get(cat)!; - const entry = dsMap.get(ds) ?? []; - entry.push(r); - dsMap.set(ds, entry); - } - - return Array.from(categoryMap.entries()) - .map(([catName, dsMap]) => { - const suites = Array.from(dsMap.entries()) - .map(([dsName, suiteResults]) => { - const stats = summarizeQuality(suiteResults, passThreshold); - return { - name: dsName, - passed: stats.passed, - failed: stats.failed, - executionErrors: stats.executionErrors, - total: stats.total, - avgScore: stats.avgScore, - }; - }) - .sort((a, b) => a.name.localeCompare(b.name)); - - const total = suites.reduce((s, d) => s + d.total, 0); - const passed = suites.reduce((s, d) => s + d.passed, 0); - const failed = suites.reduce((s, d) => s + d.failed, 0); - const executionErrors = suites.reduce((s, d) => s + d.executionErrors, 0); - const qualityTotal = total - executionErrors; - const scoreSum = suites.reduce((s, d) => s + d.avgScore * (d.total - d.executionErrors), 0); - - const display = formatCategoryDisplay(catName); - - return { - name: catName, - displayName: display.label, - mutedDisplayName: display.mutedLabel, - suites, - total, - passed, - failed, - executionErrors, - avgScore: qualityTotal > 0 ? scoreSum / qualityTotal : 0, - }; - }) - .sort((a, b) => a.name.localeCompare(b.name)); -} - export function RunDetail({ results, runId, projectId }: RunDetailProps) { const { data: config } = useStudioConfig(projectId); const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; + const [expandedCategories, setExpandedCategories] = useState>({}); const phoenixUrl = findPhoenixExternalTraceUrl(results); const total = results.length; const summary = summarizeQuality(results, passThreshold); const totalCost = results.reduce((sum, r) => sum + (r.costUsd ?? 0), 0); - const categories = buildCategoryGroups(results, passThreshold); + const categoryTree = buildCategoryTree(results, passThreshold); + const visibleCategories = visibleCategoryRows(categoryTree, expandedCategories); + const toggleCategory = (category: string) => { + setExpandedCategories((current) => ({ ...current, [category]: !current[category] })); + }; if (total === 0) { return ( @@ -166,43 +96,59 @@ export function RunDetail({ results, runId, projectId }: RunDetailProps) { - {categories.map((cat) => { - const label = ( - - {cat.displayName} - {cat.mutedDisplayName ? ( - - {cat.mutedDisplayName} - - ) : null} - - ); - + {visibleCategories.map((cat) => { + const expanded = expandedCategories[cat.name] === true; return ( - {projectId ? ( - - {label} - - ) : ( - - {label} - - )} + + + {cat.childCount > 0 ? ( + + ) : ( + + )} + {projectId ? ( + + {cat.label} + + ) : ( + + {cat.label} + + )} + {cat.depth > 0 ? ( + + {cat.name} + + ) : null} + {cat.childCount > 0 ? ( + + {cat.childCount} + + ) : null} + , +): CategoryTreeNode[] { + return nodes.flatMap((node) => [ + node, + ...(expanded[node.name] ? visibleCategoryRows(node.children, expanded) : []), + ]); +} + function ExternalTraceLink({ href }: { href?: string }) { if (!href) return null; diff --git a/apps/dashboard/src/components/RunList.tsx b/apps/dashboard/src/components/RunList.tsx index d6c83421a..1e377133e 100644 --- a/apps/dashboard/src/components/RunList.tsx +++ b/apps/dashboard/src/components/RunList.tsx @@ -529,12 +529,13 @@ export function RunList({
- +
{enableCombine && + + @@ -563,6 +564,7 @@ export function RunList({ const selectionDisabledReason = runSelectionDisabledReason(run); const selectable = !selectionDisabledReason && selectableRunIds.includes(run.filename); + const targetLabel = run.target?.trim() || display.primary; return ( {enableCombine && ( @@ -587,32 +589,39 @@ export function RunList({ - {/* Run name */} - + + {/* Target */} + @@ -654,7 +663,7 @@ export function RunList({ {(hasNextPage || isFetchingNextPage) && (
} - RunExperimentTarget Remote Passed Failures
+ {/* Experiment */} + +
+
+ {experimentNamespace} +
+ {runtimeSourceLabel ? ( +
+ {runtimeSourceLabel} +
+ ) : null} +
+
{metadataDirty ? : null}
- {display.secondary ? ( -
- {display.secondary} -
- ) : null} -
{isFetchingNextPage ? 'Loading more runs...' : 'Scroll to load more...'} diff --git a/apps/dashboard/src/lib/category-tree.test.ts b/apps/dashboard/src/lib/category-tree.test.ts new file mode 100644 index 000000000..bfd60eb07 --- /dev/null +++ b/apps/dashboard/src/lib/category-tree.test.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from 'bun:test'; + +import { buildCategoryTree, flattenCategoryTree, normalizeCategoryPath } from './category-tree'; +import type { EvalResult } from './types'; + +function result(overrides: Partial): EvalResult { + return { + testId: overrides.testId ?? 'case', + suite: overrides.suite ?? 'suite', + category: overrides.category, + score: overrides.score ?? 1, + ...overrides, + }; +} + +describe('category tree model', () => { + it('builds parent rollups from slash-delimited category metadata', () => { + const tree = buildCategoryTree( + [ + result({ testId: 'network-pass', category: 'security/network', score: 1 }), + result({ testId: 'security-fail', category: 'security', score: 0 }), + result({ testId: 'quality-pass', category: 'quality/regression', score: 0.9 }), + ], + 0.8, + ); + + const nodes = flattenCategoryTree(tree); + const security = nodes.find((node) => node.name === 'security'); + const network = nodes.find((node) => node.name === 'security/network'); + + expect(tree.map((node) => node.name)).toEqual(['quality', 'security']); + expect(security).toMatchObject({ + name: 'security', + label: 'security', + total: 2, + passed: 1, + failed: 1, + childCount: 1, + }); + expect(network).toMatchObject({ + name: 'security/network', + label: 'network', + parent: 'security', + depth: 1, + total: 1, + passed: 1, + }); + }); + + it('preserves existing flat categories as one-node paths', () => { + const tree = buildCategoryTree( + [result({ testId: 'flat', category: 'Safety > PII', score: 0.5 })], + 0.8, + ); + + expect(tree).toHaveLength(1); + expect(tree[0]).toMatchObject({ + name: 'Safety > PII', + label: 'Safety > PII', + total: 1, + failed: 1, + children: [], + }); + }); + + it('canonicalizes explicit slash category strings', () => { + expect(normalizeCategoryPath(' security / network ')).toBe('security/network'); + expect(normalizeCategoryPath('security\\network')).toBe('security/network'); + }); +}); diff --git a/apps/dashboard/src/lib/category-tree.ts b/apps/dashboard/src/lib/category-tree.ts new file mode 100644 index 000000000..22ede49c4 --- /dev/null +++ b/apps/dashboard/src/lib/category-tree.ts @@ -0,0 +1,134 @@ +import { summarizeQuality } from './result-summary'; +import type { EvalResult } from './types'; + +export const DEFAULT_CATEGORY = 'Uncategorized'; + +export interface CategoryTreeNode { + name: string; + label: string; + parent?: string; + depth: number; + total: number; + passed: number; + failed: number; + executionErrors: number; + avgScore: number; + suiteCount: number; + childCount: number; + children: CategoryTreeNode[]; +} + +interface CategoryBucket { + results: EvalResult[]; + suites: Set; + children: Set; +} + +export function normalizeCategoryPath(category: string | undefined): string { + const normalized = category + ?.replace(/\\/g, '/') + .split('/') + .map((part) => part.trim()) + .filter((part) => part.length > 0) + .join('/'); + return normalized && normalized.length > 0 ? normalized : DEFAULT_CATEGORY; +} + +export function buildCategoryTree( + results: readonly EvalResult[], + passThreshold: number, +): CategoryTreeNode[] { + const buckets = new Map(); + const ensureBucket = (name: string): CategoryBucket => { + const existing = buckets.get(name); + if (existing) return existing; + const created = { results: [], suites: new Set(), children: new Set() }; + buckets.set(name, created); + return created; + }; + + for (const result of results) { + const category = normalizeCategoryPath(result.category); + const suite = result.suite ?? 'Uncategorized'; + const prefixes = categoryPrefixes(category); + for (const prefix of prefixes) { + const bucket = ensureBucket(prefix); + bucket.results.push(result); + bucket.suites.add(suite); + } + for (let index = 1; index < prefixes.length; index++) { + ensureBucket(prefixes[index - 1]).children.add(prefixes[index]); + } + } + + const nodeByName = new Map( + [...buckets.entries()].map(([name, bucket]) => [ + name, + summarizeCategoryBucket(name, bucket, passThreshold), + ]), + ); + + return [...nodeByName.values()] + .filter((node) => !node.parent) + .sort(compareCategoryNodes) + .map((node) => attachChildren(node, buckets, nodeByName)); +} + +export function flattenCategoryTree(nodes: readonly CategoryTreeNode[]): CategoryTreeNode[] { + return nodes.flatMap((node) => [node, ...flattenCategoryTree(node.children)]); +} + +function categoryPrefixes(category: string): string[] { + const parts = category.split('/').filter((part) => part.length > 0); + if (parts.length === 0) return [DEFAULT_CATEGORY]; + return parts.map((_, index) => parts.slice(0, index + 1).join('/')); +} + +function categoryParent(category: string): string | undefined { + const parts = category.split('/'); + return parts.length > 1 ? parts.slice(0, -1).join('/') : undefined; +} + +function categoryLabel(category: string): string { + return category.split('/').at(-1) ?? category; +} + +function summarizeCategoryBucket( + name: string, + bucket: CategoryBucket, + passThreshold: number, +): CategoryTreeNode { + const summary = summarizeQuality(bucket.results, passThreshold); + const parent = categoryParent(name); + return { + name, + label: categoryLabel(name), + ...(parent && { parent }), + depth: name.split('/').filter(Boolean).length - 1, + total: summary.total, + passed: summary.passed, + failed: summary.failed, + executionErrors: summary.executionErrors, + avgScore: summary.avgScore, + suiteCount: bucket.suites.size, + childCount: bucket.children.size, + children: [], + }; +} + +function attachChildren( + node: CategoryTreeNode, + buckets: Map, + nodeByName: Map, +): CategoryTreeNode { + const children = [...(buckets.get(node.name)?.children ?? [])] + .map((childName) => nodeByName.get(childName)) + .filter((child): child is CategoryTreeNode => Boolean(child)) + .sort(compareCategoryNodes) + .map((child) => attachChildren(child, buckets, nodeByName)); + return { ...node, children }; +} + +function compareCategoryNodes(first: CategoryTreeNode, second: CategoryTreeNode): number { + return first.name.localeCompare(second.name); +} diff --git a/apps/dashboard/src/lib/score-distribution.test.ts b/apps/dashboard/src/lib/score-distribution.test.ts index 6e45f91c6..bba65b3be 100644 --- a/apps/dashboard/src/lib/score-distribution.test.ts +++ b/apps/dashboard/src/lib/score-distribution.test.ts @@ -91,6 +91,67 @@ describe('buildScoreDistributionModel', () => { ]); }); + it('treats parent category filters as descendant rollups from category metadata', () => { + const data = compareFixture(); + if (data.runs) { + data.runs[0].tests = [ + { + test_id: 'network', + category: 'security/network', + score: 0.45, + passed: false, + }, + { + test_id: 'application', + category: 'security/application', + score: 0.85, + passed: true, + }, + ]; + } + + const model = buildScoreDistributionModel(data, filters({ category: 'security' }), NOW); + + expect(model.categoryOptions).toEqual( + expect.arrayContaining([ + { value: 'security', label: 'security', count: 2 }, + { value: 'security/application', label: 'security/application', count: 1 }, + { value: 'security/network', label: 'security/network', count: 1 }, + ]), + ); + expect(model.filteredScores).toBe(2); + }); + + it('does not derive category metadata from eval paths', () => { + const data = { + experiments: ['exp-a'], + targets: ['gpt-4o'], + cells: [ + { + experiment: 'exp-a', + target: 'gpt-4o', + eval_count: 1, + passed_count: 1, + pass_rate: 1, + avg_score: 1, + tests: [ + { + test_id: 'path-only', + eval_path: 'security/network.eval.yaml', + score: 1, + passed: true, + }, + ], + }, + ], + } as unknown as CompareResponse; + + const model = buildScoreDistributionModel(data, filters({ category: 'security' }), NOW); + + expect(model.categoryAvailable).toBe(false); + expect(model.filteredScores).toBe(0); + }); + it('returns empty buckets when no scores match the selected slice', () => { const model = buildScoreDistributionModel( compareFixture(), diff --git a/apps/dashboard/src/lib/score-distribution.ts b/apps/dashboard/src/lib/score-distribution.ts index efd43db3d..dfa2fe1bc 100644 --- a/apps/dashboard/src/lib/score-distribution.ts +++ b/apps/dashboard/src/lib/score-distribution.ts @@ -8,6 +8,7 @@ * metadata field is needed, then filter samples in `buildScoreDistributionModel`. */ +import { normalizeCategoryPath } from './category-tree'; import type { CompareResponse, CompareRunEntry, CompareTestResult } from './types'; export const ALL_DISTRIBUTION_FILTER_VALUE = ''; @@ -68,7 +69,9 @@ export function buildScoreDistributionModel( ): ScoreDistributionModel { const samples = collectScoreSamples(data); const experimentOptions = buildExperimentOptions(data, samples); - const categoryOptions = buildOptions(samples.flatMap((sample) => sample.category ?? [])); + const categoryOptions = buildOptions( + samples.flatMap((sample) => (sample.category ? categoryPrefixes(sample.category) : [])), + ); const categoryAvailable = categoryOptions.length > 0; const hasTimestampedScores = samples.some((sample) => sample.startedAtMs !== undefined); const activePeriod = @@ -79,7 +82,7 @@ export function buildScoreDistributionModel( const filtered = samples.filter((sample) => { if (filters.experiment && sample.experiment !== filters.experiment) return false; - if (filters.category && sample.category !== filters.category) return false; + if (filters.category && !isCategoryDescendant(sample.category, filters.category)) return false; if (windowStartMs !== undefined) { return sample.startedAtMs !== undefined && sample.startedAtMs >= windowStartMs; } @@ -174,7 +177,19 @@ function buildBuckets(scores: number[]): ScoreDistributionBucket[] { function normalizeCategory(value: string | undefined): string | undefined { const trimmed = value?.trim(); - return trimmed ? trimmed : undefined; + return trimmed ? normalizeCategoryPath(trimmed) : undefined; +} + +function categoryPrefixes(category: string): string[] { + const parts = category.split('/').filter((part) => part.length > 0); + return parts.map((_, index) => parts.slice(0, index + 1).join('/')); +} + +function isCategoryDescendant(category: string | undefined, selectedCategory: string): boolean { + return ( + category !== undefined && + (category === selectedCategory || category.startsWith(`${selectedCategory}/`)) + ); } function parseTimestamp(value: string): number | undefined { diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts index f928a3ea9..ea5084dea 100644 --- a/apps/dashboard/src/lib/types.ts +++ b/apps/dashboard/src/lib/types.ts @@ -502,16 +502,22 @@ export interface FileContentResponse { export interface CategorySummary { name: string; + label?: string; + parent?: string; + depth?: number; total: number; passed: number; failed: number; avg_score: number; execution_error_count?: number; suite_count: number; + child_count?: number; + children?: CategorySummary[]; } export interface CategoriesResponse { categories: CategorySummary[]; + category_tree?: CategorySummary[]; } export interface StudioConfigResponse { diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index 5b91ebd1e..1ffdbb25e 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -109,6 +109,7 @@ tests: |-------|-------------| | `description` | Human-readable description of the evaluation | | `suite` | Optional suite identifier | +| `category` | Optional slash-delimited analytics taxonomy path. Overrides the category derived from the eval file path. | | `experiment` | Runtime policy (`target`, `targets`, `workers`, `repeat`, `threshold`, `timeout_seconds`, `budget_usd`, etc.) | | `workspace` | Suite-level task environment — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). | | `tests` | Array of individual tests, include entries, or a string path to an external file or directory. Tests and include entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. | @@ -154,6 +155,13 @@ tests: input: Screen "Acme Corp" against denied parties list ``` +When `category` is omitted, AgentV derives it from the eval file path. Generic +filenames do not add a leaf: `security/eval.yaml` becomes `security`, and +`security/network/dataset.eval.yaml` becomes `security/network`. A meaningful +named eval file contributes a leaf, so `security/network.eval.yaml` becomes +`security/network`. Existing flat category strings remain valid one-node +category paths. + ### Suite-level Assertions The `assertions` field is the canonical way to define suite-level graders. Suite-level assertions are appended to every test's graders unless a test sets `execution.skip_defaults: true`. diff --git a/packages/core/src/evaluation/category.ts b/packages/core/src/evaluation/category.ts index 7f4a39e5a..e09bfdde0 100644 --- a/packages/core/src/evaluation/category.ts +++ b/packages/core/src/evaluation/category.ts @@ -1,18 +1,52 @@ -/** Default category for eval files without subdirectory structure. */ +/** Default category for eval files without category taxonomy metadata. */ export const DEFAULT_CATEGORY = 'Uncategorized'; +const GENERIC_EVAL_FILE_STEMS = new Set(['eval', 'dataset']); + /** - * Derive a human-readable category from an eval file's relative path. + * Canonicalize analytics category taxonomy paths. * - * Strips the filename and any `evals` directory segments, then joins - * remaining directories with `/`. Returns {@link DEFAULT_CATEGORY} for files - * at the root level. + * Categories are slash-delimited analytics paths, not filesystem paths. Existing + * flat labels remain valid one-node paths, while repeated slash separators and + * surrounding whitespace are normalized for derived and explicit categories. + */ +export function normalizeCategoryPath(category: string | undefined): string { + const normalized = category + ?.replace(/\\/g, '/') + .split('/') + .map((part) => part.trim()) + .filter((part) => part.length > 0) + .join('/'); + return normalized && normalized.length > 0 ? normalized : DEFAULT_CATEGORY; +} + +function evalFileStem(fileName: string): string { + return fileName.replace(/\.eval\.[^.]+$/i, '').replace(/\.[^.]+$/i, ''); +} + +/** + * Derive a canonical slash-delimited analytics category path from an eval file. + * + * Generic eval filenames such as `eval.yaml` and `dataset.eval.yaml` do not add + * a taxonomy leaf. Meaningful named eval files such as `network.eval.yaml` do + * contribute a leaf. Any `evals` directory segment is treated as organization + * only and is removed from the analytics taxonomy. */ export function deriveCategory(relativePath: string): string { - const parts = relativePath.split(/[/\\]/); - if (parts.length <= 1) { + const parts = relativePath + .split(/[/\\]/) + .map((part) => part.trim()) + .filter((part) => part.length > 0); + const fileName = parts.at(-1); + if (!fileName) { return DEFAULT_CATEGORY; } - const dirs = parts.slice(0, -1).filter((d) => d !== 'evals'); - return dirs.length > 0 ? dirs.join('/') : DEFAULT_CATEGORY; + + const taxonomyParts = parts.slice(0, -1).filter((part) => part !== 'evals'); + const stem = evalFileStem(fileName).trim(); + if (stem && !GENERIC_EVAL_FILE_STEMS.has(stem.toLowerCase())) { + taxonomyParts.push(stem); + } + + return normalizeCategoryPath(taxonomyParts.join('/')); } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 3d49090e4..bce43a88a 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -693,6 +693,7 @@ export async function gradePreparedEvalCase( const baseResult = { timestamp: timestamp.toISOString(), testId: evalCase.id, + source: evalCase.source, suite: evalCase.suite, category: evalCase.category, conversationId: evalCase.conversation_id, @@ -2558,6 +2559,7 @@ async function evaluateCandidate(options: { return { timestamp: completedAt.toISOString(), testId: evalCase.id, + source: evalCase.source, suite: evalCase.suite, category: evalCase.category, conversationId: evalCase.conversation_id, diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index 2be54ac7d..27dbd2312 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -8,7 +8,8 @@ */ import { createHash } from 'node:crypto'; -import { copyFile, mkdir, readFile, writeFile } from 'node:fs/promises'; +import { copyFile, mkdir, readFile, rm, rmdir, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; import path from 'node:path'; import { @@ -88,8 +89,51 @@ export interface RunRuntimeSourceMetadata { readonly source_eval_files?: readonly string[]; } -export function buildTestTargetKey(testId?: string, target?: string): string { - return `${testId ?? 'unknown'}::${target ?? 'unknown'}`; +export function buildTestTargetKey(testId?: string, target?: string, variant?: string): string { + return `${testId ?? 'unknown'}::${target ?? 'unknown'}::${variant ?? ''}`; +} + +function stringField(record: Record | undefined, key: string): string | undefined { + const value = record?.[key]; + return typeof value === 'string' && value.trim().length > 0 ? value : undefined; +} + +function resultProjectionDimensions(result: EvaluationResult): Record | undefined { + const projectionIdentity = (result as unknown as Record).projectionIdentity; + if (!isRecord(projectionIdentity)) { + return undefined; + } + const dimensions = projectionIdentity.dimensions; + return isRecord(dimensions) ? dimensions : undefined; +} + +export function buildEvaluationResultTargetKey(result: EvaluationResult): string { + const dimensions = resultProjectionDimensions(result); + return JSON.stringify({ + eval_path: + stringField(dimensions, 'evalPath') ?? + sourceEvalPath(result, undefined) ?? + stringField(result as unknown as Record, 'evalPath') ?? + null, + suite: stringField(dimensions, 'suite') ?? getSuite(result) ?? null, + test_id: stringField(dimensions, 'testId') ?? result.testId ?? 'unknown', + target: stringField(dimensions, 'target') ?? result.target ?? 'unknown', + variant: stringField(dimensions, 'variant') ?? result.variant ?? null, + }); +} + +export function buildEvalTestTargetKey( + test: Pick, + target?: string, + variant?: string, +): string { + return JSON.stringify({ + eval_path: evalSourcePath(test.source) ?? null, + suite: test.suite ?? null, + test_id: test.id ?? 'unknown', + target: target ?? 'unknown', + variant: variant ?? null, + }); } export function deduplicateByTestIdTarget( @@ -97,11 +141,11 @@ export function deduplicateByTestIdTarget( ): EvaluationResult[] { const seen = new Map(); for (let i = 0; i < results.length; i++) { - seen.set(buildTestTargetKey(results[i].testId, results[i].target), i); + seen.set(buildEvaluationResultTargetKey(results[i]), i); } const deduped: EvaluationResult[] = []; for (let i = 0; i < results.length; i++) { - const key = buildTestTargetKey(results[i].testId, results[i].target); + const key = buildEvaluationResultTargetKey(results[i]); if (seen.get(key) === i) { deduped.push(results[i]); } @@ -300,6 +344,7 @@ export interface RunSummaryArtifact { readonly eval_file: string; readonly timestamp: string; readonly targets: readonly string[]; + readonly variants?: readonly string[]; readonly tests_run: readonly string[]; readonly experiment?: string; readonly experiment_config?: ExperimentArtifactMetadata; @@ -345,6 +390,7 @@ export interface IndexArtifactEntry { readonly experiment?: string; readonly score: number; readonly target: string; + readonly variant?: string; readonly token_usage?: EvaluationResult['tokenUsage']; readonly cost_usd?: number; readonly duration_ms?: number; @@ -889,7 +935,6 @@ async function writeTrialRunArtifacts(params: { const envelope = buildTraceEnvelopeSidecar({ result, outputDir: params.outputDir, - testDir: runDir, evalPath: resolveEnvelopeEvalPath(result, params.testByTestId, params.evalFile), experiment: params.experiment, runId: attemptRunId, @@ -1178,13 +1223,18 @@ export function buildRunSummaryArtifact( runtimeSource?: RunRuntimeSourceMetadata, ): RunSummaryArtifact { const targetSet = new Set(); + const variantSet = new Set(); const testIdSet = new Set(); for (const result of results) { targetSet.add(result.target ?? 'unknown'); + if (result.variant) { + variantSet.add(result.variant); + } testIdSet.add(result.testId ?? 'unknown'); } const targets = [...targetSet].sort(); + const variants = [...variantSet].sort(); const testIds = [...testIdSet].sort(); const runSummary: RunSummaryArtifact['run_summary'] = {}; @@ -1264,6 +1314,7 @@ export function buildRunSummaryArtifact( eval_file: evalFile, timestamp, targets, + variants: variants.length > 0 ? variants : undefined, tests_run: testIds, experiment, experiment_config: experimentMetadata, @@ -1344,25 +1395,62 @@ function safeTestId(testId: string | undefined): string { return safeArtifactPathSegment(testId, 'unknown'); } +const ROW_ID_PREFIX_MAX_LENGTH = 64; +const ROW_ID_HASH_LENGTH = 12; + function getSuite(result: EvaluationResult): string | undefined { return result.suite; } +function evalSourcePath(source: EvalTest['source'] | undefined): string | undefined { + return source?.evalFileRepoPath ?? source?.evalFilePath; +} + +function sourceEvalPath( + result: EvaluationResult, + sourceTest: EvalTest | undefined, +): string | undefined { + return evalSourcePath(result.source) ?? evalSourcePath(sourceTest?.source); +} + +function compactRowIdPrefix(testId: string | undefined): string { + const safe = safeTestId(testId); + return safe.length > ROW_ID_PREFIX_MAX_LENGTH ? safe.slice(0, ROW_ID_PREFIX_MAX_LENGTH) : safe; +} + +function buildRowArtifactHashInput( + result: EvaluationResult, + sourceTest?: EvalTest, + projectionIdentity?: ProjectionIdentity, +): { + readonly eval_path: string | null; + readonly suite: string | null; + readonly test_id: string; + readonly target: string; + readonly variant: string | null; +} { + const dimensions = projectionIdentity?.dimensions; + return { + eval_path: dimensions?.evalPath ?? sourceEvalPath(result, sourceTest) ?? null, + suite: dimensions?.suite ?? getSuite(result) ?? null, + test_id: dimensions?.testId ?? result.testId ?? 'unknown', + target: dimensions?.target ?? result.target ?? 'unknown', + variant: dimensions?.variant ?? result.variant ?? null, + }; +} + function buildArtifactSubdir( result: EvaluationResult, - resultGroup?: string, + _resultGroup?: string, sourceTest?: EvalTest, + projectionIdentity?: ProjectionIdentity, ): string { - const segments = []; - const evalSet = getSuite(result); - const importedSuiteName = sourceTest?.source?.importedSuiteName; - if (importedSuiteName !== undefined) { - segments.push(safeArtifactPathSegment(importedSuiteName, 'default')); - } else if (evalSet && evalSet !== resultGroup) { - segments.push(safeArtifactPathSegment(evalSet, 'default')); - } - segments.push(safeTestId(result.testId)); - return path.posix.join(...segments); + const hashInput = buildRowArtifactHashInput(result, sourceTest, projectionIdentity); + const digest = createHash('sha256') + .update(JSON.stringify(hashInput)) + .digest('hex') + .slice(0, ROW_ID_HASH_LENGTH); + return `${compactRowIdPrefix(hashInput.test_id)}--${digest}`; } function toRelativeArtifactPath(outputDir: string, filePath: string): string { @@ -1374,6 +1462,13 @@ function findResultSourceTest( testByTestId: ReadonlyMap, ): EvalTest | undefined { const testId = result.testId ?? 'unknown'; + const resultSourcePath = evalSourcePath(result.source); + if (resultSourcePath) { + const sourceMatch = testByTestId.get(sourceTestLookupKey(`source:${resultSourcePath}`, testId)); + if (sourceMatch) { + return sourceMatch; + } + } const suite = getSuite(result); if (suite) { const suiteMatch = testByTestId.get(sourceTestLookupKey(suite, testId)); @@ -1397,6 +1492,10 @@ function buildSourceTestLookup( if (test.suite) { lookup.set(sourceTestLookupKey(test.suite, test.id), test); } + const sourcePath = evalSourcePath(test.source); + if (sourcePath) { + lookup.set(sourceTestLookupKey(`source:${sourcePath}`, test.id), test); + } if (!lookup.has(test.id)) { lookup.set(test.id, test); } @@ -1422,10 +1521,38 @@ function rawProviderLogSourcePath(result: EvaluationResult): string | undefined return sourcePath ? sourcePath : undefined; } +function providerStagingRoot(): string { + return path.resolve(tmpdir(), 'agentv-provider-streams'); +} + +function isAgentvProviderStagingPath(filePath: string): boolean { + const root = providerStagingRoot(); + const resolved = path.resolve(filePath); + return resolved.startsWith(`${root}${path.sep}`); +} + +async function cleanupProviderStagingFile(filePath: string): Promise { + if (!isAgentvProviderStagingPath(filePath)) { + return; + } + + await rm(filePath, { force: true }); + + const root = providerStagingRoot(); + let current = path.dirname(path.resolve(filePath)); + while (current !== root && current.startsWith(`${root}${path.sep}`)) { + try { + await rmdir(current); + } catch { + break; + } + current = path.dirname(current); + } +} + interface TraceEnvelopeSidecarParams { readonly result: EvaluationResult; readonly outputDir: string; - readonly testDir: string; readonly evalPath?: string; readonly experiment?: string; readonly runId?: string; @@ -1438,6 +1565,7 @@ function buildTraceEnvelopeSidecar(params: TraceEnvelopeSidecarParams): TraceEnv evalPath: params.evalPath, runId: params.runId ?? path.basename(params.outputDir), experiment: params.experiment, + variant: params.result.variant, source: { path: RESULT_INDEX_FILENAME }, capture: { content: 'full', redactionLevel: 'none', redactedFields: [] }, artifacts: { @@ -1478,6 +1606,7 @@ export function buildIndexArtifactEntry( conversation_id: result.conversationId, score: result.score, target: result.target ?? 'unknown', + variant: result.variant, token_usage: result.tokenUsage, cost_usd: result.costUsd, duration_ms: result.durationMs, @@ -1543,7 +1672,12 @@ export function buildResultIndexArtifact( runtimeSource?: RunRuntimeSourceMetadata; }, ): ResultIndexArtifact { - const artifactSubdir = buildArtifactSubdir(result); + const artifactSubdir = buildArtifactSubdir( + result, + undefined, + undefined, + options?.projectionIdentity, + ); const hasAnswer = result.output.length > 0; const hasTranscript = resultHasExecutionTraceTranscript(result); const isSingleRun = !hasPersistedTrialRuns(result); @@ -1557,6 +1691,7 @@ export function buildResultIndexArtifact( conversation_id: result.conversationId, score: result.score, target: result.target ?? 'unknown', + variant: result.variant, token_usage: result.tokenUsage, cost_usd: result.costUsd, duration_ms: result.durationMs, @@ -1643,6 +1778,7 @@ async function writeRawTranscriptJsonl( const rawSource = rawProviderLogSourcePath(result); if (rawSource) { await copyFile(rawSource, filePath); + await cleanupProviderStagingFile(rawSource).catch(() => undefined); return; } await writeGeneratedRawTranscriptJsonl(filePath, result, envelope); @@ -1694,7 +1830,12 @@ function indexRecordKey(record: unknown): string | undefined { ? record.testId : undefined; const target = typeof record.target === 'string' ? record.target : undefined; - return testId ? buildTestTargetKey(testId, target) : undefined; + const variant = typeof record.variant === 'string' ? record.variant : undefined; + return testId ? buildTestTargetKey(testId, target, variant) : undefined; +} + +function indexRecordReplacementKey(record: unknown): string | undefined { + return projectionIdentityRecordKey(record) ?? indexRecordKey(record); } function projectionIdentityRecordKey(record: unknown): string | undefined { @@ -1780,7 +1921,10 @@ async function rewriteExistingIndexRecords( } const replacementsByKey = new Map( - replacements.map((record) => [buildTestTargetKey(record.test_id, record.target), record]), + replacements.flatMap((record) => { + const key = indexRecordReplacementKey(record); + return key ? [[key, record] as const] : []; + }), ); const seen = new Set(); const records: unknown[] = []; @@ -1790,7 +1934,7 @@ async function rewriteExistingIndexRecords( } try { const parsed = JSON.parse(line) as unknown; - const key = indexRecordKey(parsed); + const key = indexRecordReplacementKey(parsed); const replacement = key ? replacementsByKey.get(key) : undefined; if (key && replacement) { records.push(replacement); @@ -1802,8 +1946,8 @@ async function rewriteExistingIndexRecords( } for (const replacement of replacements) { - const key = buildTestTargetKey(replacement.test_id, replacement.target); - if (!seen.has(key)) { + const key = indexRecordReplacementKey(replacement); + if (!key || !seen.has(key)) { records.push(replacement); } } @@ -2011,14 +2155,11 @@ export async function writePerTestArtifacts( for (const result of results) { const sourceTest = findResultSourceTest(result, testByTestId); - const artifactSubdir = buildArtifactSubdir(result, options?.resultGroup, sourceTest); - const testDir = path.join(outputDir, artifactSubdir); - await mkdir(testDir, { recursive: true }); + const evalPath = resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile); const envelope = buildTraceEnvelopeSidecar({ result, outputDir, - testDir, - evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile), + evalPath, experiment: options?.experiment, runId: options?.runId, duplicatePolicy, @@ -2027,6 +2168,14 @@ export async function writePerTestArtifacts( if (!projectionIdentity) { throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); } + const artifactSubdir = buildArtifactSubdir( + result, + options?.resultGroup, + sourceTest, + projectionIdentity, + ); + const testDir = path.join(outputDir, artifactSubdir); + await mkdir(testDir, { recursive: true }); const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME); const aggregateTiming = buildRepeatAggregateTimingArtifact(result); const summary = buildRepeatCaseSummaryArtifact(result, aggregateTiming, projectionIdentity.id); @@ -2130,14 +2279,11 @@ export async function writeArtifactsFromResults( const plans = results.map((result) => { const sourceTest = findResultSourceTest(result, testByTestId); - const artifactSubdir = buildArtifactSubdir(result, options?.resultGroup, sourceTest); - const testDir = path.join(outputDir, artifactSubdir); - const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME); + const evalPath = resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile); const envelope = buildTraceEnvelopeSidecar({ result, outputDir, - testDir, - evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile), + evalPath, experiment: options?.experiment, runId: options?.runId, duplicatePolicy, @@ -2146,6 +2292,14 @@ export async function writeArtifactsFromResults( if (!projectionIdentity) { throw new Error(`Result ${result.testId ?? 'unknown'} is missing projection identity`); } + const artifactSubdir = buildArtifactSubdir( + result, + options?.resultGroup, + sourceTest, + projectionIdentity, + ); + const testDir = path.join(outputDir, artifactSubdir); + const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME); const identityId = projectionIdentity.id; const isSingleRun = !hasPersistedTrialRuns(result); const singleRunDir = path.join(testDir, trialRunDirName(0)); diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 7d93c1762..647871c25 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1180,12 +1180,15 @@ export type FailOnError = boolean; export interface EvaluationResult { readonly timestamp: string; readonly testId: string; + readonly source?: EvalTestSource; readonly suite?: string; readonly category?: string; readonly conversationId?: string; readonly score: number; readonly assertions: readonly AssertionEntry[]; readonly target: string; + /** Optional explicit comparable variant. Path segments are not authoritative for this value. */ + readonly variant?: string; /** * The target that actually served the response, when different from the * primary target. Present only when a fallback target was used. diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index df047d03e..4d3faf794 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -4,6 +4,7 @@ import fg from 'fast-glob'; import micromatch from 'micromatch'; import { stringify as stringifyYaml } from 'yaml'; +import { normalizeCategoryPath } from './category.js'; import { type ExperimentConfig, normalizeExperimentConfig, @@ -789,10 +790,12 @@ async function loadTestsFromParsedYamlValue( ? (renderedCase.window_size as number) : undefined; + const category = normalizeCategoryPath(suite.category ?? options?.category); + const testCase: EvalTest = { id, suite: suiteName, - category: suite.category ?? options?.category, + category, conversation_id: conversationId, question: question, input: inputMessages, diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 00b441734..072bbded2 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -63,6 +63,8 @@ export { aggregateRunDir, buildAggregateGradingArtifact, buildRunSummaryArtifact, + buildEvalTestTargetKey, + buildEvaluationResultTargetKey, buildGradingArtifact, buildIndexArtifactEntry, buildResultIndexArtifact, @@ -186,7 +188,7 @@ export { } from './projects.js'; export { syncProject, syncProjects } from './project-sync.js'; export { trimBaselineResult } from './evaluation/baseline.js'; -export { DEFAULT_CATEGORY, deriveCategory } from './evaluation/category.js'; +export { DEFAULT_CATEGORY, deriveCategory, normalizeCategoryPath } from './evaluation/category.js'; export * from './observability/index.js'; // Registry exports diff --git a/packages/core/test/evaluation/category.test.ts b/packages/core/test/evaluation/category.test.ts index 9b8c62d01..3dc3f70f9 100644 --- a/packages/core/test/evaluation/category.test.ts +++ b/packages/core/test/evaluation/category.test.ts @@ -1,27 +1,35 @@ import { describe, expect, test } from 'bun:test'; -import { DEFAULT_CATEGORY, deriveCategory } from '../../src/evaluation/category.js'; +import { + DEFAULT_CATEGORY, + deriveCategory, + normalizeCategoryPath, +} from '../../src/evaluation/category.js'; describe('deriveCategory', () => { test('returns Uncategorized for single-segment path (root-level file)', () => { expect(deriveCategory('dataset.eval.yaml')).toBe(DEFAULT_CATEGORY); }); + test('uses a meaningful root-level eval filename as a one-node category path', () => { + expect(deriveCategory('network.eval.yaml')).toBe('network'); + }); + test('returns Uncategorized when only directory is evals', () => { expect(deriveCategory('evals/dataset.eval.yaml')).toBe(DEFAULT_CATEGORY); }); - test('strips evals segment and returns remaining directory', () => { - expect(deriveCategory('evals/fundamentals/greetings.eval.yaml')).toBe('fundamentals'); + test('strips evals segment and appends meaningful named eval files as a leaf', () => { + expect(deriveCategory('evals/fundamentals/greetings.eval.yaml')).toBe('fundamentals/greetings'); }); - test('preserves nested directory paths', () => { + test('does not append generic eval filenames to nested directory paths', () => { expect(deriveCategory('evals/cargowise-customs/layout-engine/eval.yaml')).toBe( 'cargowise-customs/layout-engine', ); }); - test('handles paths without evals segment', () => { + test('handles generic filenames without evals segment', () => { expect(deriveCategory('examples/showcase/eval.yaml')).toBe('examples/showcase'); }); @@ -38,4 +46,27 @@ describe('deriveCategory', () => { test('returns Uncategorized for just a filename with no directory', () => { expect(deriveCategory('eval.yaml')).toBe(DEFAULT_CATEGORY); }); + + test('matches the hierarchical category derivation contract', () => { + expect(deriveCategory('security/eval.yaml')).toBe('security'); + expect(deriveCategory('security/network.eval.yaml')).toBe('security/network'); + expect(deriveCategory('security/network/dataset.eval.yaml')).toBe('security/network'); + }); +}); + +describe('normalizeCategoryPath', () => { + test('canonicalizes explicit slash-delimited taxonomy paths', () => { + expect(normalizeCategoryPath(' security / network ')).toBe('security/network'); + expect(normalizeCategoryPath('security//network')).toBe('security/network'); + expect(normalizeCategoryPath('security\\network')).toBe('security/network'); + }); + + test('preserves existing flat category strings as one-node paths', () => { + expect(normalizeCategoryPath('Safety > PII')).toBe('Safety > PII'); + }); + + test('returns Uncategorized for empty explicit categories', () => { + expect(normalizeCategoryPath(' / ')).toBe(DEFAULT_CATEGORY); + expect(normalizeCategoryPath(undefined)).toBe(DEFAULT_CATEGORY); + }); }); diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts index cba36f5d2..97be77621 100644 --- a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts +++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts @@ -141,6 +141,7 @@ describe('evaluate() — programmatic API extensions', () => { .trim() .split('\n') .map((line) => JSON.parse(line) as { result_dir?: string }); + const resultDir = indexRow?.result_dir; const summaryArtifact = JSON.parse( await readFile(path.join(outputDir, 'summary.json'), 'utf8'), @@ -153,29 +154,12 @@ describe('evaluate() — programmatic API extensions', () => { expect(summaryArtifact.metadata.eval_file).toBe(''); expect(summaryArtifact.timing.duration_ms).toBeGreaterThanOrEqual(0); - expect(indexRow?.result_dir).toBe('__programmatic__.yaml/programmatic-artifacts'); + expect(resultDir).toMatch(/^programmatic-artifacts--[a-f0-9]{12}$/); + expect(existsSync(path.join(outputDir, resultDir ?? '', 'run-1', 'grading.json'))).toBe( + true, + ); expect( - existsSync( - path.join( - outputDir, - '__programmatic__.yaml', - 'programmatic-artifacts', - 'run-1', - 'grading.json', - ), - ), - ).toBe(true); - expect( - existsSync( - path.join( - outputDir, - '__programmatic__.yaml', - 'programmatic-artifacts', - 'run-1', - 'outputs', - 'answer.md', - ), - ), + existsSync(path.join(outputDir, resultDir ?? '', 'run-1', 'outputs', 'answer.md')), ).toBe(true); } finally { rmSync(outputDir, { recursive: true, force: true }); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 3e7e81e7c..9ddaa1c05 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -1,5 +1,12 @@ import { afterEach, describe, expect, it, mock } from 'bun:test'; -import { mkdtempSync, readFileSync, readdirSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + readdirSync, + writeFileSync, +} from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -723,9 +730,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(result.failureReasonCode).toBe('provider_error'); }); - it('stores raw provider logs once as transcript-raw evidence', async () => { - const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-raw-provider-log-')); - const rawLogPath = path.join(tempDir, 'provider-native-session.jsonl'); + it('stores raw provider logs once as transcript-raw evidence and cleans staging files', async () => { + const stagingRoot = path.join(tmpdir(), 'agentv-provider-streams'); + mkdirSync(stagingRoot, { recursive: true }); + const tempDir = mkdtempSync(path.join(stagingRoot, 'raw-provider-log-')); + const rawLogDir = path.join(tempDir, 'suite', 'case-1', 'logs', 'codex'); + mkdirSync(rawLogDir, { recursive: true }); + const rawLogPath = path.join(rawLogDir, 'provider-native-session.jsonl'); writeFileSync(rawLogPath, '{"event":"provider-native"}\n', 'utf8'); const provider = new SequenceProvider('mock', { @@ -749,8 +760,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, const outputDir = path.join(tempDir, 'artifacts'); await writeArtifactsFromResults([result], outputDir); - const artifactDir = path.join(outputDir, 'test-dataset', 'case-1'); - const runDir = path.join(artifactDir, 'run-1'); + const indexRows = readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as Record); + const resultDir = indexRows[0]?.result_dir; + expect(resultDir).toMatch(/^case-1--[a-f0-9]{12}$/); + const runDir = path.join(outputDir, resultDir ?? '', 'run-1'); const outputsDir = path.join(runDir, 'outputs'); expect(readdirSync(runDir)).not.toContain('provider.log'); expect(readdirSync(runDir)).toContain('transcript-raw.jsonl'); @@ -761,16 +777,11 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(readdirSync(outputsDir)).not.toContain('transcript.jsonl'); expect(readdirSync(outputsDir)).not.toContain('transcript.json'); - const indexRows = readFileSync(path.join(outputDir, 'index.jsonl'), 'utf8') - .trim() - .split('\n') - .map((line) => JSON.parse(line) as Record); expect(indexRows[0]?.raw_provider_log_path).toBeUndefined(); expect(indexRows[0]?.trace_path).toBeUndefined(); - expect(indexRows[0]?.transcript_path).toBe('test-dataset/case-1/run-1/transcript.jsonl'); - expect(indexRows[0]?.transcript_raw_path).toBe( - 'test-dataset/case-1/run-1/transcript-raw.jsonl', - ); + expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/run-1/transcript.jsonl`); + expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/run-1/transcript-raw.jsonl`); + expect(existsSync(rawLogPath)).toBe(false); }); it('reports failed progress status for batch item errors', async () => { diff --git a/packages/core/test/evaluation/yaml-parser-metadata.test.ts b/packages/core/test/evaluation/yaml-parser-metadata.test.ts index e9653f61d..772aeed95 100644 --- a/packages/core/test/evaluation/yaml-parser-metadata.test.ts +++ b/packages/core/test/evaluation/yaml-parser-metadata.test.ts @@ -74,6 +74,19 @@ tests: expect(suite.metadata).toBeUndefined(); }); + it('uses explicit YAML category as a canonical taxonomy path override', async () => { + const { filePath, dir } = createTempYaml(` +category: " security / network " +tests: + - id: test-1 + input: "Hello" + criteria: "Greet" +`); + + const suite = await loadTestSuite(filePath, dir, { category: 'derived/path' }); + expect(suite.tests[0].category).toBe('security/network'); + }); + it('still loads tests correctly when metadata is present', async () => { const { filePath, dir } = createTempYaml(` name: my-eval