diff --git a/CONCEPTS.md b/CONCEPTS.md
index bbeeefdff..92c60acdd 100644
--- a/CONCEPTS.md
+++ b/CONCEPTS.md
@@ -12,9 +12,17 @@ Shared domain vocabulary for this project — entities, named processes, and sta
## Evaluation Model
-**Eval** — The frozen task and grading definition: prompts, datasets, input files, fixtures, assertions, and judge criteria. An eval defines what is being tested, not which agent, model, setup variant, or run policy executes it.
+**Eval / Eval YAML** — The only composable and runnable AgentV authoring primitive. An eval YAML file can be a reusable task suite that owns task context, a wrapper eval that imports suites and carries an inline `experiment:` block, or a sidecar around raw JSONL cases. AgentV does not have a separate runnable `experiment.yaml` artifact.
-**Experiment** — A committed run variant that selects how evals are executed: target or target matrix, setup, scripts, eval filters, repeat counts, timeouts, workers, budgets, and related run knobs. Experiments make A/B setup differences explicit while pointing at stable eval tasks.
+**Task suite** — Eval YAML that owns what is being tested: prompts, datasets, input files, fixtures, `workspace`, assertions, expected references, and judge criteria. It can run directly or be imported by another eval with `tests[].include` and `type: suite`.
+
+**Raw case file** — YAML, JSONL, or directory case data imported with `tests: ./cases.yaml`, string shorthand, or `type: tests`. Raw cases are reusable data inputs; they do not carry imported suite context such as shared `workspace`, shared `input`, or shared `assertions`.
+
+**Wrapper eval** — Eval YAML whose main job is to import task suites and bind runtime policy with an inline `experiment:` block. Wrapper evals may live under an `experiments/` directory, but that path is an optional user-owned convention and AgentV does not infer behavior from it. A wrapper that imports suites with `type: suite` does not define parent workspace fields such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`; imported suites own task environment.
+
+**Experiment** — The run-policy namespace for how evals are executed: target or target matrix, eval filters, repeat counts, timeouts, workers, budgets, thresholds, and related run knobs. In authored files it lives as inline `experiment:` inside eval YAML; CLI `--experiment` and `experiment.name` choose the result bucket. Lifecycle setup belongs in `workspace.hooks` or `targets[].hooks`, not in a separate experiment artifact.
+
+**Workspace** — The task environment an eval prepares for the agent: repositories, templates, fixture files, and lifecycle hooks. It is not prompt input; use `input` for instructions and `workspace.repos[]` for multi-repo workspaces the agent can inspect or modify through tools.
**Run manifest** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `task_dir`, `summary_path`, and `grading_path`.
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index b9e8b4627..6d230755c 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -12,6 +12,7 @@ import {
RESULT_INDEX_FILENAME,
RUN_SUMMARY_FILENAME,
type ResultIndexArtifact,
+ type RunRuntimeSourceMetadata,
type RunSummaryArtifact,
type TimingArtifact,
aggregateRunDir,
@@ -215,6 +216,7 @@ export async function writePerTestArtifacts(
repoRoot?: string;
sourceTests?: readonly EvalTest[];
taskBundleTargets?: readonly TaskBundleTargetSelection[];
+ runtimeSource?: RunRuntimeSourceMetadata;
},
): Promise {
await writeCorePerTestArtifacts(results, outputDir, {
@@ -224,6 +226,7 @@ export async function writePerTestArtifacts(
duplicatePolicy: options?.duplicatePolicy,
sourceTests: options?.sourceTests,
additionalArtifacts: createTaskBundleArtifactsWriter(options),
+ runtimeSource: options?.runtimeSource,
});
}
@@ -242,6 +245,7 @@ export async function writeArtifactsFromResults(
repoRoot?: string;
sourceTests?: readonly EvalTest[];
taskBundleTargets?: readonly TaskBundleTargetSelection[];
+ runtimeSource?: RunRuntimeSourceMetadata;
},
): Promise<{
testArtifactDir: string;
@@ -258,5 +262,6 @@ export async function writeArtifactsFromResults(
resultGroup: options?.resultGroup,
sourceTests: options?.sourceTests,
additionalArtifacts: createTaskBundleArtifactsWriter(options),
+ runtimeSource: options?.runtimeSource,
});
}
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index 182a3a3ed..6e456d599 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -19,6 +19,7 @@ import {
type ResolvedTarget,
ResponseCache,
RunBudgetTracker,
+ type RunRuntimeSourceMetadata,
type TrialsConfig,
buildExperimentArtifactMetadata,
buildTraceFromMessages,
@@ -589,6 +590,163 @@ function deriveEvalResultGroupName(evalFilePath: string | undefined): string {
);
}
+/**
+ * CLI option keys that influence how a run executes. When any of them carries
+ * a meaningful value the run is treated as configured (at least partly) from
+ * command-line flags.
+ */
+const CLI_RUNTIME_SOURCE_OPTION_KEYS = [
+  'target',
+  'targets',
+  'filter',
+  'tag',
+  'excludeTag',
+  'workers',
+  'dryRun',
+  'dryRunDelay',
+  'dryRunDelayMin',
+  'dryRunDelayMax',
+  'agentTimeout',
+  'maxRetries',
+  'cache',
+  'cachePath',
+  'noCache',
+  'graderTarget',
+  'model',
+  'threshold',
+  'budgetUsd',
+  'transcript',
+  'recordReplay',
+  'recordReplayVariant',
+  'workspacePath',
+  'workspaceMode',
+] as const;
+
+/**
+ * Reports whether any runtime-affecting CLI option carries a meaningful value.
+ *
+ * A value counts as "set" when it is:
+ * - an array containing at least one non-blank string,
+ * - a non-blank string other than the literal `default`,
+ * - a finite, non-zero number, or
+ * - the boolean `true`.
+ *
+ * Everything else (missing keys, `false`, `0`, blank strings, objects) is ignored.
+ *
+ * @param rawOptions - Option bag keyed by camelCase option name; values are
+ *   narrowed at runtime, so they are typed as `unknown`.
+ * @returns `true` when at least one listed option is set as described above.
+ */
+function hasCliRuntimeSource(rawOptions: Record<string, unknown>): boolean {
+  return CLI_RUNTIME_SOURCE_OPTION_KEYS.some((key) => {
+    const value = rawOptions[key];
+    if (Array.isArray(value)) {
+      return value.some((entry) => typeof entry === 'string' && entry.trim().length > 0);
+    }
+    if (typeof value === 'string') {
+      const trimmed = value.trim();
+      return trimmed.length > 0 && trimmed !== 'default';
+    }
+    if (typeof value === 'number') {
+      return Number.isFinite(value) && value !== 0;
+    }
+    return value === true;
+  });
+}
+
+/**
+ * Converts an eval file path into the display form stored in runtime-source metadata.
+ *
+ * Paths located inside `cwd` are rendered relative to it. Anything else,
+ * including a path equal to `cwd` itself, keeps the trimmed input unchanged.
+ * Platform separators are normalised to `/`.
+ *
+ * @param cwd - Directory that relative inputs are resolved against.
+ * @param filePath - Absolute or relative path; may be undefined or blank.
+ * @returns The display path, or `undefined` when no usable path was given.
+ */
+function toRuntimeSourcePath(cwd: string, filePath: string | undefined): string | undefined {
+  const trimmed = filePath?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  const resolved = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed);
+  const relative = path.relative(cwd, resolved);
+  // Only a leading `..` *segment* means the path escapes cwd. A bare prefix test
+  // would also reject in-cwd entries such as `..cache/smoke.eval.yaml`.
+  const escapesCwd = relative === '..' || relative.startsWith(`..${path.sep}`);
+  const displayPath = relative && !escapesCwd && !path.isAbsolute(relative) ? relative : trimmed;
+  return displayPath.split(path.sep).join('/');
+}
+
+/**
+ * Drops empty/undefined entries, removes duplicates, and returns the remaining
+ * paths in default (code-unit) sort order.
+ *
+ * @param values - Candidate paths; undefined or empty entries are ignored.
+ * @returns A new sorted array of distinct, non-empty paths.
+ */
+function uniqueRuntimeSourcePaths(values: Iterable<string | undefined>): readonly string[] {
+  const present = [...values].filter((value): value is string => Boolean(value));
+  return [...new Set(present)].sort();
+}
+
+/**
+ * Picks the display path of the eval file a test case came from.
+ *
+ * Candidates are tried in order: repo-relative path, absolute path, then the
+ * plain eval file path. The first one that yields a usable display path wins.
+ *
+ * @param cwd - Directory used to relativise the chosen path.
+ * @param test - Test case whose `source` block is inspected.
+ * @returns The display path, or `undefined` when no candidate is usable.
+ */
+function testSourceEvalPath(cwd: string, test: EvalTest): string | undefined {
+  const origin = test.source;
+  const candidates = [
+    origin?.evalFileRepoPath,
+    origin?.evalFileAbsolutePath,
+    origin?.evalFilePath,
+  ];
+  for (const candidate of candidates) {
+    const displayPath = toRuntimeSourcePath(cwd, candidate);
+    if (displayPath !== undefined) {
+      return displayPath;
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Returns the absolute form of a test's source eval file, for equality checks
+ * against other resolved paths.
+ *
+ * @param test - Test case whose `source` block is inspected.
+ * @returns The resolved path, or `undefined` when the test records no source file.
+ */
+function testSourceEvalPathForComparison(test: EvalTest): string | undefined {
+  const recordedPath = test.source?.evalFileAbsolutePath ?? test.source?.evalFilePath;
+  if (!recordedPath) {
+    return undefined;
+  }
+  return path.resolve(recordedPath);
+}
+
+/**
+ * Classifies where the run's runtime configuration came from.
+ *
+ * - `mixed`: inline experiment metadata is combined with CLI flags or with
+ *   files that have none, or the files carry more than one distinct
+ *   experiment fingerprint.
+ * - `cli_flags`: no inline experiment metadata, but CLI runtime flags were set.
+ * - `inline_experiment`: every active file shares one inline experiment.
+ * - `defaults`: none of the above.
+ *
+ * A file without a fingerprint is identified by its own path, so two such
+ * files count as distinct experiments.
+ */
+function buildRuntimeConfigSource(params: {
+  readonly activeTestFiles: readonly string[];
+  readonly fileMetadata: ReadonlyMap;
+  readonly hasCliRuntimeConfig: boolean;
+}): RunRuntimeSourceMetadata['config_source'] {
+  const distinctExperiments = new Set();
+  let sawFileWithoutExperiment = false;
+
+  for (const file of params.activeTestFiles) {
+    const inlineExperiment = params.fileMetadata.get(file)?.options.experimentMetadata;
+    if (!inlineExperiment) {
+      sawFileWithoutExperiment = true;
+      continue;
+    }
+    distinctExperiments.add(inlineExperiment.fingerprint ?? file);
+  }
+
+  // Every file with inline metadata contributes an entry, so a non-empty set
+  // is equivalent to "at least one inline experiment was seen".
+  const sawInlineExperiment = distinctExperiments.size > 0;
+  const isMixed =
+    distinctExperiments.size > 1 ||
+    (sawInlineExperiment && (params.hasCliRuntimeConfig || sawFileWithoutExperiment));
+
+  if (isMixed) {
+    return 'mixed';
+  }
+  if (params.hasCliRuntimeConfig) {
+    return 'cli_flags';
+  }
+  return sawInlineExperiment ? 'inline_experiment' : 'defaults';
+}
+
+/**
+ * Assembles the `agentv.runtime_source.v1` record for a run.
+ *
+ * `kind` is decided as follows:
+ * - `multi_eval` when more than one eval file is active;
+ * - `wrapper_eval` when any test was imported from a suite, or originates from
+ *   a file that is not one of the active eval files;
+ * - `direct_suite` otherwise.
+ *
+ * `wrapper_eval_file` is only emitted for wrapper runs, and `source_eval_files`
+ * only when at least one test records a source file.
+ */
+function buildRuntimeSourceMetadata(params: {
+  readonly cwd: string;
+  readonly activeTestFiles: readonly string[];
+  readonly sourceTests: readonly EvalTest[];
+  readonly fileMetadata: ReadonlyMap;
+  readonly experimentNamespace: string;
+  readonly experimentNamespaceSource: RunRuntimeSourceMetadata['experiment_namespace_source'];
+  readonly hasCliRuntimeConfig: boolean;
+}): RunRuntimeSourceMetadata {
+  const { cwd, activeTestFiles, sourceTests } = params;
+
+  const resolvedActiveFiles = new Set(activeTestFiles.map((file) => path.resolve(file)));
+  const importsSuite = sourceTests.some((test) => test.source?.importedSuiteName);
+  const referencesOtherFile = sourceTests.some((test) => {
+    const resolvedSource = testSourceEvalPathForComparison(test);
+    return resolvedSource !== undefined && !resolvedActiveFiles.has(resolvedSource);
+  });
+
+  let kind: RunRuntimeSourceMetadata['kind'];
+  if (activeTestFiles.length > 1) {
+    kind = 'multi_eval';
+  } else if (importsSuite || referencesOtherFile) {
+    kind = 'wrapper_eval';
+  } else {
+    kind = 'direct_suite';
+  }
+
+  const wrapperEvalFile =
+    kind === 'wrapper_eval' ? toRuntimeSourcePath(cwd, activeTestFiles[0]) : undefined;
+  const evalFiles = uniqueRuntimeSourcePaths(
+    activeTestFiles.map((file) => toRuntimeSourcePath(cwd, file)),
+  );
+  const sourceEvalFiles = uniqueRuntimeSourcePaths(
+    sourceTests.map((test) => testSourceEvalPath(cwd, test)),
+  );
+  const configSource = buildRuntimeConfigSource({
+    activeTestFiles,
+    fileMetadata: params.fileMetadata,
+    hasCliRuntimeConfig: params.hasCliRuntimeConfig,
+  });
+
+  return {
+    schema_version: 'agentv.runtime_source.v1',
+    kind,
+    config_source: configSource,
+    experiment_namespace: params.experimentNamespace,
+    experiment_namespace_source: params.experimentNamespaceSource,
+    eval_files: evalFiles,
+    ...(wrapperEvalFile && { wrapper_eval_file: wrapperEvalFile }),
+    ...(sourceEvalFiles.length > 0 && { source_eval_files: sourceEvalFiles }),
+  };
+}
+
type ResolvedExperimentForRun = {
readonly name?: string;
};
@@ -1453,10 +1611,19 @@ export async function runEvalCommand(
resolvedTestFiles.length === 1
? (primarySuite?.metadata?.name ?? fallbackResultGroupName)
: fallbackResultGroupName;
+ const experimentNamespaceSource: RunRuntimeSourceMetadata['experiment_namespace_source'] =
+ resolvedExperiment.name
+ ? 'cli'
+ : resolvedTestFiles.length > 1
+ ? 'multi_eval'
+ : primarySuite?.metadata?.name
+ ? 'eval_metadata'
+ : 'eval_filename';
options = {
...options,
experiment: resolvedExperiment.name ?? resultGroupName,
};
+ const hasCliRuntimeConfig = hasCliRuntimeSource(input.rawOptions);
if (!process.env.AGENTV_EXPERIMENT) {
process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
@@ -1885,9 +2052,21 @@ export async function runEvalCommand(
// Use only files that survived tag filtering.
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
+ const activeSourceTests = activeTestFiles.flatMap(
+ (activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],
+ );
const singleActiveFileMetadata =
activeTestFiles.length === 1 ? fileMetadata.get(activeTestFiles[0]) : undefined;
const runExperimentMetadata = singleActiveFileMetadata?.options.experimentMetadata;
+ const runtimeSourceMetadata = buildRuntimeSourceMetadata({
+ cwd,
+ activeTestFiles,
+ sourceTests: activeSourceTests,
+ fileMetadata,
+ experimentNamespace: normalizeExperimentName(options.experiment),
+ experimentNamespaceSource,
+ hasCliRuntimeConfig,
+ });
const hasPerFileRuntimeThresholds =
options.cliThreshold === undefined &&
activeTestFiles.some(
@@ -1932,6 +2111,7 @@ export async function runEvalCommand(
plannedTestCount: totalEvalCount,
experiment: normalizeExperimentName(options.experiment),
experimentMetadata: runExperimentMetadata,
+ runtimeSource: runtimeSourceMetadata,
});
}
@@ -2194,9 +2374,7 @@ export async function runEvalCommand(
// Write artifacts to the run directory (always, not conditional on flags)
if (allResults.length > 0) {
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
- const sourceTests = activeTestFiles.flatMap(
- (activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],
- );
+ const sourceTests = activeSourceTests;
const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata);
if (isResumeAppend) {
// Resume mode: write per-test artifacts for newly-run tests, then aggregate
@@ -2209,11 +2387,13 @@ export async function runEvalCommand(
repoRoot,
sourceTests,
taskBundleTargets,
+ runtimeSource: runtimeSourceMetadata,
});
const { summaryPath } = await aggregateRunDir(runDir, {
evalFile,
experiment: normalizeExperimentName(options.experiment),
experimentMetadata: runExperimentMetadata,
+ runtimeSource: runtimeSourceMetadata,
});
const indexPath = path.join(runDir, 'index.jsonl');
console.log(`Artifact workspace updated: ${runDir}`);
@@ -2233,6 +2413,7 @@ export async function runEvalCommand(
repoRoot,
sourceTests,
taskBundleTargets,
+ runtimeSource: runtimeSourceMetadata,
},
);
console.log(`Artifact workspace written to: ${runDir}`);
diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
index dcbc980c7..0925a671c 100644
--- a/apps/cli/src/commands/results/manifest.ts
+++ b/apps/cli/src/commands/results/manifest.ts
@@ -5,6 +5,7 @@ import {
type EvaluationResult,
type ExternalTraceMetadataWire,
type ResultArtifactPointersWire,
+ type RunRuntimeSourceMetadata,
type TraceSummary,
buildTraceFromMessages,
fromTraceEnvelopeWire,
@@ -60,6 +61,7 @@ export interface ResultManifestRecord {
readonly metrics_path?: string;
readonly raw_provider_log_path?: string;
readonly artifact_pointers?: ResultArtifactPointersWire;
+ readonly runtime_source?: RunRuntimeSourceMetadata;
readonly external_trace?: ExternalTraceMetadataWire;
readonly response_path?: string;
readonly result_dir?: string;
@@ -304,6 +306,7 @@ export function loadManifestResults(
export interface LightweightResultRecord {
readonly testId: string;
+ readonly evalPath?: string;
readonly suite?: string;
readonly category?: string;
readonly target?: string;
@@ -314,6 +317,7 @@ export interface LightweightResultRecord {
readonly error?: string;
readonly costUsd?: number;
readonly timestamp?: string;
+ readonly runtimeSource?: RunRuntimeSourceMetadata;
}
export function loadLightweightResults(sourceFile: string): LightweightResultRecord[] {
@@ -321,6 +325,7 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec
const content = readFileSync(resolvedSourceFile, 'utf8');
return parseResultManifest(content).map((record) => ({
testId: record.test_id ?? 'unknown',
+ evalPath: record.eval_path,
suite: record.suite,
category: record.category,
target: record.target,
@@ -331,5 +336,6 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec
error: record.error,
costUsd: record.cost_usd,
timestamp: record.timestamp,
+ runtimeSource: record.runtime_source,
}));
}
diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts
index 113523dfd..cf3dc95ef 100644
--- a/apps/cli/src/commands/results/report.ts
+++ b/apps/cli/src/commands/results/report.ts
@@ -3,20 +3,25 @@ import path from 'node:path';
import { command, option, optional, string } from 'cmd-ts';
-import type { EvaluationResult } from '@agentv/core';
+import type { EvaluationResult, RunRuntimeSourceMetadata } from '@agentv/core';
import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js';
import { RESULTS_REPORT_TEMPLATE } from './report-template.js';
import { resolveSourceFile, sourceArg } from './shared.js';
+const DEFAULT_REPORT_SUBTITLE = // Fallback header subtitle used when rows carry no experiment or runtime-source context.
+  'Dashboard-themed HTML generated from an existing AgentV results workspace.'; // renderResultsReport replaces this exact text in the template; presumably the template embeds it verbatim — verify before rewording.
+
interface ReportManifestRecord {
readonly eval_file?: string;
+ readonly experiment?: string;
+ readonly runtime_source?: RunRuntimeSourceMetadata;
}
-interface RunSummaryMetadata {
- readonly metadata?: {
- readonly eval_file?: string;
- };
+interface RunSummaryReportMetadata { // Fields lifted from summary.json `metadata` for report rendering.
+  readonly evalFile?: string; // metadata.eval_file after normalizeEvalFileLabel
+  readonly experiment?: string; // metadata.experiment; omitted when falsy
+  readonly runtimeSource?: RunRuntimeSourceMetadata; // metadata.runtime_source; passed through without shape validation
+}
function normalizeEvalFileLabel(value: string | undefined): string | undefined {
@@ -34,16 +39,30 @@ function normalizeEvalFileLabel(value: string | undefined): string | undefined {
}
function readSummaryEvalFile(sourceFile: string): string | undefined {
+ return readSummaryReportMetadata(sourceFile).evalFile;
+}
+
+function readSummaryReportMetadata(sourceFile: string): RunSummaryReportMetadata {
const summaryPath = path.join(path.dirname(sourceFile), 'summary.json');
if (!existsSync(summaryPath)) {
- return undefined;
+ return {};
}
try {
- const summary = JSON.parse(readFileSync(summaryPath, 'utf8')) as RunSummaryMetadata;
- return normalizeEvalFileLabel(summary.metadata?.eval_file);
+ const summary = JSON.parse(readFileSync(summaryPath, 'utf8')) as {
+ metadata?: {
+ eval_file?: string;
+ experiment?: string;
+ runtime_source?: RunRuntimeSourceMetadata;
+ };
+ };
+ return {
+ evalFile: normalizeEvalFileLabel(summary.metadata?.eval_file),
+ ...(summary.metadata?.experiment && { experiment: summary.metadata.experiment }),
+ ...(summary.metadata?.runtime_source && { runtimeSource: summary.metadata.runtime_source }),
+ };
} catch {
- return undefined;
+ return {};
}
}
@@ -55,11 +74,18 @@ function serializeReportResult(
result: EvaluationResult,
sourceFile: string,
manifestRecord?: ReportManifestRecord,
- summaryEvalFile?: string,
+ summaryMetadata?: RunSummaryReportMetadata,
): Record {
+ const runtimeSource = manifestRecord?.runtime_source ?? summaryMetadata?.runtimeSource;
+ const resultExperiment = (result as EvaluationResult & { experiment?: string }).experiment;
+ const experimentNamespace =
+ runtimeSource?.experiment_namespace ??
+ manifestRecord?.experiment ??
+ summaryMetadata?.experiment ??
+ resultExperiment;
const fallbackEvalFile =
normalizeEvalFileLabel(manifestRecord?.eval_file) ??
- summaryEvalFile ??
+ summaryMetadata?.evalFile ??
normalizeEvalFileLabel(result.suite) ??
path.basename(path.dirname(sourceFile));
@@ -79,10 +105,122 @@ function serializeReportResult(
input: result.input,
output: result.output,
assertions: result.assertions,
+ experiment: experimentNamespace,
+ experiment_namespace: experimentNamespace,
+ runtime_source: runtimeSource,
+ runtime_source_label: formatRuntimeSourceLabel(runtimeSource),
+ runtime_config_source_label: formatRuntimeConfigSourceLabel(runtimeSource?.config_source),
eval_file: fallbackEvalFile,
};
}
+/**
+ * Maps a runtime-source kind to its human-readable label.
+ *
+ * @param kind - Kind recorded in runtime-source metadata, if any.
+ * @returns The label, or `Unknown source` for a missing or unrecognised kind.
+ */
+function formatRuntimeKindLabel(kind: RunRuntimeSourceMetadata['kind'] | undefined): string {
+  if (kind === 'direct_suite') {
+    return 'Direct suite';
+  }
+  if (kind === 'wrapper_eval') {
+    return 'Wrapper eval';
+  }
+  if (kind === 'multi_eval') {
+    return 'Multi-eval';
+  }
+  return 'Unknown source';
+}
+
+/**
+ * Maps a runtime config source to its human-readable label.
+ *
+ * @param source - Config source recorded in runtime-source metadata, if any.
+ * @returns The label, or an empty string for a missing or unrecognised source.
+ */
+function formatRuntimeConfigSourceLabel(
+  source: RunRuntimeSourceMetadata['config_source'] | undefined,
+): string {
+  if (source === 'inline_experiment') {
+    return 'Inline experiment config';
+  }
+  if (source === 'cli_flags') {
+    return 'CLI runtime flags';
+  }
+  if (source === 'mixed') {
+    return 'Mixed runtime config';
+  }
+  return source === 'defaults' ? 'Default runtime config' : '';
+}
+
+/**
+ * Maps an experiment-namespace source to its human-readable label.
+ *
+ * @param source - Namespace source recorded in runtime-source metadata, if any.
+ * @returns The label, or an empty string for a missing or unrecognised source.
+ */
+function formatNamespaceSourceLabel(
+  source: RunRuntimeSourceMetadata['experiment_namespace_source'] | undefined,
+): string {
+  if (source === 'cli') {
+    return 'CLI namespace';
+  }
+  if (source === 'eval_metadata') {
+    return 'Eval metadata namespace';
+  }
+  if (source === 'eval_filename') {
+    return 'Eval filename namespace';
+  }
+  if (source === 'multi_eval') {
+    return 'Multi-eval namespace';
+  }
+  return '';
+}
+
+/**
+ * Builds the one-line runtime-source summary: kind, namespace source, and
+ * config source, joined with ` · `. Parts whose label is empty are left out.
+ *
+ * @param runtimeSource - Runtime-source metadata for the row, if available.
+ * @returns The combined label, or an empty string when no metadata is given.
+ */
+function formatRuntimeSourceLabel(runtimeSource: RunRuntimeSourceMetadata | undefined): string {
+  if (!runtimeSource) {
+    return '';
+  }
+  const kindLabel = formatRuntimeKindLabel(runtimeSource.kind);
+  const namespaceLabel = formatNamespaceSourceLabel(runtimeSource.experiment_namespace_source);
+  const configLabel = formatRuntimeConfigSourceLabel(runtimeSource.config_source);
+
+  const segments: string[] = [];
+  for (const label of [kindLabel, namespaceLabel, configLabel]) {
+    if (label) {
+      segments.push(label);
+    }
+  }
+  return segments.join(' · ');
+}
+
+/**
+ * Trims each value, discards blanks and undefined entries, and returns the
+ * distinct results in default sort order.
+ *
+ * @param values - Candidate strings.
+ * @returns A new sorted array of distinct, trimmed, non-empty strings.
+ */
+function uniqueStrings(values: readonly (string | undefined)[]): string[] {
+  const distinct = new Set<string>();
+  for (const raw of values) {
+    const trimmed = raw?.trim();
+    if (trimmed) {
+      distinct.add(trimmed);
+    }
+  }
+  return Array.from(distinct).sort();
+}
+
+/**
+ * Escapes text for safe insertion into HTML element content or a quoted
+ * attribute value.
+ *
+ * `&` is replaced first so that the entities produced by the later
+ * replacements are not escaped a second time.
+ *
+ * @param value - Untrusted text, such as an experiment name.
+ * @returns The text with `&`, `<`, `>`, `"` and `'` replaced by entities.
+ */
+function escapeHtml(value: string): string {
+  return value
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;')
+    .replace(/'/g, '&#39;');
+}
+
+/**
+ * Builds the report header subtitle from the serialized rows.
+ *
+ * Distinct experiment namespaces and distinct runtime-source labels are each
+ * listed (singular or plural wording depending on the count) and joined with
+ * ` · `. When the rows provide neither, the default subtitle is returned.
+ *
+ * @param rows - Serialized report rows; only string-valued
+ *   `experiment_namespace`, `experiment` and `runtime_source_label` are read.
+ * @returns Plain (unescaped) subtitle text.
+ */
+function formatReportHeaderContext(rows: readonly Record<string, unknown>[]): string {
+  const experiments = uniqueStrings(
+    rows.map((row) =>
+      typeof row.experiment_namespace === 'string'
+        ? row.experiment_namespace
+        : typeof row.experiment === 'string'
+          ? row.experiment
+          : undefined,
+    ),
+  );
+  const runtimeSources = uniqueStrings(
+    rows.map((row) =>
+      typeof row.runtime_source_label === 'string' ? row.runtime_source_label : undefined,
+    ),
+  );
+  const parts = [
+    experiments.length === 1
+      ? `Experiment namespace: ${experiments[0]}`
+      : experiments.length > 1
+        ? `Experiment namespaces: ${experiments.join(', ')}`
+        : undefined,
+    runtimeSources.length === 1
+      ? `Runtime source: ${runtimeSources[0]}`
+      : runtimeSources.length > 1
+        ? `Runtime sources: ${runtimeSources.join(', ')}`
+        : undefined,
+  ].filter((part): part is string => Boolean(part));
+
+  return parts.length > 0 ? parts.join(' · ') : DEFAULT_REPORT_SUBTITLE;
+}
+
export async function loadReportSource(
source: string | undefined,
cwd: string,
@@ -91,6 +229,7 @@ export async function loadReportSource(
results: EvaluationResult[];
records: readonly ReportManifestRecord[];
summaryEvalFile?: string;
+ summaryMetadata?: RunSummaryReportMetadata;
}> {
const { sourceFile } = await resolveSourceFile(source, cwd);
const resolvedSourceFile = resolveResultSourcePath(sourceFile, cwd);
@@ -107,6 +246,7 @@ export async function loadReportSource(
results,
records,
summaryEvalFile: readSummaryEvalFile(resolvedSourceFile),
+ summaryMetadata: readSummaryReportMetadata(resolvedSourceFile),
};
}
@@ -114,17 +254,20 @@ export function renderResultsReport(
results: readonly EvaluationResult[],
sourceFile: string,
records: readonly ReportManifestRecord[],
- summaryEvalFile?: string,
+ summaryMetadata?: RunSummaryReportMetadata,
): string {
if (!RESULTS_REPORT_TEMPLATE.includes('__DATA_PLACEHOLDER__')) {
throw new Error('Report template is missing __DATA_PLACEHOLDER__');
}
const rows = results.map((result, index) =>
- serializeReportResult(result, sourceFile, records[index], summaryEvalFile),
+ serializeReportResult(result, sourceFile, records[index], summaryMetadata),
);
const dataJson = JSON.stringify(rows).replace(/<\//g, '<\\/');
- return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', () => dataJson);
+ return RESULTS_REPORT_TEMPLATE.replace('__DATA_PLACEHOLDER__', () => dataJson).replace(
+ DEFAULT_REPORT_SUBTITLE,
+ escapeHtml(formatReportHeaderContext(rows)),
+ );
}
export async function writeResultsReport(
@@ -132,13 +275,13 @@ export async function writeResultsReport(
outputPath: string | undefined,
cwd: string,
): Promise<{ sourceFile: string; outputPath: string; html: string }> {
- const { sourceFile, results, records, summaryEvalFile } = await loadReportSource(source, cwd);
+ const { sourceFile, results, records, summaryMetadata } = await loadReportSource(source, cwd);
const resolvedOutputPath = outputPath
? path.isAbsolute(outputPath)
? outputPath
: path.resolve(cwd, outputPath)
: deriveReportPath(sourceFile);
- const html = renderResultsReport(results, sourceFile, records, summaryEvalFile);
+ const html = renderResultsReport(results, sourceFile, records, summaryMetadata);
mkdirSync(path.dirname(resolvedOutputPath), { recursive: true });
writeFileSync(resolvedOutputPath, html, 'utf8');
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 014427332..56852e60f 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -56,6 +56,7 @@ import {
type EvaluationResult,
type ExternalTraceMetadata,
type ExternalTraceMetadataWire,
+ type RunRuntimeSourceMetadata,
type TraceSessionResponse,
addProject,
externalTraceMetadataFromRecord,
@@ -1559,6 +1560,8 @@ async function handleRuns(c: C, { searchDir, agentvDir, projectId }: DataContext
metas.map(async (m) => {
let target: string | undefined;
let experiment = inferExperimentFromRunId(m.raw_filename);
+ const summaryMetadata = readRunSummaryMetadataForDashboard(m.path);
+ let runtimeSource: RunRuntimeSourceMetadata | undefined = summaryMetadata.runtimeSource;
let timestamp = m.timestamp;
let testCount = m.testCount;
let passRate = m.passRate;
@@ -1578,10 +1581,20 @@ async function handleRuns(c: C, { searchDir, agentvDir, projectId }: DataContext
passRate = qualitySummary.passRate;
avgScore = qualitySummary.avgScore;
executionErrorCount = qualitySummary.executionErrorCount;
+ runtimeSource = deriveDashboardRuntimeSource({
+ summaryMetadata,
+ records,
+ inferredExperiment: experiment,
+ });
} else {
// Run is in-progress with 0 results written yet — fall back to the
// in-memory target stored when the Dashboard launched this run.
target = getActiveRunTarget(m.path);
+ runtimeSource = deriveDashboardRuntimeSource({
+ summaryMetadata,
+ records: [],
+ inferredExperiment: experiment,
+ });
}
} catch {
// ignore enrichment errors
@@ -1605,6 +1618,7 @@ async function handleRuns(c: C, { searchDir, agentvDir, projectId }: DataContext
on_remote: m.on_remote,
...(target && { target }),
...(experiment && { experiment }),
+ ...(runtimeSource && { runtime_source: runtimeSource }),
...tagFields,
...(liveStatus && { status: liveStatus }),
};
@@ -1644,10 +1658,17 @@ async function handleRunDetail(c: C, { searchDir, projectId }: DataContext) {
try {
const loaded = await loadManifestResultsForMeta(searchDir, meta, projectId);
const records = await parseManifestForMeta(searchDir, meta, projectId);
+ const summaryMetadata = readRunSummaryMetadataForDashboard(meta.path);
+ const runtimeSource = deriveDashboardRuntimeSource({
+ summaryMetadata,
+ records,
+ inferredExperiment: records[0]?.experiment,
+ });
// Surface run_dir + suite_filter for local runs so the UI can launch a
// Dashboard-side resume against this exact run. Remote runs live in the
// results-repo cache and cannot be resumed in place, so omit both fields.
- const resumeMeta = meta.source === 'local' ? deriveResumeMeta(searchDir, meta.path) : {};
+ const resumeMeta =
+ meta.source === 'local' ? deriveResumeMeta(searchDir, meta.path, summaryMetadata) : {};
const liveStatus = meta.source === 'local' ? getActiveRunStatus(meta.path) : undefined;
const tagFields = await readRunTagFields(searchDir, meta, projectId);
const baseDir = path.dirname(meta.path);
@@ -1658,6 +1679,7 @@ async function handleRunDetail(c: C, { searchDir, projectId }: DataContext) {
),
source: meta.source,
source_label: meta.displayName,
+ ...(runtimeSource && { runtime_source: runtimeSource }),
...tagFields,
...(liveStatus && { status: liveStatus }),
...resumeMeta,
@@ -1707,9 +1729,99 @@ function attachExternalTraceFields>(
* Returns whatever fields could be resolved — both are best-effort and only
* needed by the Dashboard "Resume run" / "Rerun failed" actions.
*/
+/**
+ * Subset of a run's `summary.json` metadata that the dashboard consumes.
+ * Every field is optional because the file may be missing, malformed, or
+ * lack the field.
+ */
+interface RunSummaryMetadataForDashboard {
+  readonly evalFile?: string;
+  readonly experiment?: string;
+  readonly plannedTestCount?: number;
+  readonly runtimeSource?: RunRuntimeSourceMetadata;
+}
+
+/**
+ * Reads `summary.json` from the directory containing the run manifest and
+ * extracts the fields the dashboard needs.
+ *
+ * This is best-effort: a missing, unreadable, or malformed file yields `{}`
+ * rather than an error. Individual fields are only included when they have
+ * the expected primitive type and a usable value (non-blank string, finite
+ * positive number, object-shaped runtime source).
+ *
+ * @param manifestPath - Path to the run manifest; only its directory is used.
+ * @returns The fields that could be read, possibly none.
+ */
+function readRunSummaryMetadataForDashboard(manifestPath: string): RunSummaryMetadataForDashboard {
+  try {
+    const summaryPath = path.join(path.dirname(manifestPath), 'summary.json');
+    if (!existsSync(summaryPath)) {
+      return {};
+    }
+    const parsed = JSON.parse(readFileSync(summaryPath, 'utf8')) as {
+      metadata?: {
+        eval_file?: string;
+        experiment?: string;
+        planned_test_count?: number;
+        runtime_source?: RunRuntimeSourceMetadata;
+      };
+    };
+    const planned = parsed.metadata?.planned_test_count;
+    const runtimeSource = parsed.metadata?.runtime_source;
+    // The cast above is unchecked, so only pass runtime_source through when it
+    // is at least object-shaped; a stray string or number is not metadata.
+    const hasRuntimeSource =
+      typeof runtimeSource === 'object' && runtimeSource !== null && !Array.isArray(runtimeSource);
+    return {
+      ...(typeof parsed.metadata?.eval_file === 'string' &&
+        parsed.metadata.eval_file.trim() && { evalFile: parsed.metadata.eval_file.trim() }),
+      ...(typeof parsed.metadata?.experiment === 'string' &&
+        parsed.metadata.experiment.trim() && { experiment: parsed.metadata.experiment.trim() }),
+      ...(typeof planned === 'number' &&
+        Number.isFinite(planned) &&
+        planned > 0 && { plannedTestCount: planned }),
+      ...(hasRuntimeSource && { runtimeSource }),
+    };
+  } catch {
+    // Deliberately best-effort: callers fall back to inferred values.
+    return {};
+  }
+}
+
+/**
+ * Trims each value, discards blank/undefined entries, and returns the distinct
+ * results in default sort order.
+ *
+ * Trimming happens before de-duplication so that values differing only in
+ * surrounding whitespace collapse into one entry.
+ *
+ * @param values - Candidate strings; undefined or blank entries are ignored.
+ * @returns A new sorted array of distinct, trimmed, non-empty strings.
+ */
+function uniqueRuntimeSourceValues(values: Iterable<string | undefined>): readonly string[] {
+  const distinct = new Set<string>();
+  for (const value of values) {
+    const trimmed = value?.trim();
+    if (trimmed) {
+      distinct.add(trimmed);
+    }
+  }
+  return [...distinct].sort();
+}
+
+/**
+ * Resolves the runtime-source metadata shown for a run in the dashboard.
+ *
+ * Recorded metadata wins: the summary's value first, then the first record
+ * carrying one (camelCase or snake_case field). When nothing was recorded, a
+ * record is synthesised from what is known:
+ * - namespace: summary experiment, else the inferred experiment, else the
+ *   first record with an experiment, else `default`;
+ * - eval files: the summary eval file plus every record's eval path;
+ * - kind: `multi_eval` for more than one eval file, otherwise `direct_suite`;
+ * - `config_source` is fixed to `defaults` and the namespace source to `unknown`.
+ *
+ * @returns The metadata, or `undefined` when there are no eval files and the
+ *   resolved namespace is empty.
+ */
+function deriveDashboardRuntimeSource(params: {
+  readonly summaryMetadata: RunSummaryMetadataForDashboard;
+  readonly records: readonly {
+    evalPath?: string;
+    eval_path?: string;
+    suite?: string;
+    experiment?: string;
+    runtimeSource?: RunRuntimeSourceMetadata;
+    runtime_source?: RunRuntimeSourceMetadata;
+  }[];
+  readonly inferredExperiment?: string;
+}): RunRuntimeSourceMetadata | undefined {
+  const { summaryMetadata, records, inferredExperiment } = params;
+
+  const carrier = records.find((record) => record.runtimeSource ?? record.runtime_source);
+  const recorded =
+    summaryMetadata.runtimeSource ?? carrier?.runtimeSource ?? carrier?.runtime_source;
+  if (recorded) {
+    return recorded;
+  }
+
+  const namespace =
+    summaryMetadata.experiment ??
+    inferredExperiment ??
+    records.find((record) => record.experiment)?.experiment ??
+    'default';
+  const recordEvalPaths = records.map((record) => record.evalPath ?? record.eval_path);
+  const evalFiles = uniqueRuntimeSourceValues([summaryMetadata.evalFile, ...recordEvalPaths]);
+
+  const nothingKnown = evalFiles.length === 0 && !namespace;
+  if (nothingKnown) {
+    return undefined;
+  }
+
+  const kind = evalFiles.length > 1 ? 'multi_eval' : 'direct_suite';
+  return {
+    schema_version: 'agentv.runtime_source.v1',
+    kind,
+    config_source: 'defaults',
+    experiment_namespace: namespace,
+    experiment_namespace_source: 'unknown',
+    eval_files: evalFiles,
+  };
+}
+
function deriveResumeMeta(
cwd: string,
manifestPath: string,
+ summaryMetadata = readRunSummaryMetadataForDashboard(manifestPath),
): { run_dir?: string; suite_filter?: string; planned_test_count?: number } {
const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {};
const runDir = path.dirname(manifestPath);
@@ -1718,23 +1830,11 @@ function deriveResumeMeta(
// those absolute so the CLI doesn't get confused. An empty string ('' = same
// dir as cwd) is unusual but valid — fall through to absolute in that case.
out.run_dir = relative !== '' && !relative.startsWith('..') ? relative : runDir;
- try {
- const summaryPath = path.join(runDir, 'summary.json');
- if (existsSync(summaryPath)) {
- const parsed = JSON.parse(readFileSync(summaryPath, 'utf8')) as {
- metadata?: { eval_file?: string; planned_test_count?: number };
- };
- const evalFile = parsed.metadata?.eval_file;
- if (typeof evalFile === 'string' && evalFile.trim()) {
- out.suite_filter = evalFile.trim();
- }
- const planned = parsed.metadata?.planned_test_count;
- if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) {
- out.planned_test_count = planned;
- }
- }
- } catch {
- // summary.json missing / unreadable / malformed — leave fields unset.
+ if (summaryMetadata.evalFile) {
+ out.suite_filter = summaryMetadata.evalFile;
+ }
+ if (summaryMetadata.plannedTestCount !== undefined) {
+ out.planned_test_count = summaryMetadata.plannedTestCount;
}
return out;
}
@@ -3135,6 +3235,7 @@ export function createApp(
size_bytes: number;
target?: string;
experiment?: string;
+ runtime_source?: RunRuntimeSourceMetadata;
tags?: string[];
remote_tags?: string[];
pending_tags?: string[];
@@ -3152,6 +3253,8 @@ export function createApp(
for (const m of metas) {
let target: string | undefined;
let experiment = inferExperimentFromRunId(m.raw_filename);
+ const summaryMetadata = readRunSummaryMetadataForDashboard(m.path);
+ let runtimeSource: RunRuntimeSourceMetadata | undefined = summaryMetadata.runtimeSource;
let passRate = m.passRate;
let avgScore = m.avgScore;
let executionErrorCount = 0;
@@ -3167,6 +3270,11 @@ export function createApp(
passRate = qualitySummary.passRate;
avgScore = qualitySummary.avgScore;
executionErrorCount = qualitySummary.executionErrorCount;
+ runtimeSource = deriveDashboardRuntimeSource({
+ summaryMetadata,
+ records,
+ inferredExperiment: experiment,
+ });
}
} catch {
// ignore enrichment errors
@@ -3185,6 +3293,7 @@ export function createApp(
source: m.source,
...(target && { target }),
...(experiment && { experiment }),
+ ...(runtimeSource && { runtime_source: runtimeSource }),
...tagFields,
project_id: p.id,
project_name: p.name,
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 921b266a7..d248f474f 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -921,6 +921,31 @@ describe('writeArtifactsFromResults', () => {
expect(indexLines[0]?.metrics_path).toBe('alpha/run-1/metrics.json');
});
+ it('writes optional runtime source metadata to summary and index rows', async () => { // runtimeSource given to the writer must appear unchanged in both artifacts
+ const runtimeSource = {
+ schema_version: 'agentv.runtime_source.v1' as const,
+ kind: 'direct_suite' as const,
+ config_source: 'cli_flags' as const,
+ experiment_namespace: 'cli-smoke',
+ experiment_namespace_source: 'cli' as const,
+ eval_files: ['evals/smoke.eval.yaml'],
+ };
+ const paths = await writeArtifactsFromResults([makeResult({ testId: 'alpha' })], testDir, {
+ evalFile: 'evals/smoke.eval.yaml',
+ experiment: 'cli-smoke',
+ runtimeSource,
+ });
+
+ const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
+ const [indexLine] = (await readFile(paths.indexPath, 'utf8')) // only the first JSONL row is inspected
+ .trim()
+ .split('\n')
+ .map(JSON.parse);
+
+ expect(summary.metadata.runtime_source).toEqual(runtimeSource); // summary.json side
+ expect(indexLine.runtime_source).toEqual(runtimeSource); // index row side
+ });
+
it('writes repeat runs in Vercel-compatible case and run folders', async () => {
const results = [
makeResult({
diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts
index 0e69332ac..75f82a924 100644
--- a/apps/cli/test/commands/results/report.test.ts
+++ b/apps/cli/test/commands/results/report.test.ts
@@ -150,6 +150,32 @@ describe('results report', () => {
expect(html).not.toContain('Grader Results');
});
+ it('labels experiment namespace and runtime source in the report header', async () => { // checks the generated report HTML for the two header labels
+ const runDir = path.join(tempDir, 'run');
+ await writeArtifactsFromResults([makeResult()], runDir, {
+ evalFile: 'evals/wrapper.eval.yaml',
+ experiment: 'named-smoke',
+ runtimeSource: {
+ schema_version: 'agentv.runtime_source.v1',
+ kind: 'wrapper_eval',
+ config_source: 'inline_experiment',
+ experiment_namespace: 'named-smoke',
+ experiment_namespace_source: 'eval_metadata',
+ eval_files: ['evals/wrapper.eval.yaml'],
+ wrapper_eval_file: 'evals/wrapper.eval.yaml',
+ source_eval_files: ['evals/child.eval.yaml'],
+ },
+ });
+
+ const { outputPath } = await writeResultsReport(runDir, undefined, tempDir);
+ const html = readFileSync(outputPath, 'utf8');
+
+ expect(html).toContain('Experiment namespace: named-smoke');
+ expect(html).toContain( // expected order: kind label, namespace-source label, config-source label
+ 'Runtime source: Wrapper eval · Eval metadata namespace · Inline experiment config',
+ );
+ });
+
it('embeds result text containing replacement tokens without corrupting the inline script', async () => {
const runDir = path.join(tempDir, 'run');
await writeArtifactsFromResults(
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 35b2c8c38..32b25571a 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -1075,6 +1075,62 @@ describe('serve app', () => {
expect(data.runs[0].tag_revision).toStartWith('sha256:');
});
+ it('exposes experiment namespace and runtime source metadata for run list cards', async () => { // GET /api/runs must return the run's experiment and runtime_source
+ const experiment = 'named-wrapper';
+ const filename = '2026-03-25T10-05-00-000Z';
+ const runDir = localRunDir(tempDir, experiment, filename);
+ const runtimeSource = {
+ schema_version: 'agentv.runtime_source.v1' as const,
+ kind: 'wrapper_eval' as const,
+ config_source: 'inline_experiment' as const,
+ experiment_namespace: experiment,
+ experiment_namespace_source: 'eval_metadata' as const,
+ eval_files: ['evals/wrapper.eval.yaml'],
+ wrapper_eval_file: 'evals/wrapper.eval.yaml',
+ source_eval_files: ['evals/source.test.yaml'],
+ };
+ mkdirSync(runDir, { recursive: true });
+ writeFileSync( // fixture 1: index.jsonl row that carries runtime_source
+ path.join(runDir, 'index.jsonl'),
+ toJsonl({
+ ...RESULT_A,
+ experiment,
+ eval_path: 'evals/wrapper.eval.yaml',
+ runtime_source: runtimeSource,
+ }),
+ );
+ writeFileSync( // fixture 2: summary.json whose metadata carries the same runtime_source
+ path.join(runDir, 'summary.json'),
+ JSON.stringify(
+ {
+ metadata: {
+ timestamp: '2026-03-25T10:05:00.000Z',
+ experiment,
+ eval_file: 'evals/wrapper.eval.yaml',
+ runtime_source: runtimeSource,
+ },
+ run_summary: {},
+ },
+ null,
+ 2,
+ ),
+ );
+
+ const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+ const res = await app.request('/api/runs');
+
+ expect(res.status).toBe(200);
+ const data = (await res.json()) as {
+ runs: Array<{
+ experiment?: string;
+ runtime_source?: typeof runtimeSource;
+ }>;
+ };
+ expect(data.runs).toHaveLength(1);
+ expect(data.runs[0].experiment).toBe(experiment);
+ expect(data.runs[0].runtime_source).toEqual(runtimeSource); // returned unchanged from the fixture value
+ });
+
it('exposes sanitized Phoenix external_trace metadata through run detail only', async () => {
const filename = '2026-03-25T10-15-00-000Z';
createLocalRun(
@@ -4467,6 +4523,54 @@ describe('serve app', () => {
expect(data.suite_filter).toBe('examples/demo.eval.yaml');
});
+ it('includes runtime source metadata for local run details', async () => { // GET /api/runs/:id must return runtime_source for a local run
+ const runsDir = localResultsExperimentDir(tempDir, 'cli-smoke');
+ mkdirSync(runsDir, { recursive: true });
+ const filename = '2026-05-06T00-00-02-000Z';
+ const runDir = path.join(runsDir, filename);
+ const runtimeSource = {
+ schema_version: 'agentv.runtime_source.v1' as const,
+ kind: 'direct_suite' as const,
+ config_source: 'defaults' as const,
+ experiment_namespace: 'cli-smoke',
+ experiment_namespace_source: 'cli' as const,
+ eval_files: ['examples/demo.eval.yaml'],
+ };
+ mkdirSync(runDir, { recursive: true });
+ writeFileSync( // note: this index row has no runtime_source field
+ path.join(runDir, 'index.jsonl'),
+ toJsonl({
+ ...RESULT_A,
+ experiment: 'cli-smoke',
+ eval_path: 'examples/demo.eval.yaml',
+ }),
+ );
+ writeFileSync( // summary.json is the only fixture file that contains runtime_source
+ path.join(runDir, 'summary.json'),
+ JSON.stringify(
+ {
+ metadata: {
+ eval_file: 'examples/demo.eval.yaml',
+ experiment: 'cli-smoke',
+ runtime_source: runtimeSource,
+ },
+ run_summary: {},
+ },
+ null,
+ 2,
+ ),
+ );
+
+ const app = createApp([], tempDir, tempDir, undefined, { studioDir });
+ const res = await app.request(`/api/runs/${encodeURIComponent(`cli-smoke::${filename}`)}`); // run id format used here: `<experiment>::<filename>`, URL-encoded
+
+ expect(res.status).toBe(200);
+ const data = (await res.json()) as {
+ runtime_source?: typeof runtimeSource;
+ };
+ expect(data.runtime_source).toEqual(runtimeSource); // presumably sourced from summary.json metadata — confirm against the handler
+ });
+
it('omits suite_filter when summary.json is missing', async () => {
const runsDir = localResultsExperimentDir(tempDir);
mkdirSync(runsDir, { recursive: true });
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index 2f75409e6..9499e85cb 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -640,6 +640,16 @@ describe('agentv eval CLI', () => {
expect(
(benchmark.metadata?.experiment_config as Record).fingerprint,
).toMatch(/^[a-f0-9]{64}$/);
+ expect(benchmark.metadata?.runtime_source).toMatchObject({
+ schema_version: 'agentv.runtime_source.v1',
+ kind: 'wrapper_eval',
+ config_source: 'inline_experiment',
+ experiment_namespace: 'native-exp',
+ experiment_namespace_source: 'eval_metadata',
+ eval_files: ['native-exp.eval.yaml'],
+ wrapper_eval_file: 'native-exp.eval.yaml',
+ source_eval_files: ['sample.test.yaml'],
+ });
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
@@ -710,6 +720,46 @@ describe('agentv eval CLI', () => {
runBudgetCapUsd: 0.22,
evalCaseIds: ['second-case'],
});
+
+ const benchmark = JSON.parse(
+ await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'),
+ ) as { metadata?: Record };
+ expect(benchmark.metadata?.runtime_source).toMatchObject({
+ schema_version: 'agentv.runtime_source.v1',
+ kind: 'multi_eval',
+ config_source: 'mixed',
+ experiment_namespace: 'multi-eval',
+ experiment_namespace_source: 'multi_eval',
+ eval_files: ['first.eval.yaml', 'second.eval.yaml'],
+ });
+ } finally {
+ await rm(fixture.baseDir, { recursive: true, force: true });
+ }
+ }, 30_000);
+
+ it('records CLI-named experiment namespace separately from default runtime config', async () => {
+ const fixture = await createFixture();
+ try {
+ const { stdout, exitCode } = await runCli(fixture, [
+ 'eval',
+ fixture.testFilePath,
+ '--experiment',
+ 'cli-smoke',
+ ]);
+
+ expect(exitCode).toBe(0);
+ const outputPath = extractOutputPath(stdout);
+ const benchmark = JSON.parse(
+ await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'),
+ ) as { metadata?: Record };
+ expect(benchmark.metadata?.runtime_source).toMatchObject({
+ schema_version: 'agentv.runtime_source.v1',
+ kind: 'direct_suite',
+ config_source: 'defaults',
+ experiment_namespace: 'cli-smoke',
+ experiment_namespace_source: 'cli',
+ eval_files: ['sample.test.yaml'],
+ });
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
diff --git a/apps/dashboard/src/components/RunList.mobile.spec.tsx b/apps/dashboard/src/components/RunList.mobile.spec.tsx
index 4f3557634..a61bf0537 100644
--- a/apps/dashboard/src/components/RunList.mobile.spec.tsx
+++ b/apps/dashboard/src/components/RunList.mobile.spec.tsx
@@ -63,4 +63,25 @@ describe('buildRunListItemView', () => {
expect(view.display.secondary).toBe('remote-target');
expect(view.label).toBe('27/03 05:00 · remote-target');
});
+
+ it('builds explicit experiment namespace and runtime source labels', () => { // view model must expose namespace, summary label, and tooltip text
+ const view = buildRunListItemView(
+ runMeta({
+ experiment: 'smoke-suite',
+ runtime_source: {
+ schema_version: 'agentv.runtime_source.v1',
+ kind: 'multi_eval',
+ config_source: 'mixed',
+ experiment_namespace: 'smoke-suite',
+ experiment_namespace_source: 'cli',
+ eval_files: ['evals/a.eval.yaml', 'evals/b.eval.yaml'],
+ },
+ }),
+ 0.8,
+ );
+
+ expect(view.experimentNamespace).toBe('smoke-suite');
+ expect(view.runtimeSourceLabel).toBe('Multi-eval · CLI namespace · Mixed runtime config'); // kind · namespace source · config source
+ expect(view.runtimeSourceTitle).toContain('evals/a.eval.yaml'); // tooltip text lists the eval files
+ });
});
diff --git a/apps/dashboard/src/components/RunList.tsx b/apps/dashboard/src/components/RunList.tsx
index c8fe545ca..d6c83421a 100644
--- a/apps/dashboard/src/components/RunList.tsx
+++ b/apps/dashboard/src/components/RunList.tsx
@@ -38,6 +38,11 @@ import {
formatSelectedRunCount,
runSelectionDisabledReason,
} from '~/lib/run-list-actions';
+import {
+ experimentNamespaceLabel,
+ runtimeSourceSummary,
+ runtimeSourceTitle,
+} from '~/lib/runtime-source';
import type { CombineRunsResponse, RunMeta } from '~/lib/types';
import { PassRatePill } from './PassRatePill';
@@ -73,6 +78,9 @@ interface RunListItemView {
passedCount: number;
failedCount: number;
metadataDirty: boolean;
+ experimentNamespace: string;
+ runtimeSourceLabel?: string;
+ runtimeSourceTitle?: string;
}
function formatDate(ts: string | undefined | null): { date: string; full: string } {
@@ -106,6 +114,13 @@ export function buildRunListItemView(run: RunMeta, passThreshold: number): RunLi
const passedCount = Math.round(run.pass_rate * qualityCount);
const failedCount = qualityCount - passedCount;
const metadataDirty = run.metadata_dirty === true;
+ const experimentNamespace = experimentNamespaceLabel(run);
+ const runtimeSourceLabel = run.runtime_source
+ ? runtimeSourceSummary(run.runtime_source)
+ : undefined;
+ const runtimeSourceTooltip = run.runtime_source
+ ? runtimeSourceTitle(run.runtime_source)
+ : undefined;
return {
run,
@@ -119,6 +134,9 @@ export function buildRunListItemView(run: RunMeta, passThreshold: number): RunLi
passedCount,
failedCount,
metadataDirty,
+ experimentNamespace,
+ runtimeSourceLabel,
+ runtimeSourceTitle: runtimeSourceTooltip,
};
}
@@ -420,6 +438,9 @@ export function RunList({
passedCount,
failedCount,
metadataDirty,
+ experimentNamespace,
+ runtimeSourceLabel,
+ runtimeSourceTitle,
} = view;
const selectionDisabledReason = runSelectionDisabledReason(run);
const selectable = !selectionDisabledReason && selectableRunIds.includes(run.filename);
@@ -459,6 +480,11 @@ export function RunList({
{display.secondary}
) : null}
+
{metadataDirty ?
: null}
@@ -530,6 +556,9 @@ export function RunList({
passedCount,
failedCount,
metadataDirty,
+ experimentNamespace,
+ runtimeSourceLabel,
+ runtimeSourceTitle,
} = view;
const selectionDisabledReason = runSelectionDisabledReason(run);
const selectable =
@@ -579,6 +608,11 @@ export function RunList({
{display.secondary}
) : null}
+
@@ -739,6 +773,35 @@ function PendingSyncBadge() {
);
}
+function RunSourceBadges({ // Shows "Experiment: …" always and "Runtime: …" only when runtimeSourceLabel is truthy.
+ experimentNamespace,
+ runtimeSourceLabel,
+ runtimeSourceTitle, // NOTE(review): element markup is not visible in this extract, so how this prop is used cannot be confirmed here
+}: {
+ experimentNamespace: string;
+ runtimeSourceLabel?: string;
+ runtimeSourceTitle?: string;
+}) {
+ return (
+
+
+ Experiment: {experimentNamespace}
+
+ {runtimeSourceLabel ? (
+
+ Runtime: {runtimeSourceLabel}
+
+ ) : null}
+
+ );
+}
+
function RunStatusMark({ view, className = '' }: { view: RunListItemView; className?: string }) {
if (view.isActive) {
return (
diff --git a/apps/dashboard/src/lib/run-detail-context.test.ts b/apps/dashboard/src/lib/run-detail-context.test.ts
index aae745b23..b0c7b7a6e 100644
--- a/apps/dashboard/src/lib/run-detail-context.test.ts
+++ b/apps/dashboard/src/lib/run-detail-context.test.ts
@@ -51,6 +51,7 @@ describe('buildRunDetailHeader', () => {
expect(header.sourceBadge).toBe('Remote');
expect(header.sourceLabel).toBe('smoke-wtg-2026-06-04T02-19-00Z');
expect(header.sourceContext).toEqual([
+ { label: 'Experiment namespace', value: 'smoke' },
{ label: 'Repo', value: 'WiseTechGlobal/WTG.AI.Prompts.EvalResults' },
]);
expect(header.meta).toBe('codex · smoke · 2026-06-04T02:19:00.000Z');
@@ -67,7 +68,35 @@ describe('buildRunDetailHeader', () => {
expect(header.heading).toBe('azure');
expect(header.meta).toBe('azure · 2026-06-04T02:19:00.000Z · local');
expect(header.sourceBadge).toBeUndefined();
- expect(header.sourceContext).toEqual([]);
+ expect(header.sourceContext).toEqual([{ label: 'Experiment namespace', value: 'default' }]);
+ });
+
+ it('surfaces derived runtime source metadata when present', () => { // header context must gain namespace and runtime-source entries
+ const header = buildRunDetailHeader({
+ runId: 'native-exp::2026-06-04T02-19-00Z',
+ source: 'local',
+ results: localRunResults,
+ runtimeSource: {
+ schema_version: 'agentv.runtime_source.v1',
+ kind: 'wrapper_eval',
+ config_source: 'inline_experiment',
+ experiment_namespace: 'native-exp',
+ experiment_namespace_source: 'eval_metadata',
+ eval_files: ['evals/native-exp.eval.yaml'],
+ wrapper_eval_file: 'evals/native-exp.eval.yaml',
+ source_eval_files: ['evals/sample.eval.yaml'],
+ },
+ formatTimestamp: (timestamp) => timestamp, // identity formatter keeps the output locale-independent
+ });
+
+ expect(header.sourceContext).toContainEqual({ // toContainEqual: position within sourceContext is not asserted
+ label: 'Experiment namespace',
+ value: 'native-exp',
+ });
+ expect(header.sourceContext).toContainEqual({
+ label: 'Runtime source',
+ value: 'Wrapper eval · Eval metadata namespace · Inline experiment config',
+ });
});
});
diff --git a/apps/dashboard/src/lib/run-detail-context.ts b/apps/dashboard/src/lib/run-detail-context.ts
index 378c07cc6..33cf0c9d3 100644
--- a/apps/dashboard/src/lib/run-detail-context.ts
+++ b/apps/dashboard/src/lib/run-detail-context.ts
@@ -11,9 +11,11 @@
* repeated labels for single-eval runs.
*/
+import { experimentNamespaceLabel, runtimeSourceSummary } from './runtime-source';
import type { EvalResult, RunDetailResponse } from './types';
type RunSource = RunDetailResponse['source'];
+type RunRuntimeSource = RunDetailResponse['runtime_source'];
type HeaderResult = Pick;
type SuiteLabelResult = Pick;
@@ -24,6 +26,7 @@ export interface RunDetailHeaderInput {
results: readonly HeaderResult[];
source?: RunSource;
sourceLabel?: string;
+ runtimeSource?: RunRuntimeSource;
remoteRepo?: string;
formatTimestamp?: (timestamp: string) => string;
}
@@ -86,6 +89,19 @@ export function buildRunDetailHeader(input: RunDetailHeaderInput): RunDetailHead
const remoteRepo = cleanOptional(input.remoteRepo);
const sourceContext: RunDetailHeaderContextItem[] = [];
+ sourceContext.push({
+ label: 'Experiment namespace',
+ value: experimentNamespaceLabel({
+ experiment: firstResult?.experiment,
+ runtime_source: input.runtimeSource,
+ }),
+ });
+ if (input.runtimeSource) {
+ sourceContext.push({
+ label: 'Runtime source',
+ value: runtimeSourceSummary(input.runtimeSource),
+ });
+ }
if (isRemote) {
if (sourceLabel && sourceLabel !== heading) {
sourceContext.push({ label: 'Source', value: sourceLabel });
diff --git a/apps/dashboard/src/lib/runtime-source.ts b/apps/dashboard/src/lib/runtime-source.ts
new file mode 100644
index 000000000..6881a945c
--- /dev/null
+++ b/apps/dashboard/src/lib/runtime-source.ts
@@ -0,0 +1,85 @@
+import type { RunRuntimeSource } from './types';
+
+export function runtimeKindLabel(kind: RunRuntimeSource['kind'] | undefined): string { // Maps a runtime-source `kind` value to its display label.
+ switch (kind) {
+ case 'direct_suite':
+ return 'Direct suite';
+ case 'wrapper_eval':
+ return 'Wrapper eval';
+ case 'multi_eval':
+ return 'Multi-eval';
+ default: // undefined and any unrecognized kind string
+ return 'Unknown source';
+ }
+}
+
+export function runtimeConfigSourceLabel( // Maps a `config_source` value to its display label.
+ source: RunRuntimeSource['config_source'] | undefined,
+): string {
+ switch (source) {
+ case 'inline_experiment':
+ return 'Inline experiment config';
+ case 'cli_flags':
+ return 'CLI runtime flags';
+ case 'mixed':
+ return 'Mixed runtime config';
+ case 'defaults':
+ return 'Default runtime config';
+ default: // undefined and any unrecognized config source string
+ return 'Unknown runtime config';
+ }
+}
+
+export function experimentNamespaceSourceLabel( // Maps an `experiment_namespace_source` value to its display label.
+ source: RunRuntimeSource['experiment_namespace_source'] | undefined,
+): string {
+ switch (source) {
+ case 'cli':
+ return 'CLI namespace';
+ case 'eval_metadata':
+ return 'Eval metadata namespace';
+ case 'eval_filename':
+ return 'Eval filename namespace';
+ case 'multi_eval':
+ return 'Multi-eval namespace';
+ default: // also covers the declared 'unknown' value, which has no case of its own, plus undefined
+ return 'Namespace source unknown';
+ }
+}
+
+export function experimentNamespaceLabel(input: { // Picks the experiment namespace string to display for a run.
+ experiment?: string;
+ runtime_source?: RunRuntimeSource;
+}): string {
+ return ( // first non-empty value after trim(): runtime_source.experiment_namespace, then experiment, else 'default'
+ input.runtime_source?.experiment_namespace?.trim() || input.experiment?.trim() || 'default'
+ );
+}
+
+export function runtimeSourceSummary(runtimeSource: RunRuntimeSource | undefined): string { // Builds the one-line summary: kind · namespace source · config source.
+ if (!runtimeSource) {
+ return 'Runtime source unknown'; // fixed fallback when no metadata object is given
+ }
+ return [
+ runtimeKindLabel(runtimeSource.kind),
+ experimentNamespaceSourceLabel(runtimeSource.experiment_namespace_source),
+ runtimeConfigSourceLabel(runtimeSource.config_source),
+ ].join(' · '); // always three parts; a missing field contributes its "unknown" label instead of being dropped
+}
+
+export function runtimeSourceTitle(runtimeSource: RunRuntimeSource | undefined): string { // Builds multi-line detail text: the summary line plus any file lists present.
+ if (!runtimeSource) {
+ return 'Runtime source metadata was not recorded for this run.';
+ }
+ const lines = [runtimeSourceSummary(runtimeSource)]; // first line is always the one-line summary
+ if (runtimeSource.eval_files && runtimeSource.eval_files.length > 0) { // missing or empty lists add no line
+ lines.push(`Eval files: ${runtimeSource.eval_files.join(', ')}`);
+ }
+ if (runtimeSource.wrapper_eval_file) {
+ lines.push(`Wrapper eval: ${runtimeSource.wrapper_eval_file}`);
+ }
+ if (runtimeSource.source_eval_files && runtimeSource.source_eval_files.length > 0) {
+ lines.push(`Source eval files: ${runtimeSource.source_eval_files.join(', ')}`);
+ }
+ return lines.join('\n'); // newline-separated
+}
diff --git a/apps/dashboard/src/lib/types.ts b/apps/dashboard/src/lib/types.ts
index 835504ce3..89b0081b3 100644
--- a/apps/dashboard/src/lib/types.ts
+++ b/apps/dashboard/src/lib/types.ts
@@ -32,6 +32,7 @@ export interface RunMeta {
size_bytes: number;
target?: string;
experiment?: string;
+ runtime_source?: RunRuntimeSource;
source: 'local' | 'remote';
/**
* True when this run is present on the configured remote results branch.
@@ -266,6 +267,7 @@ export interface RunDetailResponse {
results: EvalResult[];
source: 'local' | 'remote';
source_label?: string;
+ runtime_source?: RunRuntimeSource;
final_state?: RunFinalState;
tag_revision?: string;
/** Live execution status when this run is still tracked in-memory by Dashboard. */
@@ -278,6 +280,23 @@ export interface RunDetailResponse {
planned_test_count?: number;
}
+export interface RunRuntimeSource { // Shape of the optional `runtime_source` field on RunMeta and RunDetailResponse; every member is optional.
+ schema_version?: 'agentv.runtime_source.v1';
+ kind?: 'direct_suite' | 'wrapper_eval' | 'multi_eval' | string; // `| string` admits values beyond the listed literals
+ config_source?: 'defaults' | 'inline_experiment' | 'cli_flags' | 'mixed' | string; // `| string` admits values beyond the listed literals
+ experiment_namespace?: string;
+ experiment_namespace_source?: // listed literals plus any other string
+ | 'cli'
+ | 'eval_metadata'
+ | 'eval_filename'
+ | 'multi_eval'
+ | 'unknown'
+ | string;
+ eval_files?: string[];
+ wrapper_eval_file?: string;
+ source_eval_files?: string[];
+}
+
export interface SuiteSummary {
name: string;
total: number;
diff --git a/apps/dashboard/src/routes/projects/$projectId_/runs/$runId.tsx b/apps/dashboard/src/routes/projects/$projectId_/runs/$runId.tsx
index 03edba3c6..1192d4463 100644
--- a/apps/dashboard/src/routes/projects/$projectId_/runs/$runId.tsx
+++ b/apps/dashboard/src/routes/projects/$projectId_/runs/$runId.tsx
@@ -57,6 +57,7 @@ function ProjectRunDetailPage() {
results: data?.results ?? [],
source: data?.source,
sourceLabel: data?.source_label,
+ runtimeSource: data?.runtime_source,
remoteRepo: data?.source === 'remote' ? remoteStatus?.repo : undefined,
formatTimestamp: (value) => new Date(value).toLocaleString(),
});
diff --git a/apps/dashboard/src/routes/runs/$runId.tsx b/apps/dashboard/src/routes/runs/$runId.tsx
index 28b403674..e9c93f695 100644
--- a/apps/dashboard/src/routes/runs/$runId.tsx
+++ b/apps/dashboard/src/routes/runs/$runId.tsx
@@ -58,6 +58,7 @@ function RunDetailPage() {
results: data?.results ?? [],
source: data?.source,
sourceLabel: data?.source_label,
+ runtimeSource: data?.runtime_source,
remoteRepo: data?.source === 'remote' ? remoteStatus?.repo : undefined,
formatTimestamp: (value) => new Date(value).toLocaleString(),
});
diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
index 3d7cb3aad..f36db7494 100644
--- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx
@@ -5,13 +5,84 @@ sidebar:
order: 1
---
-Evaluation files define the test cases, graders, workspace lifecycle, and inline runtime block for an evaluation run. Runtime choices such as target matrices, thresholds, budgets, and repeat runs belong under top-level [`experiment:`](/docs/evaluation/experiments/). Install, build, and reset commands belong under `workspace.hooks`; runner-specific setup belongs under `targets[].hooks`. AgentV supports two eval formats: YAML and JSONL.
+Evaluation files define the test cases, graders, workspace lifecycle, and inline runtime block for an evaluation run. Runtime choices such as target matrices, thresholds, budgets, and repeat runs belong under top-level [`experiment:`](/docs/evaluation/experiments/). Install, build, and reset commands belong under `workspace.hooks`; runner-specific setup belongs under `targets[].hooks`. AgentV supports two eval data formats: YAML and JSONL.
YAML is the canonical portable model. TypeScript helpers, generated fixtures, and Python scripts should lower to the same YAML/JSONL shapes rather than inventing a separate eval contract.
-## Suites
+## Authoring Shapes
-An eval file is a **suite**: it binds test cases to task context, assertions, reusable fixtures, and the inline runtime block. Test cases can be inline, loaded from an external file via `tests: ./cases.yaml`, or imported with `tests[].include`.
+Eval YAML is AgentV's composable and runnable authoring primitive. Use ordinary
+`*.eval.yaml` files for direct task suites and for wrapper evals that compose
+other suites. Raw case files are reusable data inputs, not a second runnable
+experiment format.
+
+- A **task suite** is eval YAML that owns task context: `workspace`, shared
+ `input`, shared `assertions`, and test cases. It can run directly or be
+ imported with `type: suite`.
+- A **raw case file** is a YAML/JSONL array, directory, or glob of cases. Import
+ it with `tests: ./cases.yaml`, string shorthand, or `type: tests`; parent
+ suite context applies because raw cases do not carry their own suite context.
+- A **wrapper eval** is eval YAML that imports one or more suites with
+ `type: suite` and binds runtime policy in its inline `experiment:` block.
+ Wrapper evals can live anywhere in the repo. A wrapper that imports suites
+ with `type: suite` must not define parent workspace fields such as
+ `workspace`, `experiment.workspace`, or legacy `execution.workspace`;
+ imported suites own task environment.
+
+For example, a reusable task suite can keep the task contract in one file:
+
+```yaml
+# evals/suites/refunds.eval.yaml
+suite: refunds
+workspace:
+ repos:
+ - path: ./support-app
+ repo: acme/support-app
+ commit: main
+input: Answer using the refund policy in the workspace.
+assertions:
+ - Applies the refund policy correctly
+tests:
+ - id: missing-receipt
+ input: Can this customer get a refund without a receipt?
+```
+
+Raw cases are just case data:
+
+```yaml
+# evals/cases/refund-smoke.cases.yaml
+- id: damaged-item
+ input: The item arrived damaged. What should support do?
+ expected_output: Offer a replacement or refund path.
+```
+
+A wrapper eval stays ordinary eval YAML while choosing runtime policy:
+
+```yaml
+# experiments/refunds-codex.eval.yaml
+experiment:
+ name: refunds-codex
+ target: codex-gpt5
+ repeat:
+ count: 2
+ strategy: pass_all
+
+tests:
+ - include: ../evals/suites/refunds.eval.yaml
+ type: suite
+ - include: ../evals/cases/refund-smoke.cases.yaml
+ type: tests
+```
+
+The `experiments/` directory in that example is optional and user-owned. AgentV
+does not infer behavior from the path; the wrapper runs because it is eval YAML
+with an inline `experiment:` block. The wrapper owns runtime policy only. Put
+workspace setup in imported child suites. Parent workspace-affecting fields,
+such as `workspace`, `experiment.workspace`, and legacy `execution.workspace`,
+are for parent-owned raw cases, including cases imported with `type: tests`.
+`experiment.workspace` is only a runtime `mode`/`path`
+override; repos, hooks, templates, Docker config, and isolation belong in
+top-level or case-level `workspace`.
## YAML Format
@@ -39,11 +110,15 @@ tests:
| `description` | Human-readable description of the evaluation |
| `suite` | Optional suite identifier |
| `experiment` | Runtime policy (`target`, `targets`, `workers`, `repeat`, `threshold`, `timeout_seconds`, `budget_usd`, etc.) |
-| `workspace` | Suite-level workspace config — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). |
+| `workspace` | Suite-level task environment — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config). Repo entries declare identity and checkout pins; acquisition is covered in [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition). |
| `tests` | Array of individual tests, include entries, or a string path to an external file or directory. Tests and include entries may use scoped `run:` overrides for `threshold`, `repeat`, `timeout_seconds`, and `budget_usd`. |
| `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test |
| `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test |
+`workspace` is what the agent can inspect or modify through tools, not prompt
+input. Put instructions in `input`; put repos, templates, and lifecycle setup in
+`workspace`.
+
For historical or repo-state evals, put the checkout under
`workspace.repos[].commit` or `workspace.repos[].base_commit`. A commit SHA in
the prompt or metadata is useful context, but it does not materialize a repo for
@@ -229,9 +304,9 @@ Use explicit `input` when the prompt is short or generated from YAML variables.
Use `PROMPT.md` when the task text is long enough that duplicating it inside
YAML would make the eval hard to review.
-### Tests as String Path
+### Raw Cases as String Paths
-Instead of inlining tests in the same file, you can point `tests` to an external YAML or JSONL file. This is the inverse of the sidecar pattern — the metadata file references the test data:
+Instead of inlining tests in the same file, you can point `tests` to an external YAML or JSONL file of raw cases. This is the inverse of the sidecar pattern — the metadata file references the test data:
```yaml
name: my-eval
@@ -241,10 +316,10 @@ experiment:
tests: ./cases.yaml
```
-The path is resolved relative to the eval file's directory. The external file
-should contain a YAML array of test objects or a JSONL file with one test per
-line. String entries inside a `tests:` list work the same way and may use direct
-paths, directories, or globs:
+The path is resolved relative to the eval file's directory. The external raw
+case file should contain a YAML array of test objects or a JSONL file with one
+test per line. String entries inside a `tests:` list work the same way and may
+use direct paths, directories, or globs:
```yaml
tests:
@@ -253,10 +328,11 @@ tests:
type: suite
```
-String shorthand is raw-case-only. Import eval suites with object entries using
-`include:` and `type: suite`.
+String shorthand is raw-case-only. Import reusable task suites with object
+entries using `include:` and `type: suite`; use `type: tests` when you want to
+drop suite context and import only raw cases.
-### Tests as Directory Path
+### Raw Cases as Directory Paths
When `tests` points to a directory, AgentV auto-discovers test cases from subdirectories. Each subdirectory containing a `case.yaml` (or `case.yml`) becomes a test case:
@@ -292,7 +368,7 @@ input: Fix the null check bug in parser.ts
- **Alphabetical ordering:** Subdirectories are sorted alphabetically for deterministic order
- **Per-case workspace:** A `workspace/` subdirectory inside the case directory automatically sets `workspace.template` to that path, unless the case already defines a `workspace` field
- **Skipped directories:** Subdirectories without `case.yaml` are skipped with a warning
-- **Suite-level config applies:** Suite-level `assertions`, `input`, `workspace`, and `execution` still apply to directory-discovered cases
+- **Suite-level config applies:** Suite-level `assertions`, `input`, `workspace`, and `experiment` still apply to directory-discovered cases
This pattern is useful for benchmarks with many cases, where each case benefits from its own directory for workspace templates, supporting files, or documentation.
For guidance on keeping provenance metadata, patches, oracle files, and generated
diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
index 8bb05eee3..88f5a8399 100644
--- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
@@ -8,6 +8,8 @@ sidebar:
AgentV eval files are the only runnable authoring artifact. Use top-level
`experiment:` inside `eval.yaml` for runtime choices: targets, workers,
timeout, sandbox/runtime knobs, budgets, thresholds, and repeat-run policy.
+AgentV does not have a separate `experiment.yaml` file, top-level `run_group`,
+or schema-significant `experiments/` directory.
```yaml
name: support-regression
@@ -35,6 +37,43 @@ tests:
`execution:` is accepted only as a legacy top-level alias for existing eval
files. Do not use both `experiment:` and `execution:` in the same eval.
+## Layout Conventions
+
+Use directories for human organization, not schema behavior. A common layout is:
+
+```text
+evals/
+ suites/
+ refunds.eval.yaml
+ cases/
+ refund-smoke.cases.yaml
+experiments/
+ refunds-codex.eval.yaml
+```
+
+In that layout, `evals/suites/refunds.eval.yaml` is a reusable task suite,
+`evals/cases/refund-smoke.cases.yaml` is raw case data, and
+`experiments/refunds-codex.eval.yaml` is a wrapper eval. The wrapper still runs
+only because it is eval YAML:
+
+```yaml
+# experiments/refunds-codex.eval.yaml
+experiment:
+ name: refunds-codex
+ target: codex-gpt5
+ workers: 2
+
+tests:
+ - include: ../evals/suites/refunds.eval.yaml
+ type: suite
+ - include: ../evals/cases/refund-smoke.cases.yaml
+ type: tests
+```
+
+The `experiments/` folder is optional and user-owned. AgentV does not scan it
+for special files or infer runtime behavior from the path; the same wrapper eval
+could live under `evals/wrappers/`, `benchmarks/`, or beside the suite it runs.
+
## Tests Imports
Use `tests[]` for composition, imports, and selection.
@@ -64,13 +103,15 @@ tests:
`type: suite` preserves the imported suite's task contract: metadata,
`workspace`, shared `input`, shared `assertions`, and tests. The parent eval
-still owns the single run bundle. Runtime defaults from an imported suite apply
-only where they can be scoped to that suite's tests: `threshold`, `repeat` or
-`runs`, `timeout_seconds`, and `budget_usd`. If the parent eval supplies one of
-those defaults, the parent value wins for imported tests. Fields that cannot be
-scoped inside one parent run, such as `target`, `targets`, `workers`,
-`workspace`, `agent`, `model`, `agent_options`, and `sandbox`, must be supplied
-by the parent experiment when importing the suite.
+still owns the single run bundle and runtime policy. Child suite
+`experiment:` blocks are ignored when imported; use parent `experiment:` for
+run policy and `tests[].run` for scoped threshold, repeat, timeout, or budget
+overrides.
+
+A parent eval that imports any `type: suite` entry must not define top-level
+`workspace`. Imported suites own task environment. If the parent should provide
+workspace context, import raw cases with `type: tests` or shorthand paths
+instead of importing an eval suite.
`type: tests` imports only raw test entries. It intentionally drops shared
context from an imported eval suite, so parent suite fields apply to those raw
@@ -104,7 +145,7 @@ Use scoped `run:` blocks for result interpretation and scheduling policies that
vary by include group or test case. Precedence is:
```text
-test.run > tests[].run > parent experiment > imported suite experiment defaults
+test.run > tests[].run > parent experiment
```
```yaml
@@ -160,6 +201,7 @@ prepare files, dependencies, repos, or target-specific runner state.
| Reset or apply per-case state | `workspace.hooks.before_each` / `workspace.hooks.after_each` |
| Configure an agent runner or provider variant | `targets[].hooks` |
| Choose targets, repeats, pass policy, budget, threshold | `experiment` |
+| Override run workspace mode/path without changing task setup | `experiment.workspace.mode` / `experiment.workspace.path` |
```yaml
workspace:
@@ -181,6 +223,12 @@ experiment:
strategy: pass_at_k
```
+`experiment.workspace` is intentionally limited to `mode` and `path`, matching
+the `--workspace-mode` and `--workspace-path` CLI flags. Put repos, templates,
+hooks, Docker config, and isolation under top-level or case-level `workspace`.
+Wrapper evals that import child evals with `type: suite` must not define
+`experiment.workspace`; imported suites own the task workspace.
+
## Repeat Runs
`repeat` supports the same core strategies as repeated attempts:
@@ -215,12 +263,19 @@ Do not set both `repeat` and `runs` in the same runtime block.
## Result Layout
-Default eval runs write to:
+Eval runs write to the selected result group:
```text
-.agentv/results///
+.agentv/results/<result-group>/<timestamp>/
```
+CLI `--experiment` sets the result group explicitly. Without that flag, AgentV
+derives the group from the eval input: a single eval uses the eval metadata
+`name` when present or the eval filename otherwise, and multiple eval files use
+`multi-eval`. Inline `experiment.name` does not currently select the result
+group.
+
Imported source suite metadata appears in `index.jsonl` rows and manifests.
-AgentV does not add a redundant suite directory when the result group is already
-the eval name.
+Use `index.jsonl` fields such as `eval_path`, `test_id`, `target`, and
+`result_dir` for identity and artifact discovery instead of reconstructing paths
+from suite names or wrapper layout.
diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
index f6664b336..018892bd3 100644
--- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx
@@ -328,7 +328,7 @@ workspace:
Notes:
- Pooling is default for shared workspaces with repos when mode is not specified.
- `mode: static` (or `--workspace-mode static`) uses `path` / `--workspace-path`. When the path is empty or missing, the workspace is auto-materialised (template copied + repos cloned). Populated directories are reused as-is.
-- Static mode is incompatible with `isolation: per_test`.
+- Static mode is incompatible with `isolation: per_case`.
- `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset).
- Pool slots are managed separately (`agentv workspace list|clean`).
diff --git a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
index 33ad6bf50..8ec466839 100644
--- a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
+++ b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx
@@ -98,7 +98,7 @@ name: swe-style-regression
description: Regression tasks against pinned source commits.
workspace:
- isolation: per_test
+ isolation: per_case
repos:
- path: ./repo
repo: https://github.com/example/widget.git
@@ -153,7 +153,7 @@ primitives.
name: repo-regressions
workspace:
- isolation: per_test
+ isolation: per_case
repos:
- path: ./repo
repo: https://github.com/example/widget.git
@@ -207,9 +207,11 @@ When one eval references another eval, preserve the task/runtime split:
- The parent runnable eval owns runtime `experiment:` for the run.
- Child `experiment:` blocks are ignored by `type: suite` composition. There is
no fallback to the child `experiment:` when the parent has no `experiment:`.
-- Child `workspace` setup is preserved for `type: suite` imports. Parent
- workspace applies to raw cases owned by the parent file, not to imported suite
- tests.
+- Child `workspace` setup is preserved for `type: suite` imports. A parent eval
+ that imports any `type: suite` entry must not define parent workspace fields
+ such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`.
+ Parent workspace context is for parent-owned raw cases, including raw cases
+ imported with `type: tests`.
- A tests-only import can drop child workspace context only when the import mode
says so explicitly.
- Workspace path collisions or incompatible isolation settings should fail
diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
index 3c738ff9d..823a3dd70 100644
--- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
+++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx
@@ -211,4 +211,4 @@ CLI flags `--retain-on-success` / `--retain-on-failure` control temporary eval-r
- You need guaranteed clean-slate isolation between runs
- You're debugging workspace setup issues and want fresh clones each time
- You use `mode: static` with a pre-existing or auto-materialised directory (pooling is automatically skipped)
-- You need `isolation: per_test` (each test gets its own workspace copy; pooling is automatically skipped)
+- You need `isolation: per_case` (each test gets its own workspace copy; pooling is automatically skipped)
diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx
index 6ff210675..48092b649 100644
--- a/apps/web/src/content/docs/docs/targets/configuration.mdx
+++ b/apps/web/src/content/docs/docs/targets/configuration.mdx
@@ -165,7 +165,7 @@ workspace:
hooks:
after_each:
reset: fast # none | fast | strict
- isolation: shared # shared (default) | per_test
+ isolation: shared # shared (default) | per_case
mode: pooled # pooled | temp | static
path: /tmp/my-ws # workspace path for mode=static
```
@@ -181,11 +181,13 @@ workspace:
| `repos[].ancestor` | Walk N commits back from the checked-out ref (e.g., `1` for parent) |
| `repos[].sparse` | Sparse checkout paths |
| `hooks.after_each.reset` | Reset policy after each test: `none`, `fast`, `strict` |
-| `isolation` | `shared` reuses one workspace; `per_test` creates a fresh copy per test |
+| `isolation` | `shared` reuses one workspace; `per_case` creates a fresh copy per test case |
| `mode` | Workspace mode: `pooled`, `temp`, `static` |
| `path` | Workspace path for `mode=static`. When empty or missing, the workspace is auto-materialised (template copied + repos cloned). Populated directories are reused as-is. |
| `hooks.enabled` | Boolean (default: `true`). Set `false` to skip all lifecycle hooks. |
+`isolation: per_case` is the canonical spelling for giving each test case a fresh workspace.
+
**Pooling:** `mode: pooled` (or default shared repo mode) reuses pool slots between runs. Use `mode: temp` to disable pooling for fresh clone/checkouts each run.
**Static auto-materialisation:** When `mode: static` and `path` points to an empty or missing directory, AgentV automatically copies the template and clones repos into it. If the directory already exists and is populated, it is reused as-is.
diff --git a/docs/adr/0006-separate-experiments-from-eval-definitions.md b/docs/adr/0006-separate-experiments-from-eval-definitions.md
index 9c3c703b1..0cc4f8f46 100644
--- a/docs/adr/0006-separate-experiments-from-eval-definitions.md
+++ b/docs/adr/0006-separate-experiments-from-eval-definitions.md
@@ -216,13 +216,20 @@ order inside each resolved source.
`type: suite` preserves the imported suite task contract. That includes suite
metadata, `workspace`, shared `input`, shared `assertions`, and tests. The
-parent eval still owns one run bundle. Child suite `experiment:` defaults apply
-to imported tests only when the field can be scoped per test:
-`threshold`, `repeat` or `runs`, `timeout_seconds`, and `budget_usd`.
-Where the parent eval supplies one of those defaults, the parent value wins.
-Fields that cannot vary per imported suite inside one parent run, such as
-`target`, `targets`, `workers`, `workspace`, `agent`, `model`, `agent_options`,
-and `sandbox`, must be supplied by the parent experiment for imported suites.
+parent eval still owns one run bundle and one runtime policy. Child suite
+`experiment:` blocks are ignored when imported with `type: suite`; they do not
+fall back into the parent run. Scoped runtime overrides that the parent wants
+to apply to imported tests live in `tests[].run`.
+
+A parent eval that imports any child eval suite with `type: suite` must not
+define parent workspace-affecting fields, including `workspace`,
+`experiment.workspace`, or legacy `execution.workspace`. The wrapper owns
+runtime policy, not task environment. Imported child suites keep their own
+`workspace`, including `workspace.repos[]`, templates, hooks, and isolation.
+`experiment.workspace` remains a narrow runtime override for `mode` and `path`
+only; it is not a place for repos, hooks, templates, Docker config, or
+isolation. If the parent should own workspace context, import raw cases with
+`type: tests` or shorthand paths instead of importing an eval suite.
`type: tests` imports only raw test entries. It intentionally drops shared
suite context such as workspace, shared input, and shared assertions. Use this
@@ -257,11 +264,11 @@ needs it, but the default composition model must not merge task contracts in a
surprising way.
If a parent eval defines `workspace` and imports child eval suites with
-`type: suite`, the parent workspace applies only to raw cases owned by the
-parent file. Imported suite tests keep their child suite workspace. This is a
-valid mixed-case pattern when the parent owns raw cases, but it is usually a
-DX smell when every test is a `type: suite` import. AgentV should warn or lint
-that shape rather than silently implying a parent workspace override.
+`type: suite`, AgentV should reject the file during validation and loading.
+That removes the ambiguous parent-child workspace merge question from the
+authoring model. A parent eval may still define `workspace` when it imports raw
+cases with `type: tests`, because those raw cases intentionally use parent
+suite context.
If a parent eval has no `experiment:` and imports child suites that do have
`experiment:` blocks, child runtime still does not fall back into the parent
@@ -270,7 +277,7 @@ used. The correct choices are to run the child suite directly, add a parent
`experiment:` block, or pass CLI runtime flags.
Wrapper evals that import multiple suites with distinct shared workspace
-contracts should fail fast or require per-test isolation, separate runs, or an
+contracts should fail fast or require per-case isolation, separate runs, or an
explicit future composition mode. Shared workspace setup is safe when one suite
owns the task contract; it is not a place for implicit parent-child or
child-child workspace merging.
@@ -286,7 +293,7 @@ policy without creating separate experiment files.
Runtime override precedence is:
```text
-test.run > tests[].run > parent experiment > imported suite experiment defaults
+test.run > tests[].run > parent experiment
```
Group-level overrides live beside `include`, `type`, and `select`:
@@ -350,17 +357,18 @@ suite-level `execution`, `workspace`, `input`, and `assertions`.
When a wrapper eval imports it with `type: suite`, AgentV must preserve its
shared `workspace`, `input`, and `assertions` because those fields are part of
the task contract. Its `execution` block is the legacy spelling for child
-runtime configuration. Under this decision, the child runtime block is treated
-as child `experiment`/legacy `execution`: scoped defaults such as threshold,
-repeat policy, timeout, and budget can follow the imported tests, while
+runtime configuration. Under this decision, child `experiment`/legacy
+`execution` blocks are ignored in wrapper composition; scoped threshold,
+repeat, timeout, and budget overrides must be authored on the parent
+`tests[].run` include entry or on child tests themselves, while
candidate-changing fields must be supplied by the parent wrapper eval's
`experiment:`.
This is the motivating distinction:
- task context from imported suites is preserved;
-- child runtime policy from imported suites contributes scoped defaults only
- where a parent runtime policy does not override them;
+- child runtime policy from imported suites is ignored in wrapper composition;
+ scoped runtime overrides are explicit `tests[].run` data;
- raw-case imports do not inherit suite context.
## Result Layout
@@ -415,9 +423,8 @@ Negative:
evals.
- Explicit task-context override syntax is deferred, so authors who need
overrides must create a new suite or wait for a focused override design.
-- Wrapper evals need diagnostics so authors understand that parent workspace
- does not override imported suite workspaces and child experiment blocks are
- ignored.
+- Wrapper evals need diagnostics so authors understand that parent workspace is
+ invalid with `type: suite` imports and child experiment blocks are ignored.
## Non-Goals
diff --git a/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md b/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md
index f94ea687f..e6ac498f2 100644
--- a/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md
+++ b/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md
@@ -69,8 +69,13 @@ references child eval files with `type: suite`, the current loader ignores the
child `experiment:` block and uses the parent `experiment:` when one exists; it
does not fall back to the child `experiment:`. Workspace follows task ownership,
not runtime fallback: imported child tests keep the child suite workspace that
-was already expanded into those tests, while the parent workspace applies to
-raw cases owned by the parent file. A "tests only" import mode may drop child
+was already expanded into those tests. Therefore a parent eval that imports any
+child eval with `type: suite` must not define parent workspace-affecting fields
+such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`.
+Parent workspace context is valid for parent-owned raw cases only, including
+raw cases imported with `type: tests` or shorthand paths. `experiment.workspace`
+is limited to runtime `mode` and `path`; task environment fields remain in
+top-level or case-level `workspace`. A "tests only" import mode may drop child
workspace context, but that must be opt-in.
ADR 0006 defines the contract-layer model behind this rule: task data, task
@@ -86,14 +91,17 @@ imported cases' validity.
This decision creates follow-up behavior and docs beads:
- `av-pkp` adds authoring diagnostics for misleading wrapper composition,
- including parent workspace with only suite imports and ignored child
- experiments.
-- `av-ha5` guards incompatible imported-suite shared workspace compositions so
- one wrapper run cannot silently use the wrong shared workspace.
+ including forbidden parent workspace on suite-import wrappers and ignored
+ child experiments.
+- `av-ha5` rejects parent workspace on suite-import wrappers and guards against
+ incompatible imported-suite shared workspace compositions so one wrapper run
+ cannot silently use the wrong shared workspace.
- `av-82t` improves Dashboard/report display of the existing experiment
namespace and derived runtime source without adding new authored primitives.
- `av-58q` teaches the optional `evals/suites/` and `experiments/`
wrapper-eval folder convention without making the path schema-significant.
+- `av-dxp` adds a regression eval for this architecture decision so future
+ agents do not recommend parent workspace on suite-import wrappers.
## Consequences
@@ -116,8 +124,9 @@ Negative:
- AgentV still needs strong docs examples so authors do not invent competing
provenance keys.
-- Import/composition behavior needs a focused follow-up if parent evals include
- child evals with conflicting workspaces.
+- Import/composition behavior needs focused diagnostics because parent
+ `workspace` is invalid on suite-import wrappers and child suite workspaces
+ remain task-owned.
- Some imported benchmark vocabulary such as SWE-bench `base_commit` must be
translated at the adapter boundary.
- Diagnostics are needed because the one-primitive model puts task suites and
@@ -136,6 +145,10 @@ Negative:
checkout and works for branches, tags, SHAs, and non-SWE benchmarks.
- **Drop child workspaces when importing child evals.** Rejected as a default.
That turns valid imported cases into tests detached from their setup.
+- **Allow parent workspace on suite-import wrappers.** Rejected. It creates a
+ misleading merge/override question inside a one-primitive authoring model.
+ Parent evals that need workspace context should import raw cases with
+ `type: tests`; wrapper evals that import suites own runtime policy only.
- **Copy benchmark-specific fields into AgentV.** Rejected. SWE-bench patches,
Harbor task TOML, Margin suite config, promptfoo provider matrices, and
Braintrust hosted experiment fields stay in adapters, fixtures, metadata, or
diff --git a/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md b/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md
index 4a2e89f1f..f9ecab40e 100644
--- a/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md
+++ b/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md
@@ -63,11 +63,14 @@ with a special name.
LangSmith, promptfoo, OpenAI Evals, Inspect, Hugging Face Datasets, and
OpenInference inform adapters and docs, not AgentV-native object models.
- **Make composition explicit.** When a parent eval references child eval files
- with `type: suite`, the current loader uses the parent `experiment:` and does
- not fall back to the child `experiment:`. Child workspace remains task-owned:
- imported suite tests keep their expanded child workspace, while parent
- workspace applies to raw cases owned by the parent file. Any future parent
- workspace override/remap should be explicit and logged.
+ with `type: suite`, the parent owns runtime `experiment:` and does not fall
+ back to the child `experiment:`. Child workspace remains task-owned: imported
+ suite tests keep their expanded child workspace. Parent evals that import
+ suites must not define parent workspace-affecting fields such as `workspace`,
+ `experiment.workspace`, or legacy `execution.workspace`; parent workspace
+ applies only to parent-owned raw cases, including cases imported with
+ `type: tests`. `experiment.workspace` is only a runtime `mode`/`path`
+ override, not a task workspace definition.
### Evidence Summary
@@ -145,11 +148,14 @@ Research ambiguity:
composition, even when the parent has no `experiment:`; there is currently no
child-experiment fallback.
- R13. Child `workspace` setup should remain task-owned. In the current loader,
- imported suite tests keep their child workspace, and parent workspace applies
- only to parent raw cases.
-- R14. Parent workspace override/remap for imported suites should require an
- explicit future syntax and should emit an info log explaining which workspace
- is being used.
+ imported suite tests keep their child workspace. A parent eval that imports
+ any suite with `type: suite` must not define parent workspace-affecting fields
+ such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`.
+ `experiment.workspace` is limited to runtime `mode` and `path`; task
+ workspace setup stays in `workspace`.
+- R14. Parent workspace applies to parent-owned raw cases only, including raw
+ cases imported with `type: tests`. Any future parent workspace
+ override/remap for imported suites should require explicit syntax.
- R15. A tests-only import mode may drop child workspace context, but it must be
explicit because it changes case validity.
- R16. Workspace merge conflicts, path collisions, and incompatible isolation
@@ -178,8 +184,9 @@ Research ambiguity:
SWE-bench `base_commit` at adapter boundaries when needed.
4. **Document composition semantics before implementing new imports.** Parent
evals own runtime `experiment:` without child fallback. Child workspaces are
- preserved for `type: suite`; parent workspace applies to parent-owned raw
- cases. Any future override/remap needs explicit syntax and an info log.
+ preserved for `type: suite`; parent workspace fields are forbidden when
+ importing suites and otherwise apply only to parent-owned raw cases. Any future
+ override/remap needs explicit syntax.
5. **Canonicalize docs toward `experiment:`.** Existing examples that still
teach `execution:` should be audited in a follow-up docs bead if that surface
is still transitional.
@@ -212,8 +219,9 @@ Research ambiguity:
- `repeat` and `runs` both appear in some external or local vocabulary. AgentV
should keep `repeat` canonical unless a compatibility story requires aliases.
- Silently replacing child workspaces during eval composition can create false
- failures or false passes. Composition needs explicit modes, info logs when an
- override/remap is requested, and loud collision handling.
+ failures or false passes. Composition needs the hard parent-workspace rule,
+ explicit future override/remap modes if ever needed, and loud collision
+ handling.
- Translating imported `base_commit` into `workspace.repos[].commit` may surprise
SWE-bench users unless docs show the mapping directly.
- Provenance in free-form metadata can drift across adapters. Docs should
@@ -225,9 +233,8 @@ Research ambiguity:
`experiment:` before the next tag?
- OQ2. Should AgentV eventually support a formal suite-level `metadata` field,
and if so, should it be general-purpose rather than benchmark-specific?
-- OQ3. Should AgentV add an info log for current `type: suite` imports when a
- parent workspace exists, explaining that imported child tests keep child
- workspace while parent workspace applies only to parent raw cases?
+- OQ3. Which examples should be added to make the hard parent-workspace rule
+ obvious before authors hit validation?
- OQ4. What exact composition syntax should distinguish full-suite include from
tests-only import and any future explicit workspace override/remap?
- OQ5. When multiple child evals provide `workspace.repos[]`, should path
@@ -242,10 +249,13 @@ Research ambiguity:
- `docs(schema): canonicalize eval runtime docs and examples` - Audit
`execution:` versus `experiment:`, `runs` versus `repeat`, and AI-facing
eval-builder references.
+- `av-dxp` - Add a regression eval for the wrapper-workspace architecture
+ decision so future agents recommend forbidding parent workspace on
+ suite-import wrappers rather than hand-waving toward warnings or implicit
+ merge.
- `design(schema): eval composition semantics` - Define full-suite include,
- tests-only import, current parent/child workspace ownership, optional info
- logs, future workspace merge/remap, collision errors, and parent `experiment:`
- override behavior.
+ tests-only import, current parent/child workspace ownership, future workspace
+ merge/remap, collision errors, and parent `experiment:` override behavior.
- `docs(evals): benchmark authoring recipes` - Add human and AI docs for
SWE-bench-style, Harbor-backed, Margin-style, promptfoo-style, and
Braintrust/LangSmith-style mappings using existing AgentV primitives.
diff --git a/packages/core/src/evaluation/experiment.ts b/packages/core/src/evaluation/experiment.ts
index b0659c3f9..cb9d62307 100644
--- a/packages/core/src/evaluation/experiment.ts
+++ b/packages/core/src/evaluation/experiment.ts
@@ -33,6 +33,11 @@ export type ExperimentRepeat = {
readonly costLimitUsd?: number;
};
+export type ExperimentWorkspaceConfig = { // Shape of the `workspace` field on ExperimentConfigWire / ExperimentConfig.
+ readonly mode?: 'pooled' | 'temp' | 'static'; // Optional; one of the three literal workspace modes.
+ readonly path?: string; // Optional workspace path.
+};
+
export type ExperimentConfigWire = {
readonly name?: string;
readonly agent?: string;
@@ -48,7 +53,7 @@ export type ExperimentConfigWire = {
readonly threshold?: number;
readonly budget_usd?: number;
readonly sandbox?: ExperimentSandbox;
- readonly workspace?: Record;
+ readonly workspace?: ExperimentWorkspaceConfig;
};
export type ExperimentConfig = {
@@ -66,7 +71,7 @@ export type ExperimentConfig = {
readonly threshold?: number;
readonly budgetUsd?: number;
readonly sandbox?: ExperimentSandbox;
- readonly workspace?: Record;
+ readonly workspace?: ExperimentWorkspaceConfig;
readonly fingerprint?: string;
};
@@ -137,7 +142,7 @@ export function normalizeExperimentConfig(rawConfig: unknown): ExperimentConfig
'budget_usd',
);
const sandbox = readOptionalSandbox(rawConfig.sandbox);
- const workspace = readOptionalRecord(rawConfig.workspace);
+ const workspace = readOptionalWorkspace(rawConfig.workspace);
const configWithoutFingerprint: Omit = {
...(name !== undefined && { name }),
@@ -391,6 +396,36 @@ function readOptionalRecord(raw: unknown): Record | undefined {
return raw;
}
+function readOptionalWorkspace(raw: unknown): ExperimentWorkspaceConfig | undefined { // Validates a raw experiment `workspace` value and narrows it to { mode?, path? }; throws Error on unsupported keys or invalid values.
+ const workspace = readOptionalRecord(raw);
+ if (workspace === undefined) { // Nothing to validate when readOptionalRecord yields undefined.
+ return undefined;
+ }
+
+ for (const key of Object.keys(workspace)) { // Reject every key other than `mode` and `path` before checking values.
+ if (key !== 'mode' && key !== 'path') {
+ throw new Error(
+ `Experiment workspace.${key} is not supported. Experiment workspace supports only mode and path; put task setup in top-level workspace.`,
+ );
+ }
+ }
+
+ const mode = workspace.mode;
+ if (mode !== undefined && mode !== 'pooled' && mode !== 'temp' && mode !== 'static') { // `mode` is optional, but when present must be one of the three literals.
+ throw new Error("Experiment workspace.mode must be 'pooled', 'temp', or 'static'.");
+ }
+
+ const path = workspace.path;
+ if (path !== undefined && (typeof path !== 'string' || path.trim().length === 0)) { // `path` is optional, but when present must be a string that is not empty or whitespace-only.
+ throw new Error('Experiment workspace.path must be a non-empty string.');
+ }
+
+ return { // Only keys that were provided appear in the result.
+ ...(mode !== undefined && { mode }),
+ ...(path !== undefined && { path: path.trim() }), // The returned path has surrounding whitespace removed.
+ };
+}
+
function isRecord(value: unknown): value is Record {
return typeof value === 'object' && value !== null && !Array.isArray(value);
}
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index d7771ab92..3d49090e4 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -87,6 +87,7 @@ import {
type EvalCaseWorkspaceSetup,
WorkspaceSetupError,
captureWorkspaceFileChanges,
+ caseUsesSharedWorkspaceSetup,
hasHookCommand,
hooksEnabled,
prepareEvalCaseWorkspace,
@@ -853,14 +854,34 @@ export async function runEvaluation(
target.providerBatching === true &&
primaryProvider.supportsBatch === true &&
typeof primaryProvider.invokeBatch === 'function';
+ let batchingDisabledByRuntimePolicy = false;
// Disable batch mode when trials > 1 (batch processes all cases at once, incompatible with per-case retries)
if (trials && trials.count > 1 && providerSupportsBatch) {
console.warn('Warning: Batch mode is disabled when trials.count > 1. Using per-case dispatch.');
providerSupportsBatch = false;
+ batchingDisabledByRuntimePolicy = true;
}
- if (target.providerBatching && !providerSupportsBatch && verbose) {
+ const requiresWorkspaceDispatch =
+ workspacePath !== undefined ||
+ legacyWorkspacePath !== undefined ||
+ workspaceMode !== undefined ||
+ filteredEvalCases.some((evalCase) => evalCase.workspace !== undefined);
+ if (providerSupportsBatch && requiresWorkspaceDispatch) {
+ if (verbose) {
+ console.warn('Warning: Batch mode is disabled for workspace-enabled evals.');
+ }
+ providerSupportsBatch = false;
+ batchingDisabledByRuntimePolicy = true;
+ }
+
+ if (
+ target.providerBatching &&
+ !providerSupportsBatch &&
+ verbose &&
+ !batchingDisabledByRuntimePolicy
+ ) {
console.warn(
`Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`,
);
@@ -1180,12 +1201,20 @@ export async function runEvaluation(
});
}
- // Multi-slot pool: each test grabs its own pool slot
- const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : undefined;
- const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath;
- const testBaselineCommit = testPoolSlot
- ? poolSlotBaselines.get(testPoolSlot.path)
- : sharedBaselineCommit;
+ // Multi-slot pool: each shared-workspace test grabs its own pool slot.
+ // Per-case isolated cases and raw/no-workspace cases outside the selected
+ // shared owner prepare without inheriting a child suite's workspace.
+ const usesSharedWorkspace = caseUsesSharedWorkspaceSetup(evalCase, sharedSetup);
+ const testPoolSlot =
+ usesSharedWorkspace && availablePoolSlots.length > 0 ? availablePoolSlots.pop() : undefined;
+ const testWorkspacePath = usesSharedWorkspace
+ ? (testPoolSlot?.path ?? sharedWorkspacePath)
+ : undefined;
+ const testBaselineCommit = usesSharedWorkspace
+ ? testPoolSlot
+ ? poolSlotBaselines.get(testPoolSlot.path)
+ : sharedBaselineCommit
+ : undefined;
try {
const graderProvider = await resolveGraderProvider(target);
@@ -1841,7 +1870,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise {
const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
@@ -101,9 +124,9 @@ export async function aggregateRunDir(
const allResults = parseJsonlResults(content);
const results = deduplicateByTestIdTarget(allResults);
- const plannedTestCount =
- options?.plannedTestCount ??
- (await readPlannedTestCount(path.join(runDir, RUN_SUMMARY_FILENAME)));
+ const previousMetadata = await readRunSummaryMetadata(path.join(runDir, RUN_SUMMARY_FILENAME));
+ const plannedTestCount = options?.plannedTestCount ?? previousMetadata.plannedTestCount;
+ const runtimeSource = options?.runtimeSource ?? previousMetadata.runtimeSource;
const summary = buildRunSummaryArtifact(
results,
@@ -111,6 +134,7 @@ export async function aggregateRunDir(
options?.experiment,
plannedTestCount,
options?.experimentMetadata,
+ runtimeSource,
);
const summaryPath = path.join(runDir, RUN_SUMMARY_FILENAME);
await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
@@ -119,17 +143,58 @@ export async function aggregateRunDir(
return { summaryPath, testCount: results.length, targetCount: targetSet.size };
}
-async function readPlannedTestCount(summaryPath: string): Promise {
+async function readRunSummaryMetadata(summaryPath: string): Promise<{
+ plannedTestCount?: number;
+ runtimeSource?: RunRuntimeSourceMetadata;
+}> {
try {
const raw = await readFile(summaryPath, 'utf8');
- const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
+ const parsed = JSON.parse(raw) as {
+ metadata?: {
+ planned_test_count?: number;
+ runtime_source?: RunRuntimeSourceMetadata;
+ };
+ };
const value = parsed.metadata?.planned_test_count;
- return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
+ const plannedTestCount =
+ typeof value === 'number' && Number.isFinite(value) ? value : undefined;
+ const runtimeSource = isRunRuntimeSourceMetadata(parsed.metadata?.runtime_source)
+ ? parsed.metadata.runtime_source
+ : undefined;
+ return {
+ ...(plannedTestCount !== undefined && { plannedTestCount }),
+ ...(runtimeSource !== undefined && { runtimeSource }),
+ };
} catch {
- return undefined;
+ return {};
}
}
+/**
+ * Runtime type guard for the `runtime_source` metadata read back from a
+ * previously written run summary file.
+ *
+ * The value comes from `JSON.parse`, so nothing about its shape is trusted:
+ * the schema version, every enumerated field, and the `eval_files` array are
+ * each checked explicitly.
+ *
+ * @param value - Untrusted value taken from the parsed summary metadata.
+ * @returns `true` only when every checked field has an accepted value.
+ */
+function isRunRuntimeSourceMetadata(value: unknown): value is RunRuntimeSourceMetadata {
+  if (typeof value !== 'object' || value === null || Array.isArray(value)) {
+    return false;
+  }
+  // Partial<> keeps every field optional so each one has to be verified below.
+  const candidate = value as Partial<RunRuntimeSourceMetadata>;
+  return (
+    candidate.schema_version === 'agentv.runtime_source.v1' &&
+    (candidate.kind === 'direct_suite' ||
+      candidate.kind === 'wrapper_eval' ||
+      candidate.kind === 'multi_eval') &&
+    (candidate.config_source === 'defaults' ||
+      candidate.config_source === 'inline_experiment' ||
+      candidate.config_source === 'cli_flags' ||
+      candidate.config_source === 'mixed') &&
+    typeof candidate.experiment_namespace === 'string' &&
+    (candidate.experiment_namespace_source === 'cli' ||
+      candidate.experiment_namespace_source === 'eval_metadata' ||
+      candidate.experiment_namespace_source === 'eval_filename' ||
+      candidate.experiment_namespace_source === 'multi_eval' ||
+      candidate.experiment_namespace_source === 'unknown') &&
+    Array.isArray(candidate.eval_files) &&
+    candidate.eval_files.every((entry) => typeof entry === 'string')
+  );
+}
+
export interface GradingArtifact {
readonly assertions: readonly {
readonly text: string;
@@ -238,6 +303,7 @@ export interface RunSummaryArtifact {
readonly tests_run: readonly string[];
readonly experiment?: string;
readonly experiment_config?: ExperimentArtifactMetadata;
+ readonly runtime_source?: RunRuntimeSourceMetadata;
readonly planned_test_count?: number;
};
readonly run_summary: Record<
@@ -302,6 +368,7 @@ export interface IndexArtifactEntry {
readonly transcript_raw_path?: string;
readonly metrics_path?: string;
readonly artifact_pointers?: ResultArtifactPointersWire;
+ readonly runtime_source?: RunRuntimeSourceMetadata;
readonly raw_provider_log_path?: string;
readonly input_path?: string;
readonly task_dir?: string;
@@ -1108,6 +1175,7 @@ export function buildRunSummaryArtifact(
experiment?: string,
plannedTestCount?: number,
experimentMetadata?: ExperimentArtifactMetadata,
+ runtimeSource?: RunRuntimeSourceMetadata,
): RunSummaryArtifact {
const targetSet = new Set();
const testIdSet = new Set();
@@ -1199,6 +1267,7 @@ export function buildRunSummaryArtifact(
tests_run: testIds,
experiment,
experiment_config: experimentMetadata,
+ runtime_source: runtimeSource,
planned_test_count: plannedTestCount,
},
run_summary: runSummary,
@@ -1215,6 +1284,7 @@ export async function writeInitialRunSummaryArtifact(
plannedTestCount: number;
experiment?: string;
experimentMetadata?: ExperimentArtifactMetadata;
+ runtimeSource?: RunRuntimeSourceMetadata;
},
): Promise {
await mkdir(runDir, { recursive: true });
@@ -1224,6 +1294,7 @@ export async function writeInitialRunSummaryArtifact(
options.experiment,
options.plannedTestCount,
options.experimentMetadata,
+ options.runtimeSource,
);
const summaryPath = path.join(runDir, RUN_SUMMARY_FILENAME);
await writeFile(summaryPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
@@ -1394,6 +1465,7 @@ export function buildIndexArtifactEntry(
artifactPointers?: ResultArtifactPointersWire;
rawProviderLogPath?: string;
extraIndexFields?: AdditionalResultIndexFields;
+ runtimeSource?: RunRuntimeSourceMetadata;
projectionIdentity?: ProjectionIdentity;
duplicatePolicy?: ExportDuplicatePolicy;
},
@@ -1450,6 +1522,7 @@ export function buildIndexArtifactEntry(
? toRelativeArtifactPath(options.outputDir, options.rawProviderLogPath)
: undefined,
artifact_pointers: options.artifactPointers,
+ runtime_source: options.runtimeSource,
...options.extraIndexFields,
external_trace: toIndexExternalTrace(result, options.projectionIdentity?.dimensions.runId),
projection_identity: options.projectionIdentity
@@ -1467,6 +1540,7 @@ export function buildResultIndexArtifact(
projectionIdentity?: ProjectionIdentity;
duplicatePolicy?: ExportDuplicatePolicy;
artifactPointers?: ResultArtifactPointersWire;
+ runtimeSource?: RunRuntimeSourceMetadata;
},
): ResultIndexArtifact {
const artifactSubdir = buildArtifactSubdir(result);
@@ -1516,6 +1590,7 @@ export function buildResultIndexArtifact(
? path.posix.join(singleRunDir, 'transcript-raw.jsonl')
: undefined,
artifact_pointers: options?.artifactPointers,
+ runtime_source: options?.runtimeSource,
...extraIndexFields,
external_trace: toIndexExternalTrace(result, options?.projectionIdentity?.dimensions.runId),
projection_identity: options?.projectionIdentity
@@ -1926,6 +2001,7 @@ export async function writePerTestArtifacts(
resultGroup?: string;
sourceTests?: readonly EvalTest[];
additionalArtifacts?: AdditionalResultArtifactsWriter;
+ runtimeSource?: RunRuntimeSourceMetadata;
},
): Promise {
await mkdir(outputDir, { recursive: true });
@@ -2010,6 +2086,7 @@ export async function writePerTestArtifacts(
transcriptPath: singleTranscriptPath,
transcriptRawPath: singleTranscriptRawPath,
extraIndexFields,
+ runtimeSource: options?.runtimeSource,
projectionIdentity,
duplicatePolicy,
}),
@@ -2033,6 +2110,7 @@ export async function writeArtifactsFromResults(
resultGroup?: string;
sourceTests?: readonly EvalTest[];
additionalArtifacts?: AdditionalResultArtifactsWriter;
+ runtimeSource?: RunRuntimeSourceMetadata;
},
): Promise<{
testArtifactDir: string;
@@ -2175,6 +2253,7 @@ export async function writeArtifactsFromResults(
transcriptPath: plan.singleTranscriptPath,
transcriptRawPath: plan.singleTranscriptRawPath,
extraIndexFields,
+ runtimeSource: options?.runtimeSource,
projectionIdentity: plan.projectionIdentity,
duplicatePolicy,
}),
@@ -2193,13 +2272,16 @@ export async function writeArtifactsFromResults(
emittedIdentityIds.add(identityId);
}
- const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(summaryPath));
+ const previousMetadata = await readRunSummaryMetadata(summaryPath);
+ const plannedTestCount = options?.plannedTestCount ?? previousMetadata.plannedTestCount;
+ const runtimeSource = options?.runtimeSource ?? previousMetadata.runtimeSource;
const summary = buildRunSummaryArtifact(
results,
options?.evalFile,
options?.experiment,
plannedTestCount,
options?.experimentMetadata,
+ runtimeSource,
);
await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 15a501560..91d2c8188 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -350,8 +350,8 @@ export type WorkspaceConfig = {
/** Template directory or .code-workspace file. Directories are copied to temp workspace.
* .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
readonly template?: string;
- /** Isolation strategy for workspace: shared (default) or per_test */
- readonly isolation?: 'shared' | 'per_test';
+ /** Isolation strategy for workspace: shared (default) or per_case. */
+ readonly isolation?: 'shared' | 'per_case';
/** Repository definitions to clone/checkout into workspace */
readonly repos?: readonly RepoConfig[];
/** Workspace lifecycle hooks */
diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts
index 04e275cc0..4c2db309b 100644
--- a/packages/core/src/evaluation/validation/eval-file.schema.ts
+++ b/packages/core/src/evaluation/validation/eval-file.schema.ts
@@ -312,15 +312,23 @@ const DockerWorkspaceSchema = z.object({
cpus: z.number().min(0.1).optional(),
});
+// Schema for the optional `workspace.env` block: two optional lists of
+// non-empty strings naming required commands and required Python modules.
+// `.strict()` makes any other key a validation failure.
+const WorkspaceEnvSchema = z
+ .object({
+ required_commands: z.array(z.string().min(1)).optional(),
+ required_python_modules: z.array(z.string().min(1)).optional(),
+ })
+ .strict();
+
const WorkspaceSchema = z
.object({
template: z.string().optional(),
- isolation: z.enum(['shared', 'per_test']).optional(),
+ isolation: z.enum(['shared', 'per_case']).optional(),
repos: z.array(RepoSchema).optional(),
hooks: WorkspaceHooksSchema.optional(),
mode: z.enum(['pooled', 'temp', 'static']).optional(),
path: z.string().optional(),
docker: DockerWorkspaceSchema.optional(),
+ env: WorkspaceEnvSchema.optional(),
})
.strict();
@@ -384,6 +392,13 @@ const RunOverrideSchema = z
})
.strict();
+// Schema for the runtime-level workspace override. Only `mode` (one of
+// 'pooled', 'temp', 'static') and a non-empty `path` are accepted;
+// `.strict()` rejects every other key.
+const ExperimentWorkspaceSchema = z
+ .object({
+ mode: z.enum(['pooled', 'temp', 'static']).optional(),
+ path: z.string().min(1).optional(),
+ })
+ .strict();
+
const ExperimentTargetRefSchema = z.union([
z.string().min(1),
z
@@ -407,7 +422,7 @@ const ExperimentRuntimeSchema = ExecutionSchema.extend({
timeout_seconds: z.number().gt(0).optional(),
budget_usd: z.number().gt(0).optional(),
sandbox: z.enum(['auto', 'docker', 'vercel']).optional(),
- workspace: JsonObjectSchema.optional(),
+ workspace: ExperimentWorkspaceSchema.optional(),
setup: z.never().optional(),
}).refine((value) => value.repeat === undefined || value.runs === undefined, {
message: 'Use repeat or runs, not both.',
diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts
index 6a3585c28..1b199230d 100644
--- a/packages/core/src/evaluation/validation/eval-validator.ts
+++ b/packages/core/src/evaluation/validation/eval-validator.ts
@@ -270,11 +270,12 @@ export async function validateEvalFile(filePath: string): Promise {
+ await validateWorkspaceConfig(parsed.workspace, absolutePath, errors, 'workspace');
+ if (isObject(parsed.experiment)) {
+ validateExperimentWorkspaceConfig(
+ parsed.experiment.workspace,
+ absolutePath,
+ errors,
+ 'experiment.workspace',
+ );
+ }
+ if (isObject(parsed.execution)) {
+ validateExperimentWorkspaceConfig(
+ parsed.execution.workspace,
+ absolutePath,
+ errors,
+ 'execution.workspace',
+ );
+ }
+}
+
+/**
+ * Validates a runtime workspace override block, which may only carry `mode`
+ * and `path`.
+ *
+ * @param workspace - Raw parsed value; `undefined` means the block is absent and is accepted.
+ * @param filePath - Eval file reported on every diagnostic.
+ * @param errors - Collector that diagnostics are appended to.
+ * @param location - Dotted path of the block, used as the prefix of locations and messages.
+ */
+function validateExperimentWorkspaceConfig(
+  workspace: JsonValue | undefined,
+  filePath: string,
+  errors: ValidationError[],
+  location: string,
+): void {
+  if (workspace === undefined) {
+    return;
+  }
+
+  // Every diagnostic raised here is an error against the same file.
+  const reportError = (at: string, message: string): void => {
+    errors.push({ severity: 'error', filePath, location: at, message });
+  };
+
+  if (!isObject(workspace)) {
+    reportError(location, `${location} must be an object with mode and/or path.`);
+    return;
+  }
+
+  // Report each key other than mode/path, in declaration order.
+  const unsupportedKeys = Object.keys(workspace).filter(
+    (key) => key !== 'mode' && key !== 'path',
+  );
+  for (const key of unsupportedKeys) {
+    reportError(
+      `${location}.${key}`,
+      `${location} supports only mode and path. Put task workspace setup in top-level workspace.`,
+    );
+  }
+
+  const mode = workspace.mode;
+  const modeIsValid =
+    mode === undefined || mode === 'pooled' || mode === 'temp' || mode === 'static';
+  if (!modeIsValid) {
+    reportError(`${location}.mode`, `${location}.mode must be 'pooled', 'temp', or 'static'.`);
+  }
+
+  const workspacePath = workspace.path;
+  const pathIsValid =
+    workspacePath === undefined ||
+    (typeof workspacePath === 'string' && workspacePath.trim().length > 0);
+  if (!pathIsValid) {
+    reportError(`${location}.path`, `${location}.path must be a non-empty string.`);
+  }
+}
+
+/**
+ * Emits diagnostics about how an eval composes other files through
+ * `tests[]` include entries.
+ *
+ * Three situations are reported:
+ * - error: the parent defines a workspace (top-level, `experiment.workspace`
+ *   or `execution.workspace`) while importing at least one `type: suite` entry;
+ * - warning: a suite imported with `type: suite` defines its own `experiment`
+ *   or `execution` block;
+ * - warning: a `type: tests` import resolves to a `*.eval.yaml` / `*.eval.yml`
+ *   file.
+ *
+ * @param filePath - Path of the parent eval file; include paths are resolved
+ *   against its directory.
+ * @param parsed - Parsed parent eval document.
+ * @param errors - Collector that diagnostics are appended to.
+ */
+async function validateCompositionDiagnostics(
+  filePath: string,
+  parsed: JsonObject,
+  errors: ValidationError[],
+): Promise<void> {
+  const tests = parsed.tests;
+  if (!Array.isArray(tests)) {
+    return;
+  }
+
+  const parentHasRuntime = parsed.experiment !== undefined || parsed.execution !== undefined;
+  const hasSuiteImport = tests.some(
+    (entry) => isObject(entry) && isIncludeEntry(entry) && entry.type === 'suite',
+  );
+
+  if (hasSuiteImport) {
+    for (const location of parentWorkspaceLocations(parsed)) {
+      errors.push({
+        severity: 'error',
+        filePath,
+        location,
+        message:
+          'Parent workspace is not allowed when an eval imports suites with type: suite. A wrapper eval owns runtime policy, while imported suites own task environment. Move workspace into the child suite, or import raw cases with type: tests when you intentionally want parent workspace context.',
+      });
+    }
+  }
+
+  for (let i = 0; i < tests.length; i++) {
+    const entry = tests[i];
+    if (!isObject(entry) || !isIncludeEntry(entry)) {
+      continue;
+    }
+
+    const includePath = entry.include.trim();
+    const location = `tests[${i}].include`;
+    const resolvedSuites = await resolveSuiteIncludePaths(includePath, path.dirname(filePath));
+
+    if (entry.type === 'suite') {
+      for (const resolvedSuite of resolvedSuites) {
+        // A child that cannot be read or parsed yields no diagnostic here.
+        const childParsed = await readImportedSuite(resolvedSuite.filePath);
+        if (!childParsed) {
+          continue;
+        }
+        // `experiment` takes precedence over `execution` when naming the block.
+        const runtimeField =
+          childParsed.experiment !== undefined
+            ? 'experiment'
+            : childParsed.execution !== undefined
+              ? 'legacy execution'
+              : undefined;
+        if (!runtimeField) {
+          continue;
+        }
+
+        errors.push({
+          severity: 'warning',
+          filePath,
+          location,
+          message: parentHasRuntime
+            ? `Imported suite '${resolvedSuite.displayPath}' defines ${runtimeField}, but child experiment blocks are ignored for type: suite imports. The parent experiment owns wrapper runtime; move runtime settings to the parent experiment or use tests[].run for per-case thresholds, repeats, timeouts, and budgets.`
+            : `Imported suite '${resolvedSuite.displayPath}' defines ${runtimeField}, but child experiment blocks are ignored for type: suite imports. The parent experiment owns wrapper runtime, and this parent has no experiment, so no child runtime settings are applied. Add a parent experiment or use tests[].run for per-case thresholds, repeats, timeouts, and budgets.`,
+        });
+      }
+      continue;
+    }
+
+    if (entry.type === 'tests') {
+      for (const resolvedSuite of resolvedSuites) {
+        // Only files named like eval suites trigger the dropped-context warning.
+        if (!/\.eval\.ya?ml$/i.test(resolvedSuite.filePath)) {
+          continue;
+        }
+        errors.push({
+          severity: 'warning',
+          filePath,
+          location,
+          message: `type: tests imports raw cases from eval suite '${resolvedSuite.displayPath}' and drops suite context, including child workspace, input, assertions, metadata, and experiment. Parent suite context applies. Use type: suite to preserve child test and workspace semantics.`,
+        });
+      }
+    }
+  }
+}
+
+/**
+ * Lists the dotted locations at which the given eval document defines a
+ * workspace: top-level `workspace`, `experiment.workspace`, and
+ * `execution.workspace`, in that order.
+ *
+ * @param parsed - Parsed eval document.
+ * @returns The locations that are present; empty when none is defined.
+ */
+function parentWorkspaceLocations(parsed: JsonObject): readonly string[] {
+  const { experiment, execution } = parsed;
+  const presence: ReadonlyArray<readonly [string, boolean]> = [
+    ['workspace', parsed.workspace !== undefined],
+    ['experiment.workspace', isObject(experiment) && experiment.workspace !== undefined],
+    ['execution.workspace', isObject(execution) && execution.workspace !== undefined],
+  ];
+  return presence.filter(([, isDefined]) => isDefined).map(([location]) => location);
+}
+
+/**
+ * Reads an imported suite file, parses it as YAML and interpolates values
+ * from `process.env`.
+ *
+ * This is best-effort: any read or parse failure is swallowed on purpose and
+ * reported as `undefined`, so the caller simply skips that suite.
+ *
+ * @param filePath - Path of the imported suite file.
+ * @returns The parsed document when it is an object, otherwise `undefined`.
+ */
+async function readImportedSuite(filePath: string): Promise<JsonObject | undefined> {
+  try {
+    const parsed = interpolateEnv(parseYamlValue(await readFile(filePath, 'utf8')), process.env);
+    return isObject(parsed) ? parsed : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
function validateIncludeEntry(
entry: JsonObject,
location: string,
@@ -690,7 +875,7 @@ async function validateWorkspaceConfig(
}
if (isObject(workspace)) {
- validateWorkspaceRepoConfig(workspace, evalFilePath, errors);
+ validateWorkspaceRepoConfig(workspace, evalFilePath, errors, location);
return;
}
@@ -713,7 +898,7 @@ async function validateWorkspaceConfig(
return;
}
- validateWorkspaceRepoConfig(parsedWorkspace, workspacePath, errors);
+ validateWorkspaceRepoConfig(parsedWorkspace, workspacePath, errors, location);
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
errors.push({
@@ -729,6 +914,7 @@ function validateWorkspaceRepoConfig(
workspace: JsonObject,
filePath: string,
errors: ValidationError[],
+ location: string,
): void {
const repos = workspace.repos;
const hooks = workspace.hooks;
@@ -737,6 +923,15 @@ function validateWorkspaceRepoConfig(
const docker = workspace.docker;
+ if (isolation !== undefined && isolation !== 'shared' && isolation !== 'per_case') {
+ errors.push({
+ severity: 'error',
+ filePath,
+ location: `${location}.isolation`,
+ message: "workspace.isolation must be 'shared' or 'per_case'.",
+ });
+ }
+
if (Array.isArray(repos)) {
for (const repo of repos) {
if (!isObject(repo)) continue;
@@ -745,7 +940,7 @@ function validateWorkspaceRepoConfig(
errors.push({
severity: 'error',
filePath,
- location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
+ location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message: 'workspace.repos[].source has been removed. Use workspace.repos[].repo.',
});
}
@@ -754,7 +949,7 @@ function validateWorkspaceRepoConfig(
errors.push({
severity: 'error',
filePath,
- location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
+ location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message:
'workspace.repos[].checkout has been removed. Use top-level commit, base_commit, and ancestor.',
});
@@ -764,7 +959,7 @@ function validateWorkspaceRepoConfig(
errors.push({
severity: 'error',
filePath,
- location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
+ location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message: 'workspace.repos[].clone has been removed. Use top-level sparse if needed.',
});
}
@@ -773,7 +968,7 @@ function validateWorkspaceRepoConfig(
errors.push({
severity: 'error',
filePath,
- location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
+ location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message:
'repos[].repo is required for non-Docker workspaces. ' +
'Repo-less entries are only valid when workspace.docker is configured.',
@@ -788,21 +983,21 @@ function validateWorkspaceRepoConfig(
errors.push({
severity: 'error',
filePath,
- location: `workspace.repos[path=${repo.path ?? '(none)'}]`,
+ location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message: 'repos[].commit and repos[].base_commit must match when both are set.',
});
}
}
}
- // after_each reset with per_test isolation warning
- if (isObject(afterEachHook) && afterEachHook.reset && isolation === 'per_test') {
+ // after_each reset with per-case isolation warning
+ if (isObject(afterEachHook) && afterEachHook.reset && isolation === 'per_case') {
errors.push({
severity: 'warning',
filePath,
- location: 'workspace.hooks.after_each',
+ location: `${location}.hooks.after_each`,
message:
- 'hooks.after_each.reset is redundant with isolation: per_test (each test gets a fresh workspace).',
+ 'hooks.after_each.reset is redundant with isolation: per_case (each test gets a fresh workspace).',
});
}
}
diff --git a/packages/core/src/evaluation/workspace/setup.ts b/packages/core/src/evaluation/workspace/setup.ts
index db5251b08..38c630d2c 100644
--- a/packages/core/src/evaluation/workspace/setup.ts
+++ b/packages/core/src/evaluation/workspace/setup.ts
@@ -105,6 +105,8 @@ export interface SharedWorkspaceSetupOptions {
export interface SharedWorkspaceSetup {
readonly suiteWorkspace?: WorkspaceConfig;
+ readonly sharedWorkspaceOwnerKey?: string;
+ readonly sharedWorkspaceAppliesToAllCases: boolean;
readonly sharedWorkspacePath?: string;
readonly sharedBaselineCommit?: string;
readonly suiteWorkspaceFile?: string;
@@ -176,6 +178,130 @@ export function hooksEnabled(
return workspace?.hooks?.enabled !== false;
}
+/**
+ * Tells whether a workspace config asks for `per_case` isolation.
+ *
+ * @param workspace - Workspace config, or `undefined` when none is set.
+ * @returns `true` only when `isolation` is exactly `'per_case'`.
+ */
+export function isPerCaseIsolation(
+  workspace: { readonly isolation?: WorkspaceConfig['isolation'] } | undefined,
+): boolean {
+  const strategy = workspace?.isolation;
+  return strategy === 'per_case';
+}
+
+/**
+ * Decides whether a single eval case should run inside the shared workspace
+ * prepared for the run.
+ *
+ * Rules, in order:
+ * 1. A case with `per_case` isolation never uses the shared workspace.
+ * 2. When the setup says the shared workspace applies to all cases, every
+ *    remaining case uses it.
+ * 3. Otherwise the case must need shared setup itself and have the same
+ *    owner key as the one the setup selected.
+ *
+ * @param evalCase - Case being dispatched.
+ * @param setup - The two fields of the shared setup result this decision reads.
+ * @returns `true` when the case should use the shared workspace.
+ */
+export function caseUsesSharedWorkspaceSetup(
+  evalCase: EvalTest,
+  setup: Pick<SharedWorkspaceSetup, 'sharedWorkspaceAppliesToAllCases' | 'sharedWorkspaceOwnerKey'>,
+): boolean {
+  if (isPerCaseIsolation(evalCase.workspace)) {
+    return false;
+  }
+  if (setup.sharedWorkspaceAppliesToAllCases) {
+    return true;
+  }
+  return (
+    setup.sharedWorkspaceOwnerKey !== undefined &&
+    workspaceNeedsSharedSetup(evalCase.workspace) &&
+    sharedWorkspaceOwnerKey(evalCase) === setup.sharedWorkspaceOwnerKey
+  );
+}
+
+/**
+ * Tells whether a workspace config carries anything that needs a shared
+ * workspace to be prepared.
+ *
+ * @param workspace - Workspace config of a case, if any.
+ * @returns `false` for a missing or `per_case` workspace; otherwise `true`
+ *   when any of path, static mode, template, hooks, a non-empty repos list,
+ *   docker, or env is set.
+ */
+function workspaceNeedsSharedSetup(
+  workspace: WorkspaceConfig | undefined,
+): workspace is WorkspaceConfig {
+  if (!workspace || isPerCaseIsolation(workspace)) {
+    return false;
+  }
+  // Any single truthy entry is enough to require shared setup.
+  const setupSignals = [
+    workspace.path,
+    workspace.mode === 'static',
+    workspace.template,
+    workspace.hooks,
+    workspace.repos?.length,
+    workspace.docker,
+    workspace.env,
+  ];
+  return setupSignals.some(Boolean);
+}
+
+/**
+ * Serializes a value to a string that does not depend on object key
+ * insertion order, so structurally equal configs produce equal strings.
+ *
+ * - `undefined` becomes the literal text `undefined`.
+ * - `null` and non-object values go through `JSON.stringify`.
+ * - Arrays keep their element order.
+ * - Objects drop `undefined` properties and sort the remaining keys.
+ *
+ * Keys are ordered by plain code-unit comparison rather than
+ * `localeCompare`, so the ordering is a strict total order that does not
+ * vary with the runtime locale.
+ *
+ * @param value - Value to serialize.
+ * @returns The order-independent string form of `value`.
+ */
+function stableWorkspaceValue(value: unknown): string {
+  if (value === undefined) {
+    return 'undefined';
+  }
+  if (value === null || typeof value !== 'object') {
+    return JSON.stringify(value);
+  }
+  if (Array.isArray(value)) {
+    return `[${value.map((item) => stableWorkspaceValue(item)).join(',')}]`;
+  }
+  const entries = Object.entries(value as Record<string, unknown>)
+    .filter(([, entryValue]) => entryValue !== undefined)
+    .sort(([left], [right]) => (left < right ? -1 : left > right ? 1 : 0));
+  return `{${entries
+    .map(([key, entryValue]) => `${JSON.stringify(key)}:${stableWorkspaceValue(entryValue)}`)
+    .join(',')}}`;
+}
+
+/**
+ * Builds the human-readable owner label used when reporting conflicting
+ * shared workspaces.
+ *
+ * @param evalCase - Case whose `source` identifies where it came from.
+ * @returns A label for an imported suite, for parent-owned cases, or
+ *   `'programmatic cases'` when the case has no source file.
+ */
+function describeWorkspaceOwner(evalCase: EvalTest): string {
+  const origin = evalCase.source;
+  const suiteName = origin?.importedSuiteName;
+  const evalFile = origin?.evalFileAbsolutePath;
+  if (suiteName) {
+    return `imported suite "${suiteName}" (${evalFile})`;
+  }
+  return evalFile ? `parent-owned cases (${evalFile})` : 'programmatic cases';
+}
+
+/**
+ * Computes the key that identifies which owner a case's shared workspace
+ * belongs to: a prefix naming the case's origin followed by the serialized
+ * workspace config.
+ *
+ * @param evalCase - Case to compute the key for.
+ * @returns The owner key; cases share a key only when both origin and
+ *   serialized workspace match.
+ */
+function sharedWorkspaceOwnerKey(evalCase: EvalTest): string {
+  const origin = evalCase.source;
+  let ownerPrefix = 'programmatic';
+  if (origin?.importedSuiteName) {
+    ownerPrefix = `imported:${origin.evalFileAbsolutePath}:${origin.importedSuiteName}`;
+  } else if (origin?.evalFileAbsolutePath) {
+    ownerPrefix = `parent:${origin.evalFileAbsolutePath}`;
+  }
+  return `${ownerPrefix}:${stableWorkspaceValue(evalCase.workspace)}`;
+}
+
+/** The single shared workspace chosen for a run, paired with its owner key. */
+interface SelectedSharedWorkspace {
+ /** Owner key of the cases this workspace was taken from. */
+ readonly key: string;
+ /** Workspace config to prepare as the shared workspace. */
+ readonly workspace: WorkspaceConfig;
+}
+
+/**
+ * Picks the one workspace that is prepared as the shared workspace.
+ *
+ * Cases that need shared setup are grouped by owner key. No group yields
+ * `undefined`, exactly one group yields that group's key and workspace, and
+ * more than one group is rejected.
+ *
+ * @param evalCases - Cases taking part in the run.
+ * @returns The selected workspace with its owner key, or `undefined` when no
+ *   case needs shared setup.
+ * @throws WorkspaceSetupError with reason code `ambiguous_shared_workspace`
+ *   when the cases have more than one shared workspace owner.
+ */
+function selectSuiteWorkspace(evalCases: readonly EvalTest[]): SelectedSharedWorkspace | undefined {
+  const ownersByKey = new Map<
+    string,
+    { readonly workspace: WorkspaceConfig; readonly owner: string; readonly testIds: string[] }
+  >();
+
+  for (const evalCase of evalCases) {
+    if (workspaceNeedsSharedSetup(evalCase.workspace)) {
+      const ownerKey = sharedWorkspaceOwnerKey(evalCase);
+      const known = ownersByKey.get(ownerKey);
+      if (known === undefined) {
+        // The first case seen for an owner supplies its workspace and label.
+        ownersByKey.set(ownerKey, {
+          workspace: evalCase.workspace,
+          owner: describeWorkspaceOwner(evalCase),
+          testIds: [evalCase.id],
+        });
+      } else {
+        known.testIds.push(evalCase.id);
+      }
+    }
+  }
+
+  const [firstOwner] = [...ownersByKey.entries()];
+  if (firstOwner === undefined) {
+    return undefined;
+  }
+  if (ownersByKey.size === 1) {
+    const [key, { workspace }] = firstOwner;
+    return { key, workspace };
+  }
+
+  const ownerSummary = [...ownersByKey.values()]
+    .map(({ owner, testIds }) => `${owner} for tests ${testIds.join(', ')}`)
+    .join('; ');
+  throw new WorkspaceSetupError(
+    `Wrapper eval contains multiple shared workspace owners: ${ownerSummary}. AgentV does not merge parent and child workspaces or run separate imported-suite shared workspaces in one wrapper execution. Use isolation: per_case for imported suites, split them into separate runs, or keep only one shared workspace owner.`,
+    {
+      failureStage: 'setup',
+      failureReasonCode: 'ambiguous_shared_workspace',
+    },
+  );
+}
+
function workspaceGitEnv(): Record {
const env = { ...process.env };
for (const key of Object.keys(env)) {
@@ -279,7 +405,8 @@ export async function prepareSharedWorkspaceSetup(
workspaceMode,
workspaceClean,
} = options;
- const suiteWorkspace = evalCases[0]?.workspace;
+ const selectedSuiteWorkspace = selectSuiteWorkspace(evalCases);
+ const suiteWorkspace = selectedSuiteWorkspace?.workspace;
const rawTemplate = suiteWorkspace?.template;
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
const workspaceTemplate = resolvedTemplate?.dir;
@@ -290,9 +417,10 @@ export async function prepareSharedWorkspaceSetup(
}
};
- const isPerTestIsolation = suiteWorkspace?.isolation === 'per_test';
+ const isPerCaseWorkspace = isPerCaseIsolation(suiteWorkspace);
const cliWorkspacePath = workspacePath ?? legacyWorkspacePath;
+ const sharedWorkspaceAppliesToAllCases = !!cliWorkspacePath;
const yamlWorkspacePath = suiteWorkspace?.path;
if (cliWorkspacePath && workspaceMode && workspaceMode !== 'static') {
throw new Error('--workspace-path requires --workspace-mode static when both are provided');
@@ -313,9 +441,9 @@ export async function prepareSharedWorkspaceSetup(
const useStaticWorkspace = configuredMode === 'static';
- if (useStaticWorkspace && isPerTestIsolation) {
+ if (useStaticWorkspace && evalCases.some((evalCase) => isPerCaseIsolation(evalCase.workspace))) {
throw new Error(
- 'static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default).',
+ 'static workspace mode is incompatible with isolation: per_case. Use isolation: shared (default).',
);
}
if (configuredMode !== 'static' && configuredStaticPath) {
@@ -324,7 +452,7 @@ export async function prepareSharedWorkspaceSetup(
const hasSharedWorkspace = !!(
useStaticWorkspace ||
- (!isPerTestIsolation &&
+ (!isPerCaseWorkspace &&
(workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length))
);
@@ -332,11 +460,11 @@ export async function prepareSharedWorkspaceSetup(
const usePool =
poolEnabled !== false &&
!!suiteWorkspace?.repos?.length &&
- !isPerTestIsolation &&
+ !isPerCaseWorkspace &&
!useStaticWorkspace;
setupLog(
- `sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} workers=${workers}`,
+ `sharedWorkspace=${hasSharedWorkspace} perCaseIsolation=${isPerCaseWorkspace} usePool=${usePool} workers=${workers}`,
);
if (hasSharedWorkspace && !usePool && workers > 1 && evalCases.length > 1) {
console.warn(
@@ -391,7 +519,7 @@ export async function prepareSharedWorkspaceSetup(
setupLog(`reusing existing static workspace: ${configuredStaticPath}`);
}
sharedWorkspacePath = configuredStaticPath;
- } else if (!isPerTestIsolation && usePool && suiteWorkspace?.repos) {
+ } else if (!isPerCaseWorkspace && usePool && suiteWorkspace?.repos) {
const slotsNeeded = workers;
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
@@ -421,7 +549,7 @@ export async function prepareSharedWorkspaceSetup(
} else {
availablePoolSlots.push(...poolSlots);
}
- } else if (!isPerTestIsolation && workspaceTemplate) {
+ } else if (!isPerCaseWorkspace && workspaceTemplate) {
setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
try {
sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, 'shared');
@@ -435,7 +563,7 @@ export async function prepareSharedWorkspaceSetup(
cause: error,
});
}
- } else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
+ } else if (!isPerCaseWorkspace && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
sharedWorkspacePath = getWorkspacePath(evalRunId, 'shared');
await mkdir(sharedWorkspacePath, { recursive: true });
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
@@ -452,7 +580,7 @@ export async function prepareSharedWorkspaceSetup(
}
const hasReposToMaterialize =
- !!suiteWorkspace?.repos?.length && !usePool && !isPerTestIsolation;
+ !!suiteWorkspace?.repos?.length && !usePool && !isPerCaseWorkspace;
const needsRepoMaterialisation =
hasReposToMaterialize && (!useStaticWorkspace || staticMaterialised);
const needsPerRepoCheck =
@@ -504,16 +632,7 @@ export async function prepareSharedWorkspaceSetup(
const suiteDockerConfig = suiteWorkspace?.docker;
if (suiteDockerConfig) {
- setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
- const { DockerWorkspaceProvider } = await import('./docker-workspace.js');
- const dockerSetup = new DockerWorkspaceProvider(suiteDockerConfig);
- if (!(await dockerSetup.isDockerAvailable())) {
- throw new Error(
- 'Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running.',
- );
- }
- await dockerSetup.pullImage();
- setupLog('Docker image pull complete');
+ await prepareDockerWorkspace(suiteDockerConfig, setupLog);
}
if (suiteWorkspace?.env) {
@@ -776,6 +895,10 @@ export async function prepareSharedWorkspaceSetup(
return {
...(suiteWorkspace !== undefined && { suiteWorkspace }),
+ ...(selectedSuiteWorkspace?.key !== undefined && {
+ sharedWorkspaceOwnerKey: selectedSuiteWorkspace.key,
+ }),
+ sharedWorkspaceAppliesToAllCases,
...(sharedWorkspacePath !== undefined && { sharedWorkspacePath }),
...(sharedBaselineCommit !== undefined && { sharedBaselineCommit }),
...(suiteWorkspaceFile !== undefined && { suiteWorkspaceFile }),
@@ -812,10 +935,13 @@ export async function prepareEvalCaseWorkspace(
setupDebug,
} = options;
- let workspacePath: string | undefined = sharedWorkspacePath;
+ let workspacePath: string | undefined = isPerCaseIsolation(evalCase.workspace)
+ ? undefined
+ : sharedWorkspacePath;
+ const inheritedSuiteWorkspaceFile = workspacePath ? suiteWorkspaceFile : undefined;
let beforeAllOutput: string | undefined;
let beforeEachOutput: string | undefined;
- const isSharedWorkspace = !!sharedWorkspacePath;
+ const isSharedWorkspace = !!workspacePath;
let caseWorkspaceFile: string | undefined;
const caseHooksEnabled = hooksEnabled(evalCase.workspace);
const hookExecutions: WorkspaceSetupHookExecution[] = [];
@@ -863,12 +989,12 @@ export async function prepareEvalCaseWorkspace(
try {
if (setupDebug) {
console.log(
- `[setup] test=${evalCase.id} materializing ${evalCase.workspace.repos.length} per-test repo(s) into ${workspacePath}`,
+ `[setup] test=${evalCase.id} materializing ${evalCase.workspace.repos.length} per-case repo(s) into ${workspacePath}`,
);
}
await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
if (setupDebug) {
- console.log(`[setup] test=${evalCase.id} per-test repo materialization complete`);
+ console.log(`[setup] test=${evalCase.id} per-case repo materialization complete`);
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
@@ -907,6 +1033,50 @@ export async function prepareEvalCaseWorkspace(
}
}
+ const caseDockerConfig = evalCase.workspace?.docker;
+ if (caseDockerConfig) {
+ try {
+ await prepareDockerWorkspace(caseDockerConfig, (message) => {
+ if (setupDebug) {
+ console.log(`[setup] test=${evalCase.id} ${message}`);
+ }
+ });
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ if (forceCleanup && workspacePath) {
+ await cleanupWorkspace(workspacePath).catch(() => {});
+ }
+ throw new WorkspaceSetupError(message, {
+ failureStage: 'setup',
+ failureReasonCode: 'docker_setup_error',
+ hookExecutions,
+ cause: error,
+ });
+ }
+ }
+
+ const caseEnvConfig = evalCase.workspace?.env;
+ if (caseEnvConfig) {
+ try {
+ await runPreflightChecks(caseEnvConfig, workspacePath ?? evalDir, (message) => {
+ if (setupDebug) {
+ console.log(`[setup] test=${evalCase.id} ${message}`);
+ }
+ });
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ if (forceCleanup && workspacePath) {
+ await cleanupWorkspace(workspacePath).catch(() => {});
+ }
+ throw new WorkspaceSetupError(message, {
+ failureStage: 'setup',
+ failureReasonCode: 'preflight_error',
+ hookExecutions,
+ cause: error,
+ });
+ }
+ }
+
const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeAllHook)) {
const beforeAllHook = caseBeforeAllHook;
@@ -1119,7 +1289,7 @@ export async function prepareEvalCaseWorkspace(
return {
...(workspacePath !== undefined && { workspacePath }),
- caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
+ caseWorkspaceFile: caseWorkspaceFile ?? inheritedSuiteWorkspaceFile,
...(beforeAllOutput !== undefined && { beforeAllOutput }),
...(beforeEachOutput !== undefined && { beforeEachOutput }),
...(baselineCommit !== undefined && { baselineCommit }),
@@ -1168,4 +1338,23 @@ async function runPreflightChecks(
}
}
+/** Pulls the configured Docker image (no-op when `dockerConfig` is unset); throws if the Docker CLI is unavailable. */
+async function prepareDockerWorkspace(
+  dockerConfig: WorkspaceConfig['docker'],
+  log: (msg: string) => void,
+): Promise<void> {
+  if (!dockerConfig) return;
+  log(`pulling Docker image: ${dockerConfig.image}`);
+  // Loaded lazily so the Docker provider module is only imported once a Docker config is present.
+  const { DockerWorkspaceProvider } = await import('./docker-workspace.js');
+  const dockerSetup = new DockerWorkspaceProvider(dockerConfig);
+  if (!(await dockerSetup.isDockerAvailable())) {
+    throw new Error(
+      'Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running.',
+    );
+  }
+  await dockerSetup.pullImage();
+  log('Docker image pull complete');
+}
+
export { captureWorkspaceFileChanges };
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index c795b9ec9..bb912ad86 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -104,8 +104,6 @@ type LoadOptions = {
readonly category?: string;
/** Internal DFS stack for detecting circular `type: suite` imports. */
readonly suiteImportStack?: readonly SuiteImportStackEntry[];
- /** Internal runtime defaults supplied by an eval that imports this suite. */
- readonly importParentExperimentConfig?: ExperimentConfig;
};
type SuiteImportStackEntry = {
@@ -472,10 +470,6 @@ async function loadTestsFromParsedYamlValue(
const rawTestCases = resolveTests(suite);
const suiteExperimentConfig = normalizeSuiteExperimentConfig(suite);
- const importContextExperimentConfig = mergeExperimentParentDefaults(
- options?.importParentExperimentConfig,
- suiteExperimentConfig,
- );
// Top-level `metadata:` is inherited by cases. Suite identity tags are parsed
// separately by parseMetadata() and are not case tags.
const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
@@ -502,7 +496,7 @@ async function loadTestsFromParsedYamlValue(
evalFileDir,
repoRoot,
suiteMetadataPayload,
- parentExperimentConfig: importContextExperimentConfig,
+ parentWorkspaceLocation: parentWorkspaceLocation(suite),
options,
});
expandedTestCases = expanded.rawCases;
@@ -899,12 +893,11 @@ function mergeRunOverrides(
};
}
-function applyRunDefaultsToImportedTest(
+function applyRunOverrideToImportedTest(
test: EvalTest,
- childExperimentRun: EvalRunOverride | undefined,
includeRun: EvalRunOverride | undefined,
): EvalTest {
- const run = mergeRunOverrides(mergeRunOverrides(childExperimentRun, includeRun), test.run);
+ const run = mergeRunOverrides(includeRun, test.run);
if (!run) {
return test;
}
@@ -914,157 +907,6 @@ function applyRunDefaultsToImportedTest(
};
}
-function experimentProvidesTarget(config: ExperimentConfig | undefined): boolean {
- return config?.target !== undefined || config?.targets !== undefined;
-}
-
-function experimentProvidesRepeat(config: ExperimentConfig | undefined): boolean {
- return config?.repeat !== undefined || config?.runs !== undefined;
-}
-
-function mergeExperimentParentDefaults(
- parent: ExperimentConfig | undefined,
- child: ExperimentConfig | undefined,
-): ExperimentConfig | undefined {
- if (!parent) {
- return child;
- }
- if (!child) {
- return parent;
- }
- return {
- ...child,
- ...parent,
- ...(experimentProvidesRepeat(parent)
- ? {
- ...(parent.repeat !== undefined && { repeat: parent.repeat }),
- ...(parent.runs !== undefined && { runs: parent.runs }),
- }
- : {
- ...(child.repeat !== undefined && { repeat: child.repeat }),
- ...(child.runs !== undefined && { runs: child.runs }),
- }),
- };
-}
-
-function buildExperimentRunDefaults(
- config: ExperimentConfig | undefined,
-): EvalRunOverride | undefined {
- if (!config) {
- return undefined;
- }
- const repeat = config.repeat
- ? {
- count: config.repeat.count,
- strategy: config.repeat.strategy,
- ...(config.repeat.costLimitUsd !== undefined && {
- costLimitUsd: config.repeat.costLimitUsd,
- }),
- ...(config.earlyExit !== undefined && { earlyExit: config.earlyExit }),
- }
- : config.runs !== undefined
- ? {
- count: config.runs,
- strategy: 'pass_at_k' as const,
- ...(config.earlyExit !== undefined && { earlyExit: config.earlyExit }),
- }
- : undefined;
- const run = {
- ...(config.threshold !== undefined && { threshold: config.threshold }),
- ...(repeat !== undefined && { repeat }),
- ...(config.timeoutSeconds !== undefined && { timeoutSeconds: config.timeoutSeconds }),
- ...(config.budgetUsd !== undefined && { budgetUsd: config.budgetUsd }),
- } satisfies EvalRunOverride;
- return Object.keys(run).length > 0 ? run : undefined;
-}
-
-function buildImportedExperimentRunDefaults(
- child: ExperimentConfig | undefined,
- parent: ExperimentConfig | undefined,
-): EvalRunOverride | undefined {
- const childRun = buildExperimentRunDefaults(child);
- if (!childRun) {
- return undefined;
- }
- const run = {
- ...(parent?.threshold === undefined &&
- childRun.threshold !== undefined && { threshold: childRun.threshold }),
- ...(!experimentProvidesRepeat(parent) && childRun.repeat !== undefined
- ? { repeat: childRun.repeat }
- : {}),
- ...(parent?.timeoutSeconds === undefined &&
- childRun.timeoutSeconds !== undefined && { timeoutSeconds: childRun.timeoutSeconds }),
- ...(parent?.budgetUsd === undefined &&
- childRun.budgetUsd !== undefined && { budgetUsd: childRun.budgetUsd }),
- } satisfies EvalRunOverride;
- return Object.keys(run).length > 0 ? run : undefined;
-}
-
-type ImportedExperimentFieldRule = {
- readonly field: string;
- readonly childHasField: (config: ExperimentConfig) => boolean;
- readonly parentHasOverride: (config: ExperimentConfig | undefined) => boolean;
-};
-
-const UNSCOPED_IMPORTED_EXPERIMENT_FIELDS: readonly ImportedExperimentFieldRule[] = [
- {
- field: 'target',
- childHasField: (config) => experimentProvidesTarget(config),
- parentHasOverride: experimentProvidesTarget,
- },
- {
- field: 'agent',
- childHasField: (config) => config.agent !== undefined,
- parentHasOverride: (config) => config?.agent !== undefined,
- },
- {
- field: 'model',
- childHasField: (config) => config.model !== undefined,
- parentHasOverride: (config) => config?.model !== undefined,
- },
- {
- field: 'agent_options',
- childHasField: (config) => config.agentOptions !== undefined,
- parentHasOverride: (config) => config?.agentOptions !== undefined,
- },
- {
- field: 'workers',
- childHasField: (config) => config.workers !== undefined,
- parentHasOverride: (config) => config?.workers !== undefined,
- },
- {
- field: 'sandbox',
- childHasField: (config) => config.sandbox !== undefined,
- parentHasOverride: (config) => config?.sandbox !== undefined,
- },
- {
- field: 'workspace',
- childHasField: (config) => config.workspace !== undefined,
- parentHasOverride: (config) => config?.workspace !== undefined,
- },
-];
-
-function assertImportedExperimentCanCompose(
- child: ExperimentConfig | undefined,
- parent: ExperimentConfig | undefined,
- importPath: string,
-): void {
- if (!child) {
- return;
- }
- const unsupported = UNSCOPED_IMPORTED_EXPERIMENT_FIELDS.filter(
- (rule) => rule.childHasField(child) && !rule.parentHasOverride(parent),
- ).map((rule) => `experiment.${rule.field}`);
- if (unsupported.length === 0) {
- return;
- }
- throw new Error(
- `Imported eval suite '${displayEvalImportPath(importPath)}' defines ${unsupported.join(
- ', ',
- )}, which cannot be scoped per imported suite. Set these fields in the parent experiment when importing this suite.`,
- );
-}
-
function markSuiteImportedTest(test: EvalTest): EvalTest {
return {
...test,
@@ -1313,7 +1155,7 @@ async function expandInlineTestEntries(params: {
readonly evalFileDir: string;
readonly repoRoot: URL | string;
readonly suiteMetadataPayload?: Record;
- readonly parentExperimentConfig?: ExperimentConfig;
+ readonly parentWorkspaceLocation?: string;
readonly options?: LoadOptions;
}): Promise {
const withFileReferences = await expandFileReferences(params.entries, params.evalFileDir);
@@ -1339,20 +1181,15 @@ async function expandInlineTestEntries(params: {
for (const resolvedPath of resolvedPaths) {
if (mode === 'suite') {
+ if (params.parentWorkspaceLocation) {
+ throw new Error(
+ `Parent workspace is not allowed when importing eval suites with type: suite (${params.parentWorkspaceLocation}): ${includePath}. Move workspace into the child suite, or import raw cases with type: tests when you intentionally want parent workspace context.`,
+ );
+ }
const suite = await loadTestSuite(resolvedPath, params.repoRoot, {
...params.options,
filter: select?.testIds,
- importParentExperimentConfig: params.parentExperimentConfig,
});
- assertImportedExperimentCanCompose(
- suite.experimentConfig,
- params.parentExperimentConfig,
- resolvedPath,
- );
- const childExperimentRun = buildImportedExperimentRunDefaults(
- suite.experimentConfig,
- params.parentExperimentConfig,
- );
const selectedTests = params.options?.filter
? suite.tests.filter((test) => matchesFilter(test.id, params.options?.filter ?? ''))
: suite.tests;
@@ -1360,7 +1197,7 @@ async function expandInlineTestEntries(params: {
...selectedTests
.filter((test) => evalTestMatchesSelect(test, select))
.map(markSuiteImportedTest)
- .map((test) => applyRunDefaultsToImportedTest(test, childExperimentRun, includeRun)),
+ .map((test) => applyRunOverrideToImportedTest(test, includeRun)),
);
} else {
const importedCases = await loadRawCasesForInclude(resolvedPath);
@@ -1379,6 +1216,19 @@ async function expandInlineTestEntries(params: {
return { rawCases, importedSuiteTests };
}
+/** Returns the parent field that declares a workspace (`workspace`, `experiment.workspace`, `execution.workspace`), or undefined. */
+function parentWorkspaceLocation(suite: RawTestSuite): string | undefined {
+  if (suite.workspace !== undefined) {
+    return 'workspace';
+  }
+  // Legacy `execution` is consulted only when `experiment` is nullish; the label follows the block actually inspected.
+  const runtime = suite.experiment ?? suite.execution;
+  if (!isJsonObject(runtime) || runtime.workspace === undefined) {
+    return undefined;
+  }
+  return runtime === suite.experiment ? 'experiment.workspace' : 'execution.workspace';
+}
+
function readSuiteRuntimeBlock(suite: RawTestSuite, evalFilePath: string): JsonObject | undefined {
if (suite.experiment !== undefined && suite.execution !== undefined) {
throw new Error(
@@ -1890,8 +1740,11 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi
template = path.resolve(evalFileDir, template);
}
+ if (obj.isolation !== undefined && obj.isolation !== 'shared' && obj.isolation !== 'per_case') {
+ throw new Error("workspace.isolation must be 'shared' or 'per_case'.");
+ }
const isolation =
- obj.isolation === 'shared' || obj.isolation === 'per_test' ? obj.isolation : undefined;
+ obj.isolation === 'shared' || obj.isolation === 'per_case' ? obj.isolation : undefined;
const repos = Array.isArray(obj.repos)
? ((obj.repos as Record[])
@@ -2000,6 +1853,7 @@ function mergeWorkspaceConfigs(
mode: caseLevel.mode ?? suiteLevel.mode,
path: caseLevel.path ?? suiteLevel.path,
docker: caseLevel.docker ?? suiteLevel.docker,
+ env: caseLevel.env ?? suiteLevel.env,
workspaceFileDir: caseLevel.workspaceFileDir ?? suiteLevel.workspaceFileDir,
};
}
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 3d8d7e8aa..8ee19d67a 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -81,6 +81,10 @@ export {
type GradingArtifact,
type IndexArtifactEntry,
type ResultIndexArtifact,
+ type ExperimentNamespaceSource,
+ type RunRuntimeConfigSource,
+ type RunRuntimeSourceKind,
+ type RunRuntimeSourceMetadata,
type RunSummaryArtifact,
type TimingArtifact,
} from './evaluation/run-artifacts.js';
diff --git a/packages/core/test/evaluation/eval-inline-experiment.test.ts b/packages/core/test/evaluation/eval-inline-experiment.test.ts
index 646f0ee33..17ce9604a 100644
--- a/packages/core/test/evaluation/eval-inline-experiment.test.ts
+++ b/packages/core/test/evaluation/eval-inline-experiment.test.ts
@@ -382,7 +382,7 @@ describe('eval.yaml inline experiment and tests imports', () => {
expect(identitySuite.tests[0]?.metadata?.tags).toEqual(['suite-identity']);
});
- it('type: suite preserves child suite context and lets parent experiment override child defaults', async () => {
+ it('type: suite preserves child suite context while parent experiment owns runtime', async () => {
await writeFile(
path.join(tempDir, 'child.eval.yaml'),
[
@@ -422,8 +422,6 @@ describe('eval.yaml inline experiment and tests imports', () => {
' strategy: pass_at_k',
' timeout_seconds: 30',
' budget_usd: 1.5',
- 'workspace:',
- ' path: ./parent-workspace',
'input: parent shared input',
'assertions:',
' - type: contains',
@@ -452,7 +450,104 @@ describe('eval.yaml inline experiment and tests imports', () => {
expect(test.assertions?.[0]).toMatchObject({ value: 'child' });
});
- it('applies imported child experiment defaults when parent has no experiment', async () => {
+ it('rejects parent workspace when importing eval suites with type: suite', async () => {
+ await writeFile(
+ path.join(tempDir, 'child.eval.yaml'),
+ [
+ 'name: child-suite',
+ 'workspace:',
+ ' path: ./child-workspace',
+ 'tests:',
+ ' - id: child-case',
+ ' input: child case input',
+ ' criteria: ok',
+ '',
+ ].join('\n'),
+ );
+ const parentPath = path.join(tempDir, 'parent.eval.yaml');
+ await writeFile(
+ parentPath,
+ [
+ 'name: parent-suite',
+ 'workspace:',
+ ' path: ./parent-workspace',
+ 'tests:',
+ ' - include: child.eval.yaml',
+ ' type: suite',
+ '',
+ ].join('\n'),
+ );
+
+ await expect(loadTestSuite(parentPath, tempDir)).rejects.toThrow(
+ /Parent workspace is not allowed/,
+ );
+ });
+
+ it('rejects parent experiment workspace when importing eval suites with type: suite', async () => {
+ await writeFile(
+ path.join(tempDir, 'child.eval.yaml'),
+ [
+ 'name: child-suite',
+ 'tests:',
+ ' - id: child-case',
+ ' input: child case input',
+ ' criteria: ok',
+ '',
+ ].join('\n'),
+ );
+ const parentPath = path.join(tempDir, 'parent.eval.yaml');
+ await writeFile(
+ parentPath,
+ [
+ 'name: parent-suite',
+ 'experiment:',
+ ' workspace:',
+ ' path: ./parent-workspace',
+ 'tests:',
+ ' - include: child.eval.yaml',
+ ' type: suite',
+ '',
+ ].join('\n'),
+ );
+
+ await expect(loadTestSuite(parentPath, tempDir)).rejects.toThrow(
+ /Parent workspace is not allowed.*experiment\.workspace/,
+ );
+ });
+
+ it('rejects legacy execution workspace when importing eval suites with type: suite', async () => {
+ await writeFile(
+ path.join(tempDir, 'child.eval.yaml'),
+ [
+ 'name: child-suite',
+ 'tests:',
+ ' - id: child-case',
+ ' input: child case input',
+ ' criteria: ok',
+ '',
+ ].join('\n'),
+ );
+ const parentPath = path.join(tempDir, 'parent.eval.yaml');
+ await writeFile(
+ parentPath,
+ [
+ 'name: parent-suite',
+ 'execution:',
+ ' workspace:',
+ ' path: ./parent-workspace',
+ 'tests:',
+ ' - include: child.eval.yaml',
+ ' type: suite',
+ '',
+ ].join('\n'),
+ );
+
+ await expect(loadTestSuite(parentPath, tempDir)).rejects.toThrow(
+ /Parent workspace is not allowed.*execution\.workspace/,
+ );
+ });
+
+ it('ignores imported child experiment defaults when parent has no experiment', async () => {
await writeFile(
path.join(tempDir, 'child.eval.yaml'),
[
@@ -482,15 +577,10 @@ describe('eval.yaml inline experiment and tests imports', () => {
const suite = await loadTestSuite(parentPath, tempDir);
expect(suite.experimentConfig).toBeUndefined();
- expect(suite.tests[0]?.run).toMatchObject({
- threshold: 0.2,
- repeat: { count: 5, strategy: 'mean' },
- timeoutSeconds: 10,
- budgetUsd: 0.5,
- });
+ expect(suite.tests[0]?.run).toBeUndefined();
});
- it('applies include-level run overrides over imported child experiment defaults', async () => {
+ it('applies include-level run overrides without importing child experiment defaults', async () => {
await writeFile(
path.join(tempDir, 'child.eval.yaml'),
[
@@ -526,15 +616,13 @@ describe('eval.yaml inline experiment and tests imports', () => {
const suite = await loadTestSuite(parentPath, tempDir);
- expect(suite.tests[0]?.run).toMatchObject({
+ expect(suite.tests[0]?.run).toEqual({
threshold: 0.9,
- repeat: { count: 5, strategy: 'mean' },
timeoutSeconds: 30,
- budgetUsd: 0.5,
});
});
- it('applies test.run over include-level and imported child experiment defaults', async () => {
+ it('applies test.run over include-level run overrides without child experiment defaults', async () => {
await writeFile(
path.join(tempDir, 'child.eval.yaml'),
[
@@ -598,7 +686,7 @@ describe('eval.yaml inline experiment and tests imports', () => {
expect(byId.get('child-critical')?.threshold).toBe(1.0);
});
- it('rejects imported child experiment fields that cannot be scoped without a parent override', async () => {
+ it('ignores imported child experiment fields that cannot be scoped in a wrapper', async () => {
await writeFile(
path.join(tempDir, 'child-a.eval.yaml'),
[
@@ -637,9 +725,11 @@ describe('eval.yaml inline experiment and tests imports', () => {
].join('\n'),
);
- await expect(loadTestSuite(parentPath, tempDir)).rejects.toThrow(
- /experiment\.workers.*cannot be scoped per imported suite/,
- );
+ const suite = await loadTestSuite(parentPath, tempDir);
+
+ expect(suite.experimentConfig).toBeUndefined();
+ expect(suite.tests.map((test) => test.id)).toEqual(['a', 'b']);
+ expect(suite.tests.every((test) => test.run === undefined)).toBe(true);
});
it('type: tests imports only raw cases and applies parent suite context', async () => {
@@ -663,6 +753,8 @@ describe('eval.yaml inline experiment and tests imports', () => {
parentPath,
[
'name: parent-suite',
+ 'workspace:',
+ ' path: ./parent-workspace',
'input: parent shared input',
'assertions:',
' - type: contains',
@@ -682,6 +774,7 @@ describe('eval.yaml inline experiment and tests imports', () => {
'parent shared input',
'raw case input',
]);
+ expect(test.workspace?.path).toBe('./parent-workspace');
expect(test.assertions?.[0]).toMatchObject({ type: 'contains', value: 'parent' });
});
});
diff --git a/packages/core/test/evaluation/experiment.test.ts b/packages/core/test/evaluation/experiment.test.ts
index 7524edec1..ff7244ffc 100644
--- a/packages/core/test/evaluation/experiment.test.ts
+++ b/packages/core/test/evaluation/experiment.test.ts
@@ -92,6 +92,12 @@ describe('inline experiment config', () => {
expect(() => normalizeExperimentConfig({ scripts: ['bun test'] })).toThrow(
/scripts are not supported/,
);
+ expect(() => normalizeExperimentConfig({ workspace: { isolation: 'per_test' } })).toThrow(
+ /supports only mode and path/,
+ );
+ expect(() =>
+ normalizeExperimentConfig({ workspace: { repos: [{ repo: 'acme/support-app' }] } }),
+ ).toThrow(/supports only mode and path/);
});
it('builds safe snake_case artifact metadata without agent options', () => {
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index 4cb3000d8..79cfd34b0 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -834,6 +834,63 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
expect(case2?.error).toBe("Batch output missing id 'case-2'");
});
+ it('disables provider batching when cases require workspace setup', async () => {
+ class BatchCapableProvider implements Provider {
+ readonly id = 'batch:workspace';
+ readonly kind = 'mock' as const;
+ readonly targetName = 'workspace';
+ readonly supportsBatch = true;
+ batchCalls = 0;
+ invokeRequests: ProviderRequest[] = [];
+
+ async invoke(request: ProviderRequest): Promise {
+ this.invokeRequests.push(request);
+ return {
+ output: [{ role: 'assistant', content: 'OK' }],
+ };
+ }
+
+ async invokeBatch(): Promise {
+ this.batchCalls += 1;
+ throw new Error('batch should not be used for workspace cases');
+ }
+ }
+
+ const templateDir = mkdtempSync(path.join(tmpdir(), 'agentv-batch-workspace-'));
+ writeFileSync(path.join(templateDir, 'README.md'), 'workspace\n', 'utf8');
+ const provider = new BatchCapableProvider();
+
+ try {
+ const results = await runEvaluation({
+ testFilePath: 'in-memory.yaml',
+ repoRoot: 'in-memory',
+ target: {
+ ...baseTarget,
+ providerBatching: true,
+ workers: 1,
+ },
+ providerFactory: () => provider,
+ evaluators: evaluatorRegistry,
+ evalCases: [
+ {
+ ...baseTestCase,
+ workspace: {
+ template: templateDir,
+ },
+ },
+ ],
+ });
+
+ expect(results).toHaveLength(1);
+ expect(provider.batchCalls).toBe(0);
+ expect(provider.invokeRequests).toHaveLength(1);
+ expect(provider.invokeRequests[0]?.cwd).toBeDefined();
+ } finally {
+ const { rm } = await import('node:fs/promises');
+ await rm(templateDir, { recursive: true, force: true });
+ }
+ });
+
it('uses a custom evaluator prompt when provided', async () => {
const directory = mkdtempSync(path.join(tmpdir(), 'agentv-custom-grader-'));
const promptPath = path.join(directory, 'grader-prompt.md');
@@ -2838,6 +2895,33 @@ describe('workspace.template .code-workspace resolution', () => {
}
}
});
+
+ it('does not pass suite workspaceFile to a case without the shared workspace', async () => {
+ const { mkdtemp, writeFile, mkdir } = await import('node:fs/promises');
+ testDir = await mkdtemp(path.join(tmpdir(), 'agentv-orch-ws-resolve-'));
+
+ const sharedDir = path.join(testDir, 'shared');
+ await mkdir(sharedDir, { recursive: true });
+ const suiteWorkspaceFile = path.join(sharedDir, 'child.code-workspace');
+ await writeFile(suiteWorkspaceFile, JSON.stringify({ folders: [{ path: '.' }] }));
+
+ const provider = new CapturingProvider('mock', {
+ output: [{ role: 'assistant', content: [{ type: 'text', text: 'answer' }] }],
+ });
+
+ const result = await runEvalCase({
+ evalCase: baseTestCase,
+ provider,
+ target: baseTarget,
+ evaluators: evaluatorRegistry,
+ suiteWorkspaceFile,
+ });
+
+ expect(result.error).toBeUndefined();
+ expect(provider.lastRequest).toBeDefined();
+ expect(provider.lastRequest?.cwd).toBeUndefined();
+ expect(provider.lastRequest?.workspaceFile).toBeUndefined();
+ });
});
describe('suite-level total budget guardrail', () => {
@@ -3157,7 +3241,7 @@ describe('--workspace flag', () => {
expect(results[0].error).toBeUndefined();
});
- it('errors when workspace is combined with per_test isolation', async () => {
+ it('errors when workspace is combined with per_case isolation', async () => {
const { mkdtemp } = await import('node:fs/promises');
testDir = await mkdtemp(path.join(tmpdir(), 'agentv-ws-flag-'));
@@ -3168,7 +3252,7 @@ describe('--workspace flag', () => {
const evalCase: EvalTest = {
...baseTestCase,
workspace: {
- isolation: 'per_test',
+ isolation: 'per_case',
},
};
@@ -3182,7 +3266,7 @@ describe('--workspace flag', () => {
evalCases: [evalCase],
workspace: testDir,
}),
- ).rejects.toThrow('static workspace mode is incompatible with isolation: per_test');
+ ).rejects.toThrow('static workspace mode is incompatible with isolation: per_case');
});
it('never deletes user-provided workspace after run', async () => {
@@ -3281,9 +3365,9 @@ describe('--workspace flag', () => {
expect(results[0].beforeEachOutput).toBeDefined();
});
- it('creates per-test workspaces for hook-only suites when isolation is per_test', async () => {
+ it('creates per-case workspaces for hook-only suites when isolation is per_case', async () => {
const { mkdtemp, mkdir, writeFile, access: fsAccess } = await import('node:fs/promises');
- testDir = await mkdtemp(path.join(tmpdir(), 'agentv-per-test-hooks-'));
+ testDir = await mkdtemp(path.join(tmpdir(), 'agentv-per-case-hooks-'));
const beforeAllScript = path.join(testDir, 'before-all.js');
writeFileSync(
@@ -3300,7 +3384,7 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id
const workspacesSeen: string[] = [];
const provider: Provider = {
- id: 'mock:per-test-hooks',
+ id: 'mock:per-case-hooks',
kind: 'mock',
targetName: 'mock',
async invoke(request: ProviderRequest): Promise {
@@ -3316,7 +3400,7 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id
};
const workspaceConfig = {
- isolation: 'per_test' as const,
+ isolation: 'per_case' as const,
hooks: {
before_all: {
command: [process.execPath, beforeAllScript],
diff --git a/packages/core/test/evaluation/repo-schema-validation.test.ts b/packages/core/test/evaluation/repo-schema-validation.test.ts
index 10f778bce..86a3abdaf 100644
--- a/packages/core/test/evaluation/repo-schema-validation.test.ts
+++ b/packages/core/test/evaluation/repo-schema-validation.test.ts
@@ -152,6 +152,22 @@ describe('repo lifecycle schema validation', () => {
});
it('accepts workspace with isolation field', () => {
+ const result = EvalFileSchema.safeParse({
+ ...baseEval,
+ workspace: {
+ isolation: 'per_case',
+ repos: [
+ {
+ path: './repo-a',
+ repo: 'https://github.com/org/repo.git',
+ },
+ ],
+ },
+ });
+ expect(result.success).toBe(true);
+ });
+
+ it('rejects removed workspace isolation per_test value', () => {
const result = EvalFileSchema.safeParse({
...baseEval,
workspace: {
@@ -164,9 +180,51 @@ describe('repo lifecycle schema validation', () => {
],
},
});
+ expect(result.success).toBe(false);
+ });
+
+ it('rejects removed experiment workspace isolation per_test value', () => {
+ const result = EvalFileSchema.safeParse({
+ ...baseEval,
+ experiment: {
+ workspace: {
+ isolation: 'per_test',
+ },
+ },
+ });
+ expect(result.success).toBe(false);
+ });
+
+ it('accepts experiment workspace runtime override fields', () => {
+ const result = EvalFileSchema.safeParse({
+ ...baseEval,
+ experiment: {
+ workspace: {
+ mode: 'static',
+ path: '/tmp/my-workspace',
+ },
+ },
+ });
expect(result.success).toBe(true);
});
+ it('rejects task workspace fields in experiment workspace', () => {
+ const result = EvalFileSchema.safeParse({
+ ...baseEval,
+ experiment: {
+ workspace: {
+ repos: [
+ {
+ path: './repo-a',
+ repo: 'https://github.com/org/repo.git',
+ },
+ ],
+ },
+ },
+ });
+ expect(result.success).toBe(false);
+ });
+
it('accepts workspace.mode=temp', () => {
const result = EvalFileSchema.safeParse({
...baseEval,
diff --git a/packages/core/test/evaluation/validation/eval-file-schema.test.ts b/packages/core/test/evaluation/validation/eval-file-schema.test.ts
index 06093ddda..51bf2839a 100644
--- a/packages/core/test/evaluation/validation/eval-file-schema.test.ts
+++ b/packages/core/test/evaluation/validation/eval-file-schema.test.ts
@@ -50,6 +50,20 @@ describe('EvalFileSchema input shorthand', () => {
expect(result.success).toBe(false);
});
+ it('accepts workspace env preflight requirements', () => {
+ const result = EvalFileSchema.safeParse({
+ workspace: {
+ env: {
+ required_commands: ['git'],
+ required_python_modules: ['json'],
+ },
+ },
+ tests: [baseTest],
+ });
+
+ expect(result.success).toBe(true);
+ });
+
it('accepts inline experiment runtime and include selection entries', () => {
const result = EvalFileSchema.safeParse({
name: 'wrapper',
diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts
index f23a05e59..b3c3f059a 100644
--- a/packages/core/test/evaluation/validation/eval-validator.test.ts
+++ b/packages/core/test/evaluation/validation/eval-validator.test.ts
@@ -84,6 +84,265 @@ tests:
expect(result.errors.some((error) => error.message.includes("Missing 'type'"))).toBe(true);
});
+ it('rejects parent workspace when importing suites', async () => {
+ const childPath = path.join(tempDir, 'composition-child-workspace.eval.yaml');
+ await writeFile(
+ childPath,
+ `workspace:
+ path: ./child-workspace
+tests:
+ - id: child-case
+ criteria: Goal
+ input: Query
+`,
+ );
+ const filePath = path.join(tempDir, 'composition-parent-workspace.eval.yaml');
+ await writeFile(
+ filePath,
+ `workspace:
+ path: ./parent-workspace
+tests:
+ - include: composition-child-workspace.eval.yaml
+ type: suite
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(false);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'error' &&
+ error.location === 'workspace' &&
+ error.message.includes('Parent workspace is not allowed') &&
+ error.message.includes('type: suite'),
+ ),
+ ).toBe(true);
+ });
+
+ it('rejects parent experiment workspace when importing suites', async () => {
+ await writeFile(
+ path.join(tempDir, 'composition-child-experiment-workspace.eval.yaml'),
+ `tests:
+ - id: child-case
+ criteria: Goal
+ input: Query
+`,
+ );
+ const filePath = path.join(tempDir, 'composition-parent-experiment-workspace.eval.yaml');
+ await writeFile(
+ filePath,
+ `experiment:
+ workspace:
+ path: ./parent-workspace
+tests:
+ - include: composition-child-experiment-workspace.eval.yaml
+ type: suite
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(false);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'error' &&
+ error.location === 'experiment.workspace' &&
+ error.message.includes('Parent workspace is not allowed') &&
+ error.message.includes('type: suite'),
+ ),
+ ).toBe(true);
+ });
+
+ it('rejects legacy execution workspace when importing suites', async () => {
+ await writeFile(
+ path.join(tempDir, 'composition-child-execution-workspace.eval.yaml'),
+ `tests:
+ - id: child-case
+ criteria: Goal
+ input: Query
+`,
+ );
+ const filePath = path.join(tempDir, 'composition-parent-execution-workspace.eval.yaml');
+ await writeFile(
+ filePath,
+ `execution:
+ workspace:
+ path: ./parent-workspace
+tests:
+ - include: composition-child-execution-workspace.eval.yaml
+ type: suite
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(false);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'error' &&
+ error.location === 'execution.workspace' &&
+ error.message.includes('Parent workspace is not allowed') &&
+ error.message.includes('type: suite'),
+ ),
+ ).toBe(true);
+ });
+
+ it('rejects removed isolation values in experiment workspace', async () => {
+ const filePath = path.join(tempDir, 'experiment-workspace-legacy-isolation.eval.yaml');
+ await writeFile(
+ filePath,
+ `experiment:
+ workspace:
+ isolation: per_test
+tests:
+ - id: test-1
+ criteria: Goal
+ input: Query
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(false);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'error' &&
+ error.location === 'experiment.workspace.isolation' &&
+ error.message.includes('supports only mode and path'),
+ ),
+ ).toBe(true);
+ });
+
+ it('rejects task workspace fields in experiment workspace', async () => {
+ const filePath = path.join(tempDir, 'experiment-workspace-repos.eval.yaml');
+ await writeFile(
+ filePath,
+ `experiment:
+ workspace:
+ repos:
+ - repo: acme/support-app
+ path: support-app
+tests:
+ - id: test-1
+ criteria: Goal
+ input: Query
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(false);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'error' &&
+ error.location === 'experiment.workspace.repos' &&
+ error.message.includes('supports only mode and path'),
+ ),
+ ).toBe(true);
+ });
+
+ it('accepts runtime workspace overrides in experiment workspace', async () => {
+ const filePath = path.join(tempDir, 'experiment-workspace-runtime.eval.yaml');
+ await writeFile(
+ filePath,
+ `experiment:
+ workspace:
+ mode: static
+ path: ./prepared-workspace
+tests:
+ - id: test-1
+ criteria: Goal
+ input: Query
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(true);
+ });
+
+ it('warns that imported child experiments are ignored by wrapper composition', async () => {
+ await writeFile(
+ path.join(tempDir, 'composition-child-experiment.eval.yaml'),
+ `experiment:
+ target: child-target
+ workers: 2
+ threshold: 0.9
+tests:
+ - id: child-case
+ criteria: Goal
+ input: Query
+`,
+ );
+ const filePath = path.join(tempDir, 'composition-parent-no-experiment.eval.yaml');
+ await writeFile(
+ filePath,
+ `tests:
+ - include: composition-child-experiment.eval.yaml
+ type: suite
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(true);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'warning' &&
+ error.location === 'tests[0].include' &&
+ error.message.includes('child experiment blocks are ignored') &&
+ error.message.includes('parent has no experiment'),
+ ),
+ ).toBe(true);
+ });
+
+ it('warns when type: tests imports an eval suite and drops suite context', async () => {
+ await writeFile(
+ path.join(tempDir, 'composition-child-tests-import.eval.yaml'),
+ `workspace:
+ path: ./child-workspace
+input: child suite input
+assertions:
+ - type: contains
+ value: child
+tests:
+ - id: raw-case
+ criteria: Goal
+ input: Query
+`,
+ );
+ const filePath = path.join(tempDir, 'composition-parent-tests-import.eval.yaml');
+ await writeFile(
+ filePath,
+ `workspace:
+ path: ./parent-workspace
+tests:
+ - include: composition-child-tests-import.eval.yaml
+ type: tests
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(true);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'warning' &&
+ error.location === 'tests[0].include' &&
+ error.message.includes('type: tests imports raw cases') &&
+ error.message.includes('drops suite context'),
+ ),
+ ).toBe(true);
+ });
+
it('rejects eval files with both experiment and legacy execution', async () => {
const filePath = path.join(tempDir, 'runtime-conflict.yaml');
await writeFile(
@@ -885,6 +1144,38 @@ tests:
expect(extWarnings).toHaveLength(0);
});
+ it('validates experiment workspace with tests string shorthand', async () => {
+ await writeFile(
+ path.join(tempDir, 'cases-shorthand-workspace.yaml'),
+ `- id: test-1
+ criteria: Goal
+ input: "Query"
+`,
+ );
+
+ const filePath = path.join(tempDir, 'tests-yaml-ext-experiment-workspace.yaml');
+ await writeFile(
+ filePath,
+ `experiment:
+ workspace:
+ isolation: per_test
+tests: "./cases-shorthand-workspace.yaml"
+`,
+ );
+
+ const result = await validateEvalFile(filePath);
+
+ expect(result.valid).toBe(false);
+ expect(
+ result.errors.some(
+ (error) =>
+ error.severity === 'error' &&
+ error.location === 'experiment.workspace.isolation' &&
+ error.message.includes('supports only mode and path'),
+ ),
+ ).toBe(true);
+ });
+
it('passes valid tests string path with .yml extension', async () => {
await writeFile(
path.join(tempDir, 'cases.yml'),
@@ -1023,7 +1314,7 @@ tests:
).toBe(true);
});
- it('errors when legacy checkout is set in a per-test workspace', async () => {
+ it('errors when legacy checkout is set in a per-case workspace', async () => {
const filePath = path.join(tempDir, 'workspace-legacy-checkout-error.yaml');
await writeFile(
filePath,
diff --git a/packages/core/test/evaluation/workspace-config-parsing.test.ts b/packages/core/test/evaluation/workspace-config-parsing.test.ts
index e8ac7bc07..d104b3f0c 100644
--- a/packages/core/test/evaluation/workspace-config-parsing.test.ts
+++ b/packages/core/test/evaluation/workspace-config-parsing.test.ts
@@ -146,6 +146,38 @@ tests:
});
});
+ it('should preserve workspace env when merging case-level workspace with suite defaults', async () => {
+ const evalFile = path.join(testDir, 'workspace-env-merge.yaml');
+ await writeFile(
+ evalFile,
+ `
+workspace:
+ hooks:
+ before_all:
+ command: ["bun", "run", "default-setup.ts"]
+
+tests:
+ - id: case-env
+ input: "Do something"
+ criteria: "Should work"
+ workspace:
+ env:
+ required_commands: ["git"]
+ required_python_modules: ["json"]
+`,
+ );
+
+ const cases = await loadTests(evalFile, testDir);
+ expect(cases).toHaveLength(1);
+ expect(cases[0].workspace?.hooks?.before_all).toEqual({
+ command: ['bun', 'run', 'default-setup.ts'],
+ });
+ expect(cases[0].workspace?.env).toEqual({
+ required_commands: ['git'],
+ required_python_modules: ['json'],
+ });
+ });
+
it('should resolve before_all cwd relative to eval file directory', async () => {
const evalFile = path.join(testDir, 'workspace-cwd.yaml');
await writeFile(
@@ -323,7 +355,7 @@ tests:
`
description: test
workspace:
- isolation: per_test
+ isolation: per_case
repos:
- path: ./repo-a
repo: https://github.com/org/repo.git
@@ -335,7 +367,27 @@ tests:
);
const cases = await loadTests(evalFile, testDir);
- expect(cases[0].workspace?.isolation).toBe('per_test');
+ expect(cases[0].workspace?.isolation).toBe('per_case');
+ });
+
+ it('rejects removed workspace isolation per_test value', async () => {
+ const evalFile = path.join(testDir, 'workspace-isolation-legacy.yaml');
+ await writeFile(
+ evalFile,
+ `
+description: test
+workspace:
+ isolation: per_test
+tests:
+ - id: test-1
+ input: "hello"
+ criteria: "world"
+`,
+ );
+
+ await expect(loadTests(evalFile, testDir)).rejects.toThrow(
+ "workspace.isolation must be 'shared' or 'per_case'.",
+ );
});
it('infers workspace.mode=static when workspace.path is provided without mode', async () => {
diff --git a/packages/core/test/evaluation/workspace/deps-scanner.test.ts b/packages/core/test/evaluation/workspace/deps-scanner.test.ts
index d714d23ba..0028130c6 100644
--- a/packages/core/test/evaluation/workspace/deps-scanner.test.ts
+++ b/packages/core/test/evaluation/workspace/deps-scanner.test.ts
@@ -52,7 +52,7 @@ tests:
expect(result.repos[0].usedBy).toEqual([file]);
});
- it('extracts repos from per-test workspace', async () => {
+ it('extracts repos from per-case workspace', async () => {
const file = await writeYaml(
'per-test.eval.yaml',
`
diff --git a/packages/core/test/evaluation/workspace/setup.test.ts b/packages/core/test/evaluation/workspace/setup.test.ts
index 6c7160ddc..5ebb5a9ed 100644
--- a/packages/core/test/evaluation/workspace/setup.test.ts
+++ b/packages/core/test/evaluation/workspace/setup.test.ts
@@ -1,13 +1,15 @@
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
import { execSync } from 'node:child_process';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import os from 'node:os';
import path from 'node:path';
-import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { EvalTest } from '../../../src/evaluation/types.js';
import {
type SharedWorkspaceSetup,
+ caseUsesSharedWorkspaceSetup,
+ prepareEvalCaseWorkspace,
prepareSharedWorkspaceSetup,
releaseSharedWorkspaceSetup,
} from '../../../src/evaluation/workspace/setup.js';
@@ -38,6 +40,40 @@ function createTestRepo(dir: string, files: Record<string, string>): string {
return execSync('git rev-parse HEAD', { cwd: dir, env: cleanGitEnv() }).toString().trim();
}
+function testCase(
+  id: string,
+  workspace: EvalTest['workspace'],
+  source?: {
+    readonly evalFileAbsolutePath: string;
+    readonly importedSuiteName?: string;
+  },
+): EvalTest {
+  // Minimal EvalTest fixture; provenance is attached only when `source` is supplied.
+  const evalCase: EvalTest = {
+    id,
+    question: 'test',
+    criteria: 'ok',
+    input: [{ role: 'user', content: 'test' }],
+    expected_output: [],
+    file_paths: [],
+    workspace,
+  };
+  if (!source) return evalCase;
+  const { evalFileAbsolutePath, importedSuiteName } = source;
+  return {
+    ...evalCase,
+    source: {
+      evalFilePath: evalFileAbsolutePath,
+      evalFileAbsolutePath,
+      testId: id,
+      testSnapshotYaml: `id: ${id}`,
+      graderDefinitions: [],
+      references: [],
+      ...(importedSuiteName !== undefined && { importedSuiteName }),
+    },
+  };
+}
+
describe('prepareSharedWorkspaceSetup', () => {
let tmpDir: string;
let savedAgentvHome: string | undefined;
@@ -149,4 +185,247 @@ describe('prepareSharedWorkspaceSetup', () => {
);
expect(existsSync(path.join(existingWorkspace, 'repo-a'))).toBe(false);
});
+
+ it('rejects multiple imported suites with shared workspaces before setup', async () => {
+ const evalCases = [
+ testCase(
+ 'a',
+ { template: path.join(tmpDir, 'missing-a') },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'child-a.eval.yaml'),
+ importedSuiteName: 'child-a',
+ },
+ ),
+ testCase(
+ 'b',
+ { template: path.join(tmpDir, 'missing-b') },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'child-b.eval.yaml'),
+ importedSuiteName: 'child-b',
+ },
+ ),
+ ];
+
+ await expect(
+ prepareSharedWorkspaceSetup({
+ evalRunId: 'test-multiple-imported-suite-shared-workspaces',
+ evalCases,
+ evalDir: tmpDir,
+ workers: 1,
+ }),
+ ).rejects.toThrow(/multiple shared workspace owners/);
+ });
+
+ it('allows per-case isolated imported suites without shared setup', async () => {
+ setup = await prepareSharedWorkspaceSetup({
+ evalRunId: 'test-per-case-imported-suites',
+ evalCases: [
+ testCase(
+ 'a',
+ { isolation: 'per_case', template: path.join(tmpDir, 'missing-a') },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'child-a.eval.yaml'),
+ importedSuiteName: 'child-a',
+ },
+ ),
+ testCase(
+ 'b',
+ { isolation: 'per_case', template: path.join(tmpDir, 'missing-b') },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'child-b.eval.yaml'),
+ importedSuiteName: 'child-b',
+ },
+ ),
+ ],
+ evalDir: tmpDir,
+ workers: 1,
+ });
+
+ expect(setup.sharedWorkspacePath).toBeUndefined();
+ expect(setup.suiteWorkspace).toBeUndefined();
+ });
+
+ it('rejects mixed imported-suite and parent-owned shared workspaces', async () => {
+ const parentTemplate = path.join(tmpDir, 'parent-template');
+ const childTemplate = path.join(tmpDir, 'child-template');
+ mkdirSync(parentTemplate, { recursive: true });
+ mkdirSync(childTemplate, { recursive: true });
+
+ await expect(
+ prepareSharedWorkspaceSetup({
+ evalRunId: 'test-parent-and-imported-shared-workspaces',
+ evalCases: [
+ testCase(
+ 'child-case',
+ { template: childTemplate },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'child.eval.yaml'),
+ importedSuiteName: 'child',
+ },
+ ),
+ testCase(
+ 'parent-case',
+ { template: parentTemplate },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'parent.eval.yaml'),
+ },
+ ),
+ ],
+ evalDir: tmpDir,
+ workers: 1,
+ }),
+ ).rejects.toThrow(/does not merge parent and child workspaces/);
+ });
+
+ it('keeps imported per-case workspaces allowed beside parent-owned raw cases', async () => {
+ const parentTemplate = path.join(tmpDir, 'parent-template');
+ mkdirSync(parentTemplate, { recursive: true });
+ writeFileSync(path.join(parentTemplate, 'parent-marker.txt'), 'parent\n', 'utf8');
+
+ setup = await prepareSharedWorkspaceSetup({
+ evalRunId: 'test-parent-shared-imported-per-case',
+ evalCases: [
+ testCase(
+ 'child-case',
+ { isolation: 'per_case', template: path.join(tmpDir, 'child-template') },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'child.eval.yaml'),
+ importedSuiteName: 'child',
+ },
+ ),
+ testCase(
+ 'parent-case',
+ { template: parentTemplate },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'parent.eval.yaml'),
+ },
+ ),
+ ],
+ evalDir: tmpDir,
+ workers: 1,
+ });
+
+ expect(setup.sharedWorkspacePath).toBeDefined();
+ expect(setup.suiteWorkspace?.template).toBe(parentTemplate);
+ if (!setup.sharedWorkspacePath) {
+ throw new Error('Expected parent-owned shared workspace');
+ }
+ expect(readFileSync(path.join(setup.sharedWorkspacePath, 'parent-marker.txt'), 'utf8')).toBe(
+ 'parent\n',
+ );
+ });
+
+ it('runs shared setup for env-only workspace configs', async () => {
+ const missingCommand = `agentv-missing-command-${Date.now()}`;
+
+ await expect(
+ prepareSharedWorkspaceSetup({
+ evalRunId: 'test-env-only-preflight',
+ evalCases: [
+ testCase('env-only-case', {
+ env: {
+ required_commands: [missingCommand],
+ },
+ }),
+ ],
+ evalDir: tmpDir,
+ workers: 1,
+ }),
+ ).rejects.toThrow(`command: ${missingCommand}`);
+ });
+
+ it('runs shared setup for docker-only workspace configs', async () => {
+ await expect(
+ prepareSharedWorkspaceSetup({
+ evalRunId: 'test-docker-only-preflight',
+ evalCases: [
+ testCase('docker-only-case', {
+ docker: {
+ image: 'invalid image with spaces',
+ },
+ }),
+ ],
+ evalDir: tmpDir,
+ workers: 1,
+ }),
+ ).rejects.toThrow(/Docker workspace configured|docker pull failed|invalid reference/);
+ });
+
+ it('runs per-case setup for env-only workspace configs', async () => {
+ const missingCommand = `agentv-missing-command-${Date.now()}`;
+
+ await expect(
+ prepareEvalCaseWorkspace({
+ evalRunId: 'test-per-case-env-only-preflight',
+ evalCase: testCase('per-case-env-only-case', {
+ isolation: 'per_case',
+ env: {
+ required_commands: [missingCommand],
+ },
+ }),
+ evalDir: tmpDir,
+ }),
+ ).rejects.toThrow(`command: ${missingCommand}`);
+ });
+
+ it('runs per-case setup for docker-only workspace configs', async () => {
+ await expect(
+ prepareEvalCaseWorkspace({
+ evalRunId: 'test-per-case-docker-only-preflight',
+ evalCase: testCase('per-case-docker-only-case', {
+ isolation: 'per_case',
+ docker: {
+ image: 'invalid image with spaces',
+ },
+ }),
+ evalDir: tmpDir,
+ }),
+ ).rejects.toThrow(/Docker workspace configured|docker pull failed|invalid reference/);
+ });
+
+ it('does not apply a child suite shared workspace to raw cases with no workspace', async () => {
+ const childTemplate = path.join(tmpDir, 'child-template');
+ mkdirSync(childTemplate, { recursive: true });
+ writeFileSync(path.join(childTemplate, 'child-marker.txt'), 'child\n', 'utf8');
+ writeFileSync(
+ path.join(childTemplate, 'child.code-workspace'),
+ JSON.stringify({ folders: [{ path: '.' }] }),
+ 'utf8',
+ );
+
+ const childCase = testCase(
+ 'child-case',
+ { template: childTemplate },
+ {
+ evalFileAbsolutePath: path.join(tmpDir, 'child.eval.yaml'),
+ importedSuiteName: 'child',
+ },
+ );
+ const rawCase = testCase('raw-case', undefined, {
+ evalFileAbsolutePath: path.join(tmpDir, 'parent.eval.yaml'),
+ });
+
+ setup = await prepareSharedWorkspaceSetup({
+ evalRunId: 'test-child-shared-raw-no-workspace',
+ evalCases: [childCase, rawCase],
+ evalDir: tmpDir,
+ workers: 1,
+ });
+
+ expect(setup.sharedWorkspacePath).toBeDefined();
+ expect(setup.suiteWorkspaceFile).toBeDefined();
+ expect(caseUsesSharedWorkspaceSetup(childCase, setup)).toBe(true);
+ expect(caseUsesSharedWorkspaceSetup(rawCase, setup)).toBe(false);
+
+ const rawWorkspaceSetup = await prepareEvalCaseWorkspace({
+ evalCase: rawCase,
+ evalRunId: 'test-child-shared-raw-no-workspace-case',
+ sharedWorkspacePath: undefined,
+ suiteWorkspaceFile: setup.suiteWorkspaceFile,
+ evalDir: tmpDir,
+ });
+
+ expect(rawWorkspaceSetup.workspacePath).toBeUndefined();
+ expect(rawWorkspaceSetup.caseWorkspaceFile).toBeUndefined();
+ });
});
diff --git a/packages/sdk/src/eval.ts b/packages/sdk/src/eval.ts
index 59dcb4b99..35c426d56 100644
--- a/packages/sdk/src/eval.ts
+++ b/packages/sdk/src/eval.ts
@@ -115,7 +115,7 @@ export interface EvalDockerWorkspace {
export interface EvalWorkspace {
readonly template?: string;
- readonly isolation?: 'shared' | 'per_test';
+ readonly isolation?: 'shared' | 'per_case';
readonly repos?: readonly EvalWorkspaceRepo[];
readonly hooks?: EvalWorkspaceHooks;
readonly mode?: 'pooled' | 'temp' | 'static';
diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md
index 482496c43..1cc0ea8f6 100644
--- a/skills-data/agentv-eval-writer/SKILL.md
+++ b/skills-data/agentv-eval-writer/SKILL.md
@@ -383,7 +383,7 @@ workspace:
hooks:
after_each:
reset: fast # none | fast | strict
- isolation: shared # shared | per_test
+ isolation: shared # shared | per_case
mode: pooled # pooled | temp | static
```
diff --git a/skills-data/agentv-eval-writer/references/eval-schema.json b/skills-data/agentv-eval-writer/references/eval-schema.json
index 21514a2bc..0080ca074 100644
--- a/skills-data/agentv-eval-writer/references/eval-schema.json
+++ b/skills-data/agentv-eval-writer/references/eval-schema.json
@@ -5234,7 +5234,7 @@
},
"isolation": {
"type": "string",
- "enum": ["shared", "per_test"]
+ "enum": ["shared", "per_case"]
},
"repos": {
"type": "array",
@@ -5486,6 +5486,26 @@
},
"required": ["image"],
"additionalProperties": false
+ },
+ "env": {
+ "type": "object",
+ "properties": {
+ "required_commands": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1
+ }
+ },
+ "required_python_modules": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1
+ }
+ }
+ },
+ "additionalProperties": false
}
},
"additionalProperties": false
@@ -12021,7 +12041,7 @@
},
"isolation": {
"type": "string",
- "enum": ["shared", "per_test"]
+ "enum": ["shared", "per_case"]
},
"repos": {
"type": "array",
@@ -12273,6 +12293,26 @@
},
"required": ["image"],
"additionalProperties": false
+ },
+ "env": {
+ "type": "object",
+ "properties": {
+ "required_commands": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1
+ }
+ },
+ "required_python_modules": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1
+ }
+ }
+ },
+ "additionalProperties": false
}
},
"additionalProperties": false
@@ -16149,8 +16189,17 @@
},
"workspace": {
"type": "object",
- "properties": {},
- "additionalProperties": {}
+ "properties": {
+ "mode": {
+ "type": "string",
+ "enum": ["pooled", "temp", "static"]
+ },
+ "path": {
+ "type": "string",
+ "minLength": 1
+ }
+ },
+ "additionalProperties": false
},
"setup": {
"not": {}
@@ -18581,8 +18630,17 @@
},
"workspace": {
"type": "object",
- "properties": {},
- "additionalProperties": {}
+ "properties": {
+ "mode": {
+ "type": "string",
+ "enum": ["pooled", "temp", "static"]
+ },
+ "path": {
+ "type": "string",
+ "minLength": 1
+ }
+ },
+ "additionalProperties": false
},
"setup": {
"not": {}
@@ -19774,7 +19832,7 @@
},
"isolation": {
"type": "string",
- "enum": ["shared", "per_test"]
+ "enum": ["shared", "per_case"]
},
"repos": {
"type": "array",
@@ -20026,6 +20084,26 @@
},
"required": ["image"],
"additionalProperties": false
+ },
+ "env": {
+ "type": "object",
+ "properties": {
+ "required_commands": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1
+ }
+ },
+ "required_python_modules": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1
+ }
+ }
+ },
+ "additionalProperties": false
}
},
"additionalProperties": false