diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx index d85077d87..8bb05eee3 100644 --- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx +++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx @@ -63,9 +63,14 @@ tests: ``` `type: suite` preserves the imported suite's task contract: metadata, -`workspace`, shared `input`, shared `assertions`, and tests. The child suite's -`experiment:` or legacy `execution:` runtime block is ignored; the parent eval's -runtime block controls the run. +`workspace`, shared `input`, shared `assertions`, and tests. The parent eval +still owns the single run bundle. Runtime defaults from an imported suite apply +only where they can be scoped to that suite's tests: `threshold`, `repeat` or +`runs`, `timeout_seconds`, and `budget_usd`. If the parent eval supplies one of +those defaults, the parent value wins for imported tests. Fields that cannot be +scoped inside one parent run, such as `target`, `targets`, `workers`, +`workspace`, `agent`, `model`, `agent_options`, and `sandbox`, must be supplied +by the parent experiment when importing the suite. `type: tests` imports only raw test entries. It intentionally drops shared context from an imported eval suite, so parent suite fields apply to those raw @@ -89,11 +94,9 @@ Suite imports are resolved as a deterministic include graph. Circular `type: suite` imports fail validation with the import chain; raw-case shorthand does not recursively load suite runtime blocks. -Imported suite artifacts are nested under the source suite name inside a wrapper -eval result directory, for example -`.agentv/results/////...`. -Direct tests owned by the wrapper eval and raw case imports live directly under -`/...`. +Imported suite rows keep their source suite metadata in `index.jsonl`. Use each +row's `result_dir` as the authoritative path to generated artifacts inside the +run directory; do not infer layout from suite names. ## Scoped Run Overrides @@ -101,7 +104,7 @@ Use scoped `run:` blocks for result interpretation and scheduling policies that vary by include group or test case. Precedence is: ```text -test.run > tests[].run > experiment +test.run > tests[].run > parent experiment > imported suite experiment defaults ``` ```yaml diff --git a/docs/adr/0006-separate-experiments-from-eval-definitions.md b/docs/adr/0006-separate-experiments-from-eval-definitions.md index 752495b71..6f5bfdd91 100644 --- a/docs/adr/0006-separate-experiments-from-eval-definitions.md +++ b/docs/adr/0006-separate-experiments-from-eval-definitions.md @@ -183,9 +183,14 @@ Imported tests run in deterministic order: resolved path first, then the test order inside each resolved source. `type: suite` preserves the imported suite task contract. That includes suite -metadata, `workspace`, shared `input`, shared `assertions`, and tests. The child -suite's `experiment:` block, or legacy `execution:` block, is ignored and -replaced by the parent eval's `experiment:` block. +metadata, `workspace`, shared `input`, shared `assertions`, and tests. The +parent eval still owns one run bundle. Child suite `experiment:` defaults apply +to imported tests only when the field can be scoped per test: +`threshold`, `repeat` or `runs`, `timeout_seconds`, and `budget_usd`. +Where the parent eval supplies one of those defaults, the parent value wins. +Fields that cannot vary per imported suite inside one parent run, such as +`target`, `targets`, `workers`, `workspace`, `agent`, `model`, `agent_options`, +and `sandbox`, must be supplied by the parent experiment for imported suites. `type: tests` imports only raw test entries. It intentionally drops shared suite context such as workspace, shared input, and shared assertions. Use this @@ -230,7 +235,7 @@ policy without creating separate experiment files. Runtime override precedence is: ```text -test.run > tests[].run > experiment +test.run > tests[].run > parent experiment > imported suite experiment defaults ``` Group-level overrides live beside `include`, `type`, and `select`: @@ -295,14 +300,16 @@ When a wrapper eval imports it with `type: suite`, AgentV must preserve its shared `workspace`, `input`, and `assertions` because those fields are part of the task contract. Its `execution` block is the legacy spelling for child runtime configuration. Under this decision, the child runtime block is treated -as child `experiment`/legacy `execution` and ignored in favor of the parent -wrapper eval's `experiment:`. +as child `experiment`/legacy `execution`: scoped defaults such as threshold, +repeat policy, timeout, and budget can follow the imported tests, while +candidate-changing fields must be supplied by the parent wrapper eval's +`experiment:`. This is the motivating distinction: - task context from imported suites is preserved; -- child runtime policy from imported suites is replaced by the parent runtime - policy; +- child runtime policy from imported suites contributes scoped defaults only + where a parent runtime policy does not override them; - raw-case imports do not inherit suite context. ## Result Layout diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index ea543edca..c795b9ec9 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -104,6 +104,8 @@ type LoadOptions = { readonly category?: string; /** Internal DFS stack for detecting circular `type: suite` imports. */ readonly suiteImportStack?: readonly SuiteImportStackEntry[]; + /** Internal runtime defaults supplied by an eval that imports this suite. */ + readonly importParentExperimentConfig?: ExperimentConfig; }; type SuiteImportStackEntry = { @@ -469,6 +471,11 @@ async function loadTestsFromParsedYamlValue( suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName; const rawTestCases = resolveTests(suite); + const suiteExperimentConfig = normalizeSuiteExperimentConfig(suite); + const importContextExperimentConfig = mergeExperimentParentDefaults( + options?.importParentExperimentConfig, + suiteExperimentConfig, + ); // Top-level `metadata:` is inherited by cases. Suite identity tags are parsed // separately by parseMetadata() and are not case tags. const suiteMetadataPayload = extractSuiteMetadataPayload(suite); @@ -495,6 +502,7 @@ async function loadTestsFromParsedYamlValue( evalFileDir, repoRoot, suiteMetadataPayload, + parentExperimentConfig: importContextExperimentConfig, options, }); expandedTestCases = expanded.rawCases; @@ -891,8 +899,12 @@ function mergeRunOverrides( }; } -function applyRunOverrideToTest(test: EvalTest, includeRun: EvalRunOverride | undefined): EvalTest { - const run = mergeRunOverrides(includeRun, test.run); +function applyRunDefaultsToImportedTest( + test: EvalTest, + childExperimentRun: EvalRunOverride | undefined, + includeRun: EvalRunOverride | undefined, +): EvalTest { + const run = mergeRunOverrides(mergeRunOverrides(childExperimentRun, includeRun), test.run); if (!run) { return test; } @@ -902,6 +914,157 @@ function applyRunOverrideToTest(test: EvalTest, includeRun: EvalRunOverride | un }; } +function experimentProvidesTarget(config: ExperimentConfig | undefined): boolean { + return config?.target !== undefined || config?.targets !== undefined; +} + +function experimentProvidesRepeat(config: ExperimentConfig | undefined): boolean { + return config?.repeat !== undefined || config?.runs !== undefined; +} + +function mergeExperimentParentDefaults( + parent: ExperimentConfig | undefined, + child: ExperimentConfig | undefined, +): ExperimentConfig | undefined { + if (!parent) { + return child; + } + if (!child) { + return parent; + } + return { + ...child, + ...parent, + ...(experimentProvidesRepeat(parent) + ? { + ...(parent.repeat !== undefined && { repeat: parent.repeat }), + ...(parent.runs !== undefined && { runs: parent.runs }), + } + : { + ...(child.repeat !== undefined && { repeat: child.repeat }), + ...(child.runs !== undefined && { runs: child.runs }), + }), + }; +} + +function buildExperimentRunDefaults( + config: ExperimentConfig | undefined, +): EvalRunOverride | undefined { + if (!config) { + return undefined; + } + const repeat = config.repeat + ? { + count: config.repeat.count, + strategy: config.repeat.strategy, + ...(config.repeat.costLimitUsd !== undefined && { + costLimitUsd: config.repeat.costLimitUsd, + }), + ...(config.earlyExit !== undefined && { earlyExit: config.earlyExit }), + } + : config.runs !== undefined + ? { + count: config.runs, + strategy: 'pass_at_k' as const, + ...(config.earlyExit !== undefined && { earlyExit: config.earlyExit }), + } + : undefined; + const run = { + ...(config.threshold !== undefined && { threshold: config.threshold }), + ...(repeat !== undefined && { repeat }), + ...(config.timeoutSeconds !== undefined && { timeoutSeconds: config.timeoutSeconds }), + ...(config.budgetUsd !== undefined && { budgetUsd: config.budgetUsd }), + } satisfies EvalRunOverride; + return Object.keys(run).length > 0 ? run : undefined; +} + +function buildImportedExperimentRunDefaults( + child: ExperimentConfig | undefined, + parent: ExperimentConfig | undefined, +): EvalRunOverride | undefined { + const childRun = buildExperimentRunDefaults(child); + if (!childRun) { + return undefined; + } + const run = { + ...(parent?.threshold === undefined && + childRun.threshold !== undefined && { threshold: childRun.threshold }), + ...(!experimentProvidesRepeat(parent) && childRun.repeat !== undefined + ? { repeat: childRun.repeat } + : {}), + ...(parent?.timeoutSeconds === undefined && + childRun.timeoutSeconds !== undefined && { timeoutSeconds: childRun.timeoutSeconds }), + ...(parent?.budgetUsd === undefined && + childRun.budgetUsd !== undefined && { budgetUsd: childRun.budgetUsd }), + } satisfies EvalRunOverride; + return Object.keys(run).length > 0 ? run : undefined; +} + +type ImportedExperimentFieldRule = { + readonly field: string; + readonly childHasField: (config: ExperimentConfig) => boolean; + readonly parentHasOverride: (config: ExperimentConfig | undefined) => boolean; +}; + +const UNSCOPED_IMPORTED_EXPERIMENT_FIELDS: readonly ImportedExperimentFieldRule[] = [ + { + field: 'target', + childHasField: (config) => experimentProvidesTarget(config), + parentHasOverride: experimentProvidesTarget, + }, + { + field: 'agent', + childHasField: (config) => config.agent !== undefined, + parentHasOverride: (config) => config?.agent !== undefined, + }, + { + field: 'model', + childHasField: (config) => config.model !== undefined, + parentHasOverride: (config) => config?.model !== undefined, + }, + { + field: 'agent_options', + childHasField: (config) => config.agentOptions !== undefined, + parentHasOverride: (config) => config?.agentOptions !== undefined, + }, + { + field: 'workers', + childHasField: (config) => config.workers !== undefined, + parentHasOverride: (config) => config?.workers !== undefined, + }, + { + field: 'sandbox', + childHasField: (config) => config.sandbox !== undefined, + parentHasOverride: (config) => config?.sandbox !== undefined, + }, + { + field: 'workspace', + childHasField: (config) => config.workspace !== undefined, + parentHasOverride: (config) => config?.workspace !== undefined, + }, +]; + +function assertImportedExperimentCanCompose( + child: ExperimentConfig | undefined, + parent: ExperimentConfig | undefined, + importPath: string, +): void { + if (!child) { + return; + } + const unsupported = UNSCOPED_IMPORTED_EXPERIMENT_FIELDS.filter( + (rule) => rule.childHasField(child) && !rule.parentHasOverride(parent), + ).map((rule) => `experiment.${rule.field}`); + if (unsupported.length === 0) { + return; + } + throw new Error( + `Imported eval suite '${displayEvalImportPath(importPath)}' defines ${unsupported.join( + ', ', + )}, which cannot be scoped per imported suite. Set these fields in the parent experiment when importing this suite.`, + ); +} + function markSuiteImportedTest(test: EvalTest): EvalTest { return { ...test, @@ -1150,6 +1313,7 @@ async function expandInlineTestEntries(params: { readonly evalFileDir: string; readonly repoRoot: URL | string; readonly suiteMetadataPayload?: Record; + readonly parentExperimentConfig?: ExperimentConfig; readonly options?: LoadOptions; }): Promise { const withFileReferences = await expandFileReferences(params.entries, params.evalFileDir); @@ -1178,7 +1342,17 @@ async function expandInlineTestEntries(params: { const suite = await loadTestSuite(resolvedPath, params.repoRoot, { ...params.options, filter: select?.testIds, + importParentExperimentConfig: params.parentExperimentConfig, }); + assertImportedExperimentCanCompose( + suite.experimentConfig, + params.parentExperimentConfig, + resolvedPath, + ); + const childExperimentRun = buildImportedExperimentRunDefaults( + suite.experimentConfig, + params.parentExperimentConfig, + ); const selectedTests = params.options?.filter ? suite.tests.filter((test) => matchesFilter(test.id, params.options?.filter ?? '')) : suite.tests; @@ -1186,7 +1360,7 @@ async function expandInlineTestEntries(params: { ...selectedTests .filter((test) => evalTestMatchesSelect(test, select)) .map(markSuiteImportedTest) - .map((test) => applyRunOverrideToTest(test, includeRun)), + .map((test) => applyRunDefaultsToImportedTest(test, childExperimentRun, includeRun)), ); } else { const importedCases = await loadRawCasesForInclude(resolvedPath); diff --git a/packages/core/test/evaluation/eval-inline-experiment.test.ts b/packages/core/test/evaluation/eval-inline-experiment.test.ts index 08234ba71..646f0ee33 100644 --- a/packages/core/test/evaluation/eval-inline-experiment.test.ts +++ b/packages/core/test/evaluation/eval-inline-experiment.test.ts @@ -382,13 +382,19 @@ describe('eval.yaml inline experiment and tests imports', () => { expect(identitySuite.tests[0]?.metadata?.tags).toEqual(['suite-identity']); }); - it('type: suite preserves child suite context and ignores child runtime config', async () => { + it('type: suite preserves child suite context and lets parent experiment override child defaults', async () => { await writeFile( path.join(tempDir, 'child.eval.yaml'), [ 'name: child-suite', 'experiment:', ' target: child-target', + ' workers: 1', + ' threshold: 0.2', + ' repeat:', + ' count: 5', + ' timeout_seconds: 10', + ' budget_usd: 0.5', 'workspace:', ' path: ./child-workspace', 'input: child shared input', @@ -409,6 +415,13 @@ describe('eval.yaml inline experiment and tests imports', () => { 'name: parent-suite', 'experiment:', ' target: parent-target', + ' workers: 2', + ' threshold: 0.8', + ' repeat:', + ' count: 3', + ' strategy: pass_at_k', + ' timeout_seconds: 30', + ' budget_usd: 1.5', 'workspace:', ' path: ./parent-workspace', 'input: parent shared input', @@ -426,6 +439,9 @@ describe('eval.yaml inline experiment and tests imports', () => { const test = suite.tests[0]; expect(suite.experimentConfig?.target).toBe('parent-target'); + expect(suite.experimentConfig?.threshold).toBe(0.8); + expect(suite.experimentConfig?.repeat).toMatchObject({ count: 3, strategy: 'pass_at_k' }); + expect(test.run).toBeUndefined(); expect(test.suite).toBe('child-suite'); expect(test.workspace?.path).toBe('./child-workspace'); expect(test.input.map((message) => message.content)).toEqual([ @@ -436,7 +452,7 @@ describe('eval.yaml inline experiment and tests imports', () => { expect(test.assertions?.[0]).toMatchObject({ value: 'child' }); }); - it('applies scoped run overrides with test.run taking precedence over tests[].run', async () => { + it('applies imported child experiment defaults when parent has no experiment', async () => { await writeFile( path.join(tempDir, 'child.eval.yaml'), [ @@ -445,6 +461,91 @@ describe('eval.yaml inline experiment and tests imports', () => { ' threshold: 0.2', ' repeat:', ' count: 5', + ' strategy: mean', + ' timeout_seconds: 10', + ' budget_usd: 0.5', + 'tests:', + ' - id: child-default', + ' input: default', + ' criteria: ok', + '', + ].join('\n'), + ); + const parentPath = path.join(tempDir, 'parent.eval.yaml'); + await writeFile( + parentPath, + ['name: parent-suite', 'tests:', ' - include: child.eval.yaml', ' type: suite', ''].join( + '\n', + ), + ); + + const suite = await loadTestSuite(parentPath, tempDir); + + expect(suite.experimentConfig).toBeUndefined(); + expect(suite.tests[0]?.run).toMatchObject({ + threshold: 0.2, + repeat: { count: 5, strategy: 'mean' }, + timeoutSeconds: 10, + budgetUsd: 0.5, + }); + }); + + it('applies include-level run overrides over imported child experiment defaults', async () => { + await writeFile( + path.join(tempDir, 'child.eval.yaml'), + [ + 'name: child-suite', + 'experiment:', + ' threshold: 0.2', + ' repeat:', + ' count: 5', + ' strategy: mean', + ' timeout_seconds: 10', + ' budget_usd: 0.5', + 'tests:', + ' - id: child-default', + ' input: default', + ' criteria: ok', + '', + ].join('\n'), + ); + const parentPath = path.join(tempDir, 'parent.eval.yaml'); + await writeFile( + parentPath, + [ + 'name: parent-suite', + 'tests:', + ' - include: child.eval.yaml', + ' type: suite', + ' run:', + ' threshold: 0.9', + ' timeout_seconds: 30', + '', + ].join('\n'), + ); + + const suite = await loadTestSuite(parentPath, tempDir); + + expect(suite.tests[0]?.run).toMatchObject({ + threshold: 0.9, + repeat: { count: 5, strategy: 'mean' }, + timeoutSeconds: 30, + budgetUsd: 0.5, + }); + }); + + it('applies test.run over include-level and imported child experiment defaults', async () => { + await writeFile( + path.join(tempDir, 'child.eval.yaml'), + [ + 'name: child-suite', + 'experiment:', + ' threshold: 0.2', + ' repeat:', + ' count: 5', + ' strategy: mean', + ' timeout_seconds: 10', + ' budget_usd: 0.5', 'tests:', ' - id: child-default', ' input: default', @@ -464,11 +565,6 @@ describe('eval.yaml inline experiment and tests imports', () => { parentPath, [ 'name: parent-suite', - 'experiment:', - ' threshold: 0.8', - ' repeat:', - ' count: 3', - ' strategy: pass_at_k', 'tests:', ' - include: child.eval.yaml', ' type: suite', @@ -486,8 +582,7 @@ describe('eval.yaml inline experiment and tests imports', () => { const suite = await loadTestSuite(parentPath, tempDir); const byId = new Map(suite.tests.map((test) => [test.id, test])); - expect(suite.experimentConfig?.threshold).toBe(0.8); - expect(suite.experimentConfig?.repeat).toMatchObject({ count: 3, strategy: 'pass_at_k' }); + expect(suite.experimentConfig).toBeUndefined(); expect(byId.get('child-default')?.run).toMatchObject({ threshold: 0.9, repeat: { count: 2, strategy: 'pass_all' }, @@ -503,6 +598,50 @@ describe('eval.yaml inline experiment and tests imports', () => { expect(byId.get('child-critical')?.threshold).toBe(1.0); }); + it('rejects imported child experiment fields that cannot be scoped without a parent override', async () => { + await writeFile( + path.join(tempDir, 'child-a.eval.yaml'), + [ + 'name: child-a', + 'experiment:', + ' workers: 2', + 'tests:', + ' - id: a', + ' input: a', + ' criteria: ok', + '', + ].join('\n'), + ); + await writeFile( + path.join(tempDir, 'child-b.eval.yaml'), + [ + 'name: child-b', + 'experiment:', + ' workers: 4', + 'tests:', + ' - id: b', + ' input: b', + ' criteria: ok', + '', + ].join('\n'), + ); + const parentPath = path.join(tempDir, 'parent.eval.yaml'); + await writeFile( + parentPath, + [ + 'name: parent-suite', + 'tests:', + ' - include: child-*.eval.yaml', + ' type: suite', + '', + ].join('\n'), + ); + + await expect(loadTestSuite(parentPath, tempDir)).rejects.toThrow( + /experiment\.workers.*cannot be scoped per imported suite/, + ); + }); + it('type: tests imports only raw cases and applies parent suite context', async () => { await writeFile( path.join(tempDir, 'child.eval.yaml'),