EntityProcess · christso · Jun 27, 2026 · Jun 27, 2026
diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx
@@ -63,9 +63,14 @@ tests:
 ```
 
 `type: suite` preserves the imported suite's task contract: metadata,
-`workspace`, shared `input`, shared `assertions`, and tests. The child suite's
-`experiment:` or legacy `execution:` runtime block is ignored; the parent eval's
-runtime block controls the run.
+`workspace`, shared `input`, shared `assertions`, and tests. The parent eval
+still owns the single run bundle. Runtime defaults from an imported suite apply
+only where they can be scoped to that suite's tests: `threshold`, `repeat` or
+`runs`, `timeout_seconds`, and `budget_usd`. If the parent eval supplies one of
+those defaults, the parent value wins for imported tests. `experiment:` fields
+that cannot be scoped inside one parent run, such as `target`, `targets`,
+`workers`, `workspace`, `agent`, `model`, `agent_options`, and `sandbox`, must
+be supplied by the parent experiment when importing the suite.
 
 `type: tests` imports only raw test entries. It intentionally drops shared
 context from an imported eval suite, so parent suite fields apply to those raw
@@ -89,19 +94,17 @@ Suite imports are resolved as a deterministic include graph. Circular `type:
 suite` imports fail validation with the import chain; raw-case shorthand does
 not recursively load suite runtime blocks.
 
-Imported suite artifacts are nested under the source suite name inside a wrapper
-eval result directory, for example
-`.agentv/results/<wrapper-eval>/<timestamp>/<imported-suite>/<test-id>/...`.
-Direct tests owned by the wrapper eval and raw case imports live directly under
-`<test-id>/...`.
+Imported suite rows keep their source suite metadata in `index.jsonl`. Use each
+row's `result_dir` as the authoritative path to generated artifacts inside the
+run directory; do not infer layout from suite names.
 
 ## Scoped Run Overrides
 
 Use scoped `run:` blocks for result interpretation and scheduling policies that
 vary by include group or test case. Precedence is:
 
 ```text
-test.run > tests[].run > experiment
+test.run > tests[].run > parent experiment > imported suite experiment defaults
 ```
 
 ```yaml

diff --git a/docs/adr/0006-separate-experiments-from-eval-definitions.md b/docs/adr/0006-separate-experiments-from-eval-definitions.md
@@ -183,9 +183,14 @@ Imported tests run in deterministic order: resolved path first, then the test
 order inside each resolved source.
 
 `type: suite` preserves the imported suite task contract. That includes suite
-metadata, `workspace`, shared `input`, shared `assertions`, and tests. The child
-suite's `experiment:` block, or legacy `execution:` block, is ignored and
-replaced by the parent eval's `experiment:` block.
+metadata, `workspace`, shared `input`, shared `assertions`, and tests. The
+parent eval still owns one run bundle. Child suite `experiment:` defaults apply
+to imported tests only when the field can be scoped per test:
+`threshold`, `repeat` or `runs`, `timeout_seconds`, and `budget_usd`.
+Where the parent eval supplies one of those defaults, the parent value wins.
+`experiment:` fields that cannot vary per imported suite inside one parent run,
+such as `target`, `targets`, `workers`, `workspace`, `agent`, `model`,
+`agent_options`, and `sandbox`, must be supplied by the parent experiment for
+imported suites.
 
 `type: tests` imports only raw test entries. It intentionally drops shared
 suite context such as workspace, shared input, and shared assertions. Use this
@@ -230,7 +235,7 @@ policy without creating separate experiment files.
 Runtime override precedence is:
 
 ```text
-test.run > tests[].run > experiment
+test.run > tests[].run > parent experiment > imported suite experiment defaults
 ```
 
 Group-level overrides live beside `include`, `type`, and `select`:
@@ -295,14 +300,16 @@ When a wrapper eval imports it with `type: suite`, AgentV must preserve its
 shared `workspace`, `input`, and `assertions` because those fields are part of
 the task contract. Its `execution` block is the legacy spelling for child
 runtime configuration. Under this decision, the child runtime block is treated
-as child `experiment`/legacy `execution` and ignored in favor of the parent
-wrapper eval's `experiment:`.
+as child `experiment`/legacy `execution`: scoped defaults such as threshold,
+repeat policy, timeout, and budget can follow the imported tests, while
+candidate-changing fields must be supplied by the parent wrapper eval's
+`experiment:`.
 
 This is the motivating distinction:
 
 - task context from imported suites is preserved;
-- child runtime policy from imported suites is replaced by the parent runtime
-  policy;
+- child runtime policy from imported suites contributes scoped defaults only
+  where a parent runtime policy does not override them;
 - raw-case imports do not inherit suite context.
 
 ## Result Layout

diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
@@ -104,6 +104,8 @@ type LoadOptions = {
   readonly category?: string;
   /** Internal DFS stack for detecting circular `type: suite` imports. */
   readonly suiteImportStack?: readonly SuiteImportStackEntry[];
+  /** Internal runtime defaults supplied by an eval that imports this suite. */
+  readonly importParentExperimentConfig?: ExperimentConfig;
 };
 
 type SuiteImportStackEntry = {
@@ -469,6 +471,11 @@ async function loadTestsFromParsedYamlValue(
     suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
 
   const rawTestCases = resolveTests(suite);
+  const suiteExperimentConfig = normalizeSuiteExperimentConfig(suite);
+  const importContextExperimentConfig = mergeExperimentParentDefaults(
+    options?.importParentExperimentConfig,
+    suiteExperimentConfig,
+  );
   // Top-level `metadata:` is inherited by cases. Suite identity tags are parsed
   // separately by parseMetadata() and are not case tags.
   const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
@@ -495,6 +502,7 @@ async function loadTestsFromParsedYamlValue(
       evalFileDir,
       repoRoot,
       suiteMetadataPayload,
+      parentExperimentConfig: importContextExperimentConfig,
       options,
     });
     expandedTestCases = expanded.rawCases;
@@ -891,8 +899,12 @@ function mergeRunOverrides(
   };
 }
 
-function applyRunOverrideToTest(test: EvalTest, includeRun: EvalRunOverride | undefined): EvalTest {
-  const run = mergeRunOverrides(includeRun, test.run);
+function applyRunDefaultsToImportedTest(
+  test: EvalTest,
+  childExperimentRun: EvalRunOverride | undefined,
+  includeRun: EvalRunOverride | undefined,
+): EvalTest {
+  const run = mergeRunOverrides(mergeRunOverrides(childExperimentRun, includeRun), test.run);
   if (!run) {
     return test;
   }
@@ -902,6 +914,157 @@ function applyRunOverrideToTest(test: EvalTest, includeRun: EvalRunOverride | un
   };
 }
 
+/**
+ * Reports whether an experiment config selects a candidate.
+ *
+ * @param config - Experiment config to inspect; may be absent.
+ * @returns `true` when either `target` or `targets` is defined on `config`.
+ */
+function experimentProvidesTarget(config: ExperimentConfig | undefined): boolean {
+  if (config == null) {
+    return false;
+  }
+  return !(config.target === undefined && config.targets === undefined);
+}
+
+/**
+ * Reports whether an experiment config declares a repeat policy.
+ *
+ * @param config - Experiment config to inspect; may be absent.
+ * @returns `true` when either `repeat` or `runs` is defined on `config`.
+ */
+function experimentProvidesRepeat(config: ExperimentConfig | undefined): boolean {
+  if (config == null) {
+    return false;
+  }
+  return !(config.repeat === undefined && config.runs === undefined);
+}
+
+/**
+ * Combines the experiment config of an importing (parent) eval with the config
+ * declared by the imported (child) suite.
+ *
+ * Parent fields take precedence over child fields. `repeat` and `runs` are
+ * alternative ways to express a repeat policy, so they are resolved as a pair:
+ * when the parent defines either one, both child values are dropped. Otherwise
+ * a child `repeat` would survive next to a parent `runs`, and consumers that
+ * read `repeat` before `runs` (as `buildExperimentRunDefaults` does) would
+ * apply the child's policy instead of the parent's.
+ *
+ * @param parent - Config supplied by the importing eval, if any.
+ * @param child - Config declared by the imported suite, if any.
+ * @returns `child` when there is no parent, `parent` when there is no child,
+ *   otherwise a new merged config; neither input is mutated.
+ */
+function mergeExperimentParentDefaults(
+  parent: ExperimentConfig | undefined,
+  child: ExperimentConfig | undefined,
+): ExperimentConfig | undefined {
+  if (!parent) {
+    return child;
+  }
+  if (!child) {
+    return parent;
+  }
+  const { repeat: childRepeat, runs: childRuns, ...childRest } = child;
+  const { repeat: parentRepeat, runs: parentRuns, ...parentRest } = parent;
+  // The repeat policy comes wholly from one side so the two spellings never mix.
+  const parentOwnsRepeat = experimentProvidesRepeat(parent);
+  const repeat = parentOwnsRepeat ? parentRepeat : childRepeat;
+  const runs = parentOwnsRepeat ? parentRuns : childRuns;
+  return {
+    ...childRest,
+    ...parentRest,
+    ...(repeat !== undefined && { repeat }),
+    ...(runs !== undefined && { runs }),
+  };
+}
+
+/**
+ * Projects the per-test-scopable parts of an experiment config onto a run
+ * override.
+ *
+ * The repeat policy is taken from `repeat` when it is set; otherwise a defined
+ * `runs` count becomes a `pass_at_k` repeat. `earlyExit`, when defined, is
+ * attached to whichever repeat policy is produced.
+ *
+ * @param config - Experiment config to read defaults from; may be absent.
+ * @returns A run override holding the defined `threshold`, `repeat`,
+ *   `timeoutSeconds` and `budgetUsd` values, or `undefined` when `config` is
+ *   absent or none of them is defined.
+ */
+function buildExperimentRunDefaults(
+  config: ExperimentConfig | undefined,
+): EvalRunOverride | undefined {
+  if (!config) {
+    return undefined;
+  }
+  const { repeat: repeatPolicy, runs, earlyExit, threshold, timeoutSeconds, budgetUsd } = config;
+  const earlyExitPart = earlyExit !== undefined && { earlyExit };
+
+  // `repeat` is consulted first; `runs` is only a fallback.
+  const resolveRepeat = () => {
+    if (repeatPolicy) {
+      return {
+        count: repeatPolicy.count,
+        strategy: repeatPolicy.strategy,
+        ...(repeatPolicy.costLimitUsd !== undefined && {
+          costLimitUsd: repeatPolicy.costLimitUsd,
+        }),
+        ...earlyExitPart,
+      };
+    }
+    if (runs !== undefined) {
+      return {
+        count: runs,
+        strategy: 'pass_at_k' as const,
+        ...earlyExitPart,
+      };
+    }
+    return undefined;
+  };
+  const repeat = resolveRepeat();
+
+  const defaults = {
+    ...(threshold !== undefined && { threshold }),
+    ...(repeat !== undefined && { repeat }),
+    ...(timeoutSeconds !== undefined && { timeoutSeconds }),
+    ...(budgetUsd !== undefined && { budgetUsd }),
+  } satisfies EvalRunOverride;
+  return Object.keys(defaults).length === 0 ? undefined : defaults;
+}
+
+/**
+ * Selects the run defaults an imported suite may contribute to its own tests.
+ *
+ * A child default is kept only when the parent experiment leaves that setting
+ * undefined; for the repeat policy, a parent `repeat` or `runs` suppresses the
+ * child's value.
+ *
+ * @param child - Experiment config declared by the imported suite.
+ * @param parent - Experiment config of the importing eval.
+ * @returns The surviving child defaults, or `undefined` when the child has no
+ *   run defaults or the parent overrides all of them.
+ */
+function buildImportedExperimentRunDefaults(
+  child: ExperimentConfig | undefined,
+  parent: ExperimentConfig | undefined,
+): EvalRunOverride | undefined {
+  const childDefaults = buildExperimentRunDefaults(child);
+  if (!childDefaults) {
+    return undefined;
+  }
+  const { threshold, repeat, timeoutSeconds, budgetUsd } = childDefaults;
+  const parentSetsThreshold = parent?.threshold !== undefined;
+  const parentSetsRepeat = experimentProvidesRepeat(parent);
+  const parentSetsTimeout = parent?.timeoutSeconds !== undefined;
+  const parentSetsBudget = parent?.budgetUsd !== undefined;
+
+  const inherited = {
+    ...(!parentSetsThreshold && threshold !== undefined ? { threshold } : {}),
+    ...(!parentSetsRepeat && repeat !== undefined ? { repeat } : {}),
+    ...(!parentSetsTimeout && timeoutSeconds !== undefined ? { timeoutSeconds } : {}),
+    ...(!parentSetsBudget && budgetUsd !== undefined ? { budgetUsd } : {}),
+  } satisfies EvalRunOverride;
+  return Object.keys(inherited).length === 0 ? undefined : inherited;
+}
+
+/**
+ * Describes one experiment setting that an imported suite may only declare
+ * when the importing eval declares it too.
+ */
+type ImportedExperimentFieldRule = {
+  /** Field name reported as `experiment.<field>` in the composition error. */
+  readonly field: string;
+  /** Tells whether the imported suite's experiment config sets this field. */
+  readonly childHasField: (config: ExperimentConfig) => boolean;
+  /** Tells whether the importing eval's experiment config (if any) sets this field. */
+  readonly parentHasOverride: (config: ExperimentConfig | undefined) => boolean;
+};
+
+/**
+ * Rules checked by `assertImportedExperimentCanCompose`, in reporting order.
+ *
+ * `target` covers both the `target` and `targets` keys; every other rule tests
+ * a single config key for a defined value.
+ */
+const UNSCOPED_IMPORTED_EXPERIMENT_FIELDS: readonly ImportedExperimentFieldRule[] = [
+  {
+    field: 'target',
+    childHasField: (config) => experimentProvidesTarget(config),
+    parentHasOverride: experimentProvidesTarget,
+  },
+  // Each pair is [reported field name, ExperimentConfig key].
+  ...(
+    [
+      ['agent', 'agent'],
+      ['model', 'model'],
+      ['agent_options', 'agentOptions'],
+      ['workers', 'workers'],
+      ['sandbox', 'sandbox'],
+      ['workspace', 'workspace'],
+    ] as const
+  ).map(([field, key]): ImportedExperimentFieldRule => {
+    const isSet = (config: ExperimentConfig | undefined): boolean => config?.[key] !== undefined;
+    return { field, childHasField: isSet, parentHasOverride: isSet };
+  }),
+];
+
+/**
+ * Rejects an imported suite whose experiment config sets a field listed in
+ * `UNSCOPED_IMPORTED_EXPERIMENT_FIELDS` that the parent experiment does not set.
+ *
+ * @param child - Experiment config of the imported suite; nothing is checked
+ *   when it is absent.
+ * @param parent - Experiment config of the importing eval.
+ * @param importPath - Path of the imported suite, used in the error message.
+ * @throws Error naming every offending `experiment.<field>`.
+ */
+function assertImportedExperimentCanCompose(
+  child: ExperimentConfig | undefined,
+  parent: ExperimentConfig | undefined,
+  importPath: string,
+): void {
+  if (!child) {
+    return;
+  }
+  const offending: string[] = [];
+  for (const rule of UNSCOPED_IMPORTED_EXPERIMENT_FIELDS) {
+    if (!rule.childHasField(child) || rule.parentHasOverride(parent)) {
+      continue;
+    }
+    offending.push(`experiment.${rule.field}`);
+  }
+  if (offending.length > 0) {
+    const suiteLabel = displayEvalImportPath(importPath);
+    const fieldList = offending.join(', ');
+    throw new Error(
+      `Imported eval suite '${suiteLabel}' defines ${fieldList}, which cannot be scoped per imported suite. Set these fields in the parent experiment when importing this suite.`,
+    );
+  }
+}
+
 function markSuiteImportedTest(test: EvalTest): EvalTest {
   return {
     ...test,
@@ -1150,6 +1313,7 @@ async function expandInlineTestEntries(params: {
   readonly evalFileDir: string;
   readonly repoRoot: URL | string;
   readonly suiteMetadataPayload?: Record<string, unknown>;
+  readonly parentExperimentConfig?: ExperimentConfig;
   readonly options?: LoadOptions;
 }): Promise<ExpandedInlineTestEntries> {
   const withFileReferences = await expandFileReferences(params.entries, params.evalFileDir);
@@ -1178,15 +1342,25 @@ async function expandInlineTestEntries(params: {
         const suite = await loadTestSuite(resolvedPath, params.repoRoot, {
           ...params.options,
           filter: select?.testIds,
+          importParentExperimentConfig: params.parentExperimentConfig,
         });
+        assertImportedExperimentCanCompose(
+          suite.experimentConfig,
+          params.parentExperimentConfig,
+          resolvedPath,
+        );
+        const childExperimentRun = buildImportedExperimentRunDefaults(
+          suite.experimentConfig,
+          params.parentExperimentConfig,
+        );
         const selectedTests = params.options?.filter
           ? suite.tests.filter((test) => matchesFilter(test.id, params.options?.filter ?? ''))
           : suite.tests;
         importedSuiteTests.push(
           ...selectedTests
             .filter((test) => evalTestMatchesSelect(test, select))
             .map(markSuiteImportedTest)
-            .map((test) => applyRunOverrideToTest(test, includeRun)),
+            .map((test) => applyRunDefaultsToImportedTest(test, childExperimentRun, includeRun)),
         );
       } else {
         const importedCases = await loadRawCasesForInclude(resolvedPath);