EntityProcess · christso · Jun 28, 2026 · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/CONCEPTS.md b/CONCEPTS.md
@@ -12,9 +12,17 @@ Shared domain vocabulary for this project — entities, named processes, and sta
 
 ## Evaluation Model
 
-**Eval** — The frozen task and grading definition: prompts, datasets, input files, fixtures, assertions, and judge criteria. An eval defines what is being tested, not which agent, model, setup variant, or run policy executes it.
+**Eval / Eval YAML** — The only composable and runnable AgentV authoring primitive. An eval YAML file can be a reusable task suite that owns task context, a wrapper eval that imports suites and carries an inline `experiment:` block, or a sidecar around raw JSONL cases. AgentV does not have a separate runnable `experiment.yaml` artifact.
 
-**Experiment** — A committed run variant that selects how evals are executed: target or target matrix, setup, scripts, eval filters, repeat counts, timeouts, workers, budgets, and related run knobs. Experiments make A/B setup differences explicit while pointing at stable eval tasks.
+**Task suite** — Eval YAML that owns what is being tested: prompts, datasets, input files, fixtures, `workspace`, assertions, expected references, and judge criteria. It can run directly or be imported by another eval with `tests[].include` and `type: suite`.
+
+**Raw case file** — YAML, JSONL, or directory case data imported with `tests: ./cases.yaml`, string shorthand, or `type: tests`. Raw cases are reusable data inputs; they do not carry imported suite context such as shared `workspace`, shared `input`, or shared `assertions`.
+
+**Wrapper eval** — Eval YAML whose main job is to import task suites and bind runtime policy with an inline `experiment:` block. Wrapper evals may live under an `experiments/` directory, but that path is an optional user-owned convention and AgentV does not infer behavior from it. A wrapper that imports suites with `type: suite` does not define parent workspace fields such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`; imported suites own the task environment.
+
+**Experiment** — The run-policy namespace for how evals are executed: target or target matrix, eval filters, repeat counts, timeouts, workers, budgets, thresholds, and related run knobs. In authored files it lives as inline `experiment:` inside eval YAML; CLI `--experiment` and `experiment.name` choose the result bucket. Lifecycle setup belongs in `workspace.hooks` or `targets[].hooks`, not in a separate experiment artifact.
+
+**Workspace** — The task environment an eval prepares for the agent: repositories, templates, fixture files, and lifecycle hooks. It is not prompt input; use `input` for instructions and `workspace.repos[]` for multi-repo workspaces the agent can inspect or modify through tools.
 
 **Run manifest** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `task_dir`, `summary_path`, and `grading_path`.
 

diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -12,6 +12,7 @@ import {
   RESULT_INDEX_FILENAME,
   RUN_SUMMARY_FILENAME,
   type ResultIndexArtifact,
+  type RunRuntimeSourceMetadata,
   type RunSummaryArtifact,
   type TimingArtifact,
   aggregateRunDir,
@@ -215,6 +216,7 @@ export async function writePerTestArtifacts(
     repoRoot?: string;
     sourceTests?: readonly EvalTest[];
     taskBundleTargets?: readonly TaskBundleTargetSelection[];
+    runtimeSource?: RunRuntimeSourceMetadata;
   },
 ): Promise<void> {
   await writeCorePerTestArtifacts(results, outputDir, {
@@ -224,6 +226,7 @@ export async function writePerTestArtifacts(
     duplicatePolicy: options?.duplicatePolicy,
     sourceTests: options?.sourceTests,
     additionalArtifacts: createTaskBundleArtifactsWriter(options),
+    runtimeSource: options?.runtimeSource,
   });
 }
 
@@ -242,6 +245,7 @@ export async function writeArtifactsFromResults(
     repoRoot?: string;
     sourceTests?: readonly EvalTest[];
     taskBundleTargets?: readonly TaskBundleTargetSelection[];
+    runtimeSource?: RunRuntimeSourceMetadata;
   },
 ): Promise<{
   testArtifactDir: string;
@@ -258,5 +262,6 @@ export async function writeArtifactsFromResults(
     resultGroup: options?.resultGroup,
     sourceTests: options?.sourceTests,
     additionalArtifacts: createTaskBundleArtifactsWriter(options),
+    runtimeSource: options?.runtimeSource,
   });
 }
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -19,6 +19,7 @@ import {
   type ResolvedTarget,
   ResponseCache,
   RunBudgetTracker,
+  type RunRuntimeSourceMetadata,
   type TrialsConfig,
   buildExperimentArtifactMetadata,
   buildTraceFromMessages,
@@ -589,6 +590,163 @@ function deriveEvalResultGroupName(evalFilePath: string | undefined): string {
   );
 }
 
+const CLI_RUNTIME_SOURCE_OPTION_KEYS = [ // Raw CLI option keys that hasCliRuntimeSource inspects to decide whether runtime configuration was supplied via flags.
+  'target',
+  'targets',
+  'filter',
+  'tag',
+  'excludeTag',
+  'workers',
+  'dryRun',
+  'dryRunDelay',
+  'dryRunDelayMin',
+  'dryRunDelayMax',
+  'agentTimeout',
+  'maxRetries',
+  'cache',
+  'cachePath',
+  'noCache',
+  'graderTarget',
+  'model',
+  'threshold',
+  'budgetUsd',
+  'transcript',
+  'recordReplay',
+  'recordReplayVariant',
+  'workspacePath',
+  'workspaceMode',
+] as const; // `as const` keeps the entries as a readonly tuple of string literals.
+
+function hasCliRuntimeSource(rawOptions: Record<string, unknown>): boolean { // True when at least one key in CLI_RUNTIME_SOURCE_OPTION_KEYS holds a meaningful value in rawOptions.
+  return CLI_RUNTIME_SOURCE_OPTION_KEYS.some((key) => {
+    const value = rawOptions[key];
+    if (Array.isArray(value)) {
+      return value.some((entry) => typeof entry === 'string' && entry.trim().length > 0); // Arrays count only if they contain a non-blank string.
+    }
+    if (typeof value === 'string') {
+      return value.trim().length > 0 && value.trim() !== 'default'; // Blank strings and the literal 'default' are treated as "not set".
+    }
+    if (typeof value === 'number') {
+      return Number.isFinite(value) && value !== 0; // NOTE(review): an explicit 0 (e.g. for maxRetries or threshold) is treated as unset — confirm this is intended.
+    }
+    return value === true; // Any other value counts only when it is exactly boolean true.
+  });
+}
+
+/**
+ * Convert an eval file path into a display path for runtime-source metadata.
+ *
+ * Returns `undefined` when `filePath` is missing or blank. A path located
+ * inside `cwd` is reported relative to `cwd`; any other path (outside `cwd`,
+ * or equal to `cwd` itself) is reported as given, after trimming. Platform
+ * path separators are normalized to `/` in the result.
+ */
+function toRuntimeSourcePath(cwd: string, filePath: string | undefined): string | undefined {
+  const trimmed = filePath?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  const resolved = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed);
+  const relative = path.relative(cwd, resolved);
+  // Only a leading `..` path segment means the file is outside `cwd`. A bare
+  // prefix test would also reject in-tree names such as `..fixtures/a.yaml`.
+  const escapesCwd = relative === '..' || relative.startsWith(`..${path.sep}`);
+  const displayPath =
+    relative && !escapesCwd && !path.isAbsolute(relative) ? relative : trimmed;
+  return displayPath.split(path.sep).join('/');
+}
+
+function uniqueRuntimeSourcePaths(values: Iterable<string | undefined>): readonly string[] { // Drops undefined/empty entries, removes duplicates, and returns the remaining paths sorted.
+  return [...new Set([...values].filter((value): value is string => Boolean(value)))].sort(); // Sorting a freshly built array, so the caller's input is not mutated.
+}
+
+function testSourceEvalPath(cwd: string, test: EvalTest): string | undefined { // Display path of the eval file a test came from, or undefined when no source path is recorded.
+  return (
+    toRuntimeSourcePath(cwd, test.source?.evalFileRepoPath) ?? // Preference order: repo path first,
+    toRuntimeSourcePath(cwd, test.source?.evalFileAbsolutePath) ?? // then the absolute path,
+    toRuntimeSourcePath(cwd, test.source?.evalFilePath) // then the plain eval file path.
+  );
+}
+
+function testSourceEvalPathForComparison(test: EvalTest): string | undefined { // Absolute form of the test's source eval file, used for equality checks against other resolved paths.
+  const sourcePath = test.source?.evalFileAbsolutePath ?? test.source?.evalFilePath; // Prefer the absolute path when the test records one.
+  return sourcePath ? path.resolve(sourcePath) : undefined; // Single-argument path.resolve resolves relative paths against the process working directory.
+}
+
+function buildRuntimeConfigSource(params: { // Classifies where the run's runtime configuration came from: 'mixed', 'cli_flags', 'inline_experiment', or 'defaults'.
+  readonly activeTestFiles: readonly string[];
+  readonly fileMetadata: ReadonlyMap<string, { readonly options: NormalizedOptions }>;
+  readonly hasCliRuntimeConfig: boolean;
+}): RunRuntimeSourceMetadata['config_source'] {
+  const inlineFingerprints = new Set<string>(); // Distinct inline experiment identities seen across the active files.
+  let hasInlineExperiment = false;
+  let hasDefaultRuntime = false;
+
+  for (const activeTestFile of params.activeTestFiles) {
+    const experimentMetadata = params.fileMetadata.get(activeTestFile)?.options.experimentMetadata;
+    if (experimentMetadata) {
+      hasInlineExperiment = true;
+      inlineFingerprints.add(experimentMetadata.fingerprint ?? activeTestFile); // Falls back to the file path as the identity when no fingerprint is present.
+    } else {
+      hasDefaultRuntime = true; // A file without experimentMetadata counts as running with default runtime settings.
+    }
+  }
+
+  if ( // 'mixed' when an inline experiment is combined with CLI config or with files lacking one, or when inline experiments differ between files.
+    (hasInlineExperiment && params.hasCliRuntimeConfig) ||
+    (hasInlineExperiment && hasDefaultRuntime) ||
+    inlineFingerprints.size > 1
+  ) {
+    return 'mixed';
+  }
+  if (params.hasCliRuntimeConfig) { // Reached only when no active file carries an inline experiment.
+    return 'cli_flags';
+  }
+  if (hasInlineExperiment) { // Every active file shares one inline experiment identity and no CLI config was given.
+    return 'inline_experiment';
+  }
+  return 'defaults'; // Neither CLI config nor an inline experiment; this is also the result for an empty file list.
+}
+
+function buildRuntimeSourceMetadata(params: { // Assembles the RunRuntimeSourceMetadata record for a run from the active eval files and the tests loaded from them.
+  readonly cwd: string;
+  readonly activeTestFiles: readonly string[];
+  readonly sourceTests: readonly EvalTest[];
+  readonly fileMetadata: ReadonlyMap<string, { readonly options: NormalizedOptions }>;
+  readonly experimentNamespace: string;
+  readonly experimentNamespaceSource: RunRuntimeSourceMetadata['experiment_namespace_source'];
+  readonly hasCliRuntimeConfig: boolean;
+}): RunRuntimeSourceMetadata {
+  const evalFiles = uniqueRuntimeSourcePaths( // Display paths of the active eval files, de-duplicated and sorted.
+    params.activeTestFiles.map((filePath) => toRuntimeSourcePath(params.cwd, filePath)),
+  );
+  const activeResolvedFiles = new Set( // Absolute paths of the active files, for comparison with each test's source file.
+    params.activeTestFiles.map((filePath) => path.resolve(filePath)),
+  );
+  const sourceEvalFiles = uniqueRuntimeSourcePaths( // Display paths of the eval files the tests originate from.
+    params.sourceTests.map((test) => testSourceEvalPath(params.cwd, test)),
+  );
+  const hasImportedSuite = params.sourceTests.some((test) => test.source?.importedSuiteName);
+  const hasNonActiveSourceFile = params.sourceTests.some((test) => { // True when some test comes from a file that is not one of the active files.
+    const sourceFile = testSourceEvalPathForComparison(test);
+    return sourceFile ? !activeResolvedFiles.has(sourceFile) : false; // Tests with no recorded source file never count as non-active.
+  });
+  const kind = // More than one active file wins; otherwise imported suites or foreign source files mark a wrapper; else a direct suite.
+    params.activeTestFiles.length > 1
+      ? 'multi_eval'
+      : hasImportedSuite || hasNonActiveSourceFile
+        ? 'wrapper_eval'
+        : 'direct_suite';
+  const wrapperEvalFile = // 'wrapper_eval' implies at most one active file, so the first entry is the wrapper.
+    kind === 'wrapper_eval'
+      ? toRuntimeSourcePath(params.cwd, params.activeTestFiles[0])
+      : undefined;
+
+  return {
+    schema_version: 'agentv.runtime_source.v1',
+    kind,
+    config_source: buildRuntimeConfigSource({
+      activeTestFiles: params.activeTestFiles,
+      fileMetadata: params.fileMetadata,
+      hasCliRuntimeConfig: params.hasCliRuntimeConfig,
+    }),
+    experiment_namespace: params.experimentNamespace,
+    experiment_namespace_source: params.experimentNamespaceSource,
+    eval_files: evalFiles,
+    ...(wrapperEvalFile && { wrapper_eval_file: wrapperEvalFile }), // Optional keys are omitted entirely rather than written as undefined
+    ...(sourceEvalFiles.length > 0 && { source_eval_files: sourceEvalFiles }), // or as an empty list.
+  };
+}
+
 type ResolvedExperimentForRun = { // NOTE(review): presumably the experiment selection resolved for a run; only an optional name is visible here — confirm against its producer.
   readonly name?: string;
 };
@@ -1453,10 +1611,19 @@ export async function runEvalCommand(
     resolvedTestFiles.length === 1
       ? (primarySuite?.metadata?.name ?? fallbackResultGroupName)
       : fallbackResultGroupName;
+  const experimentNamespaceSource: RunRuntimeSourceMetadata['experiment_namespace_source'] =
+    resolvedExperiment.name
+      ? 'cli'
+      : resolvedTestFiles.length > 1
+        ? 'multi_eval'
+        : primarySuite?.metadata?.name
+          ? 'eval_metadata'
+          : 'eval_filename';
   options = {
     ...options,
     experiment: resolvedExperiment.name ?? resultGroupName,
   };
+  const hasCliRuntimeConfig = hasCliRuntimeSource(input.rawOptions);
 
   if (!process.env.AGENTV_EXPERIMENT) {
     process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
@@ -1885,9 +2052,21 @@ export async function runEvalCommand(
 
   // Use only files that survived tag filtering.
   const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
+  const activeSourceTests = activeTestFiles.flatMap(
+    (activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],
+  );
   const singleActiveFileMetadata =
     activeTestFiles.length === 1 ? fileMetadata.get(activeTestFiles[0]) : undefined;
   const runExperimentMetadata = singleActiveFileMetadata?.options.experimentMetadata;
+  const runtimeSourceMetadata = buildRuntimeSourceMetadata({
+    cwd,
+    activeTestFiles,
+    sourceTests: activeSourceTests,
+    fileMetadata,
+    experimentNamespace: normalizeExperimentName(options.experiment),
+    experimentNamespaceSource,
+    hasCliRuntimeConfig,
+  });
   const hasPerFileRuntimeThresholds =
     options.cliThreshold === undefined &&
     activeTestFiles.some(
@@ -1932,6 +2111,7 @@ export async function runEvalCommand(
       plannedTestCount: totalEvalCount,
       experiment: normalizeExperimentName(options.experiment),
       experimentMetadata: runExperimentMetadata,
+      runtimeSource: runtimeSourceMetadata,
     });
   }
 
@@ -2194,9 +2374,7 @@ export async function runEvalCommand(
     // Write artifacts to the run directory (always, not conditional on flags)
     if (allResults.length > 0) {
       const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
-      const sourceTests = activeTestFiles.flatMap(
-        (activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],
-      );
+      const sourceTests = activeSourceTests;
       const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata);
       if (isResumeAppend) {
         // Resume mode: write per-test artifacts for newly-run tests, then aggregate
@@ -2209,11 +2387,13 @@ export async function runEvalCommand(
           repoRoot,
           sourceTests,
           taskBundleTargets,
+          runtimeSource: runtimeSourceMetadata,
         });
         const { summaryPath } = await aggregateRunDir(runDir, {
           evalFile,
           experiment: normalizeExperimentName(options.experiment),
           experimentMetadata: runExperimentMetadata,
+          runtimeSource: runtimeSourceMetadata,
         });
         const indexPath = path.join(runDir, 'index.jsonl');
         console.log(`Artifact workspace updated: ${runDir}`);
@@ -2233,6 +2413,7 @@ export async function runEvalCommand(
             repoRoot,
             sourceTests,
             taskBundleTargets,
+            runtimeSource: runtimeSourceMetadata,
           },
         );
         console.log(`Artifact workspace written to: ${runDir}`);

diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
@@ -5,6 +5,7 @@ import {
   type EvaluationResult,
   type ExternalTraceMetadataWire,
   type ResultArtifactPointersWire,
+  type RunRuntimeSourceMetadata,
   type TraceSummary,
   buildTraceFromMessages,
   fromTraceEnvelopeWire,
@@ -60,6 +61,7 @@ export interface ResultManifestRecord {
   readonly metrics_path?: string;
   readonly raw_provider_log_path?: string;
   readonly artifact_pointers?: ResultArtifactPointersWire;
+  readonly runtime_source?: RunRuntimeSourceMetadata;
   readonly external_trace?: ExternalTraceMetadataWire;
   readonly response_path?: string;
   readonly result_dir?: string;
@@ -304,6 +306,7 @@ export function loadManifestResults(
 
 export interface LightweightResultRecord {
   readonly testId: string;
+  readonly evalPath?: string;
   readonly suite?: string;
   readonly category?: string;
   readonly target?: string;
@@ -314,13 +317,15 @@ export interface LightweightResultRecord {
   readonly error?: string;
   readonly costUsd?: number;
   readonly timestamp?: string;
+  readonly runtimeSource?: RunRuntimeSourceMetadata;
 }
 
 export function loadLightweightResults(sourceFile: string): LightweightResultRecord[] {
   const resolvedSourceFile = resolveRunManifestPath(sourceFile);
   const content = readFileSync(resolvedSourceFile, 'utf8');
   return parseResultManifest(content).map((record) => ({
     testId: record.test_id ?? 'unknown',
+    evalPath: record.eval_path,
     suite: record.suite,
     category: record.category,
     target: record.target,
@@ -331,5 +336,6 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec
     error: record.error,
     costUsd: record.cost_usd,
     timestamp: record.timestamp,
+    runtimeSource: record.runtime_source,
   }));
 }