Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions CONCEPTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,17 @@ Shared domain vocabulary for this project — entities, named processes, and sta

## Evaluation Model

**Eval** — The frozen task and grading definition: prompts, datasets, input files, fixtures, assertions, and judge criteria. An eval defines what is being tested, not which agent, model, setup variant, or run policy executes it.
**Eval / Eval YAML** — The only composable and runnable AgentV authoring primitive. An eval YAML file can be a reusable task suite that owns task context, a wrapper eval that imports suites and carries an inline `experiment:` block, or a sidecar around raw JSONL cases. AgentV does not have a separate runnable `experiment.yaml` artifact.

**Experiment** — A committed run variant that selects how evals are executed: target or target matrix, setup, scripts, eval filters, repeat counts, timeouts, workers, budgets, and related run knobs. Experiments make A/B setup differences explicit while pointing at stable eval tasks.
**Task suite** — Eval YAML that owns what is being tested: prompts, datasets, input files, fixtures, `workspace`, assertions, expected references, and judge criteria. It can run directly or be imported by another eval with `tests[].include` and `type: suite`.

**Raw case file** — YAML, JSONL, or directory case data imported with `tests: ./cases.yaml`, string shorthand, or `type: tests`. Raw cases are reusable data inputs; they do not carry imported suite context such as shared `workspace`, shared `input`, or shared `assertions`.

**Wrapper eval** — Eval YAML whose main job is to import task suites and bind runtime policy with an inline `experiment:` block. Wrapper evals may live under an `experiments/` directory, but that path is an optional user-owned convention and AgentV does not infer behavior from it. A wrapper that imports suites with `type: suite` does not define parent workspace fields such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`; imported suites own task environment.

**Experiment** — The run-policy namespace for how evals are executed: target or target matrix, eval filters, repeat counts, timeouts, workers, budgets, thresholds, and related run knobs. In authored files it lives as inline `experiment:` inside eval YAML; CLI `--experiment` and `experiment.name` choose the result bucket. Lifecycle setup belongs in `workspace.hooks` or `targets[].hooks`, not in a separate experiment artifact.

**Workspace** — The task environment an eval prepares for the agent: repositories, templates, fixture files, and lifecycle hooks. It is not prompt input; use `input` for instructions and `workspace.repos[]` for multi-repo workspaces the agent can inspect or modify through tools.

**Run manifest** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `task_dir`, `summary_path`, and `grading_path`.

Expand Down
5 changes: 5 additions & 0 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
RESULT_INDEX_FILENAME,
RUN_SUMMARY_FILENAME,
type ResultIndexArtifact,
type RunRuntimeSourceMetadata,
type RunSummaryArtifact,
type TimingArtifact,
aggregateRunDir,
Expand Down Expand Up @@ -215,6 +216,7 @@ export async function writePerTestArtifacts(
repoRoot?: string;
sourceTests?: readonly EvalTest[];
taskBundleTargets?: readonly TaskBundleTargetSelection[];
runtimeSource?: RunRuntimeSourceMetadata;
},
): Promise<void> {
await writeCorePerTestArtifacts(results, outputDir, {
Expand All @@ -224,6 +226,7 @@ export async function writePerTestArtifacts(
duplicatePolicy: options?.duplicatePolicy,
sourceTests: options?.sourceTests,
additionalArtifacts: createTaskBundleArtifactsWriter(options),
runtimeSource: options?.runtimeSource,
});
}

Expand All @@ -242,6 +245,7 @@ export async function writeArtifactsFromResults(
repoRoot?: string;
sourceTests?: readonly EvalTest[];
taskBundleTargets?: readonly TaskBundleTargetSelection[];
runtimeSource?: RunRuntimeSourceMetadata;
},
): Promise<{
testArtifactDir: string;
Expand All @@ -258,5 +262,6 @@ export async function writeArtifactsFromResults(
resultGroup: options?.resultGroup,
sourceTests: options?.sourceTests,
additionalArtifacts: createTaskBundleArtifactsWriter(options),
runtimeSource: options?.runtimeSource,
});
}
187 changes: 184 additions & 3 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
type ResolvedTarget,
ResponseCache,
RunBudgetTracker,
type RunRuntimeSourceMetadata,
type TrialsConfig,
buildExperimentArtifactMetadata,
buildTraceFromMessages,
Expand Down Expand Up @@ -589,6 +590,163 @@ function deriveEvalResultGroupName(evalFilePath: string | undefined): string {
);
}

/**
 * Names of the raw CLI options that `hasCliRuntimeSource` inspects when it
 * decides whether any runtime configuration was supplied on the command line.
 * Declared `as const` so each entry keeps its literal type.
 */
const CLI_RUNTIME_SOURCE_OPTION_KEYS = [
  'target', 'targets',
  'filter', 'tag', 'excludeTag',
  'workers',
  'dryRun', 'dryRunDelay', 'dryRunDelayMin', 'dryRunDelayMax',
  'agentTimeout', 'maxRetries',
  'cache', 'cachePath', 'noCache',
  'graderTarget', 'model',
  'threshold', 'budgetUsd',
  'transcript',
  'recordReplay', 'recordReplayVariant',
  'workspacePath', 'workspaceMode',
] as const;

/**
 * Reports whether at least one tracked CLI option carries a meaningful value.
 *
 * A value counts as meaningful when it is:
 * - an array containing at least one non-blank string,
 * - a string that is non-blank after trimming and is not the literal `default`,
 * - a finite, non-zero number, or
 * - the boolean `true`.
 *
 * Every other value (missing, `null`, `false`, objects, ...) is ignored.
 *
 * @param rawOptions Option bag keyed by option name.
 * @returns `true` as soon as one tracked option is meaningful, otherwise `false`.
 */
function hasCliRuntimeSource(rawOptions: Record<string, unknown>): boolean {
  for (const optionKey of CLI_RUNTIME_SOURCE_OPTION_KEYS) {
    const optionValue = rawOptions[optionKey];

    if (Array.isArray(optionValue)) {
      const hasNonBlankEntry = optionValue.some(
        (item) => typeof item === 'string' && item.trim().length > 0,
      );
      if (hasNonBlankEntry) {
        return true;
      }
      continue;
    }

    switch (typeof optionValue) {
      case 'string': {
        const text = optionValue.trim();
        if (text.length > 0 && text !== 'default') {
          return true;
        }
        break;
      }
      case 'number':
        if (Number.isFinite(optionValue) && optionValue !== 0) {
          return true;
        }
        break;
      case 'boolean':
        if (optionValue) {
          return true;
        }
        break;
      default:
        break;
    }
  }
  return false;
}

/**
 * Converts a file path into the display form stored in runtime-source metadata.
 *
 * - Returns `undefined` when `filePath` is missing or blank.
 * - Returns the path relative to `cwd` when the file lives inside `cwd`.
 * - Otherwise (outside `cwd`, or equal to `cwd` itself) returns the trimmed
 *   input unchanged.
 *
 * In every case platform separators are rewritten to `/`.
 *
 * @param cwd Directory that relative inputs are resolved against.
 * @param filePath Absolute or `cwd`-relative path; may be `undefined`.
 * @returns The `/`-separated display path, or `undefined` for blank input.
 */
function toRuntimeSourcePath(cwd: string, filePath: string | undefined): string | undefined {
  const trimmed = filePath?.trim();
  if (!trimmed) {
    return undefined;
  }
  const resolved = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed);
  const relative = path.relative(cwd, resolved);
  // Only a real `..` path segment means the file is outside cwd. A bare
  // `startsWith('..')` would also reject in-tree names such as `..evals/a.yaml`.
  const escapesCwd = relative === '..' || relative.startsWith(`..${path.sep}`);
  // `path.relative` can return an absolute path (e.g. a different drive on Windows).
  const isInsideCwd = relative.length > 0 && !escapesCwd && !path.isAbsolute(relative);
  const displayPath = isInsideCwd ? relative : trimmed;
  return displayPath.split(path.sep).join('/');
}

/**
 * Collapses a sequence of optional paths into a sorted list of distinct values.
 *
 * `undefined` and empty-string entries are discarded. The remaining strings
 * are de-duplicated and ordered with the default `Array.prototype.sort`
 * comparison.
 *
 * @param values Paths to normalise; entries may be `undefined`.
 * @returns A new array of unique, sorted, non-empty paths.
 */
function uniqueRuntimeSourcePaths(values: Iterable<string | undefined>): readonly string[] {
  const distinct = new Set<string>();
  for (const candidate of values) {
    if (candidate) {
      distinct.add(candidate);
    }
  }
  return Array.from(distinct).sort();
}

/**
 * Picks the display path of the eval file a test came from.
 *
 * The candidates on `test.source` are tried in this order: `evalFileRepoPath`,
 * `evalFileAbsolutePath`, `evalFilePath`. The first one that
 * `toRuntimeSourcePath` turns into a display path wins.
 *
 * @param cwd Directory used to relativise the candidate paths.
 * @param test Test whose source information is inspected.
 * @returns The first usable display path, or `undefined` when none is usable.
 */
function testSourceEvalPath(cwd: string, test: EvalTest): string | undefined {
  const candidates = [
    test.source?.evalFileRepoPath,
    test.source?.evalFileAbsolutePath,
    test.source?.evalFilePath,
  ];
  for (const candidate of candidates) {
    const displayPath = toRuntimeSourcePath(cwd, candidate);
    if (displayPath !== undefined) {
      return displayPath;
    }
  }
  return undefined;
}

/**
 * Returns an absolute form of a test's source eval file, for equality checks.
 *
 * `evalFileAbsolutePath` is preferred and `evalFilePath` is the fallback. The
 * chosen value is passed through `path.resolve`, so a relative value is
 * resolved against the process working directory.
 *
 * @param test Test whose source information is inspected.
 * @returns The resolved path, or `undefined` when no source path is recorded.
 */
function testSourceEvalPathForComparison(test: EvalTest): string | undefined {
  const source = test.source;
  const recordedPath = source?.evalFileAbsolutePath ?? source?.evalFilePath;
  if (!recordedPath) {
    return undefined;
  }
  return path.resolve(recordedPath);
}

/**
 * Classifies where the runtime configuration of a run came from.
 *
 * Each active eval file either has `options.experimentMetadata` (an inline
 * experiment) or does not (default runtime). The result is:
 * - `'mixed'` when inline experiments are combined with CLI runtime config or
 *   with files lacking an inline experiment, or when the inline experiments
 *   have more than one distinct fingerprint;
 * - `'cli_flags'` when only CLI runtime config is present;
 * - `'inline_experiment'` when only inline experiments are present;
 * - `'defaults'` otherwise.
 *
 * @param params.activeTestFiles Eval files taking part in the run.
 * @param params.fileMetadata Per-file metadata keyed by the same file strings.
 * @param params.hasCliRuntimeConfig Whether runtime config was given on the CLI.
 */
function buildRuntimeConfigSource(params: {
  readonly activeTestFiles: readonly string[];
  readonly fileMetadata: ReadonlyMap<string, { readonly options: NormalizedOptions }>;
  readonly hasCliRuntimeConfig: boolean;
}): RunRuntimeSourceMetadata['config_source'] {
  const { activeTestFiles, fileMetadata, hasCliRuntimeConfig } = params;

  const fingerprints = new Set<string>();
  let usesDefaultRuntime = false;
  for (const file of activeTestFiles) {
    const inlineExperiment = fileMetadata.get(file)?.options.experimentMetadata;
    if (!inlineExperiment) {
      usesDefaultRuntime = true;
      continue;
    }
    // Fall back to the file itself so experiments without a fingerprint still count as distinct.
    fingerprints.add(inlineExperiment.fingerprint ?? file);
  }

  // One entry is added per inline experiment, so a non-empty set means at least one was seen.
  const usesInlineExperiment = fingerprints.size > 0;
  const isMixed =
    fingerprints.size > 1 ||
    (usesInlineExperiment && (hasCliRuntimeConfig || usesDefaultRuntime));

  if (isMixed) {
    return 'mixed';
  }
  if (hasCliRuntimeConfig) {
    return 'cli_flags';
  }
  return usesInlineExperiment ? 'inline_experiment' : 'defaults';
}

/**
 * Assembles the `agentv.runtime_source.v1` metadata record for a run.
 *
 * `kind` is derived as follows:
 * - `'multi_eval'` when more than one eval file is active;
 * - `'wrapper_eval'` when any source test names an imported suite or comes
 *   from a file that is not one of the active eval files;
 * - `'direct_suite'` otherwise.
 *
 * `wrapper_eval_file` is emitted only for `'wrapper_eval'`, and
 * `source_eval_files` only when at least one source path is known.
 *
 * @param params.cwd Directory used to relativise display paths.
 * @param params.activeTestFiles Eval files taking part in the run.
 * @param params.sourceTests Tests loaded from the active eval files.
 * @param params.fileMetadata Per-file metadata used to classify the config source.
 * @param params.experimentNamespace Value copied into `experiment_namespace`.
 * @param params.experimentNamespaceSource Value copied into `experiment_namespace_source`.
 * @param params.hasCliRuntimeConfig Whether runtime config was given on the CLI.
 */
function buildRuntimeSourceMetadata(params: {
  readonly cwd: string;
  readonly activeTestFiles: readonly string[];
  readonly sourceTests: readonly EvalTest[];
  readonly fileMetadata: ReadonlyMap<string, { readonly options: NormalizedOptions }>;
  readonly experimentNamespace: string;
  readonly experimentNamespaceSource: RunRuntimeSourceMetadata['experiment_namespace_source'];
  readonly hasCliRuntimeConfig: boolean;
}): RunRuntimeSourceMetadata {
  const { cwd, activeTestFiles, sourceTests } = params;

  const evalFiles = uniqueRuntimeSourcePaths(
    activeTestFiles.map((file) => toRuntimeSourcePath(cwd, file)),
  );
  const sourceEvalFiles = uniqueRuntimeSourcePaths(
    sourceTests.map((test) => testSourceEvalPath(cwd, test)),
  );

  // Absolute forms of the active files, matched against each test's resolved source path.
  const activeAbsolutePaths = new Set(activeTestFiles.map((file) => path.resolve(file)));
  const pullsInOtherSources = sourceTests.some((test) => {
    if (test.source?.importedSuiteName) {
      return true;
    }
    const comparablePath = testSourceEvalPathForComparison(test);
    return comparablePath !== undefined && !activeAbsolutePaths.has(comparablePath);
  });

  let kind: 'multi_eval' | 'wrapper_eval' | 'direct_suite';
  if (activeTestFiles.length > 1) {
    kind = 'multi_eval';
  } else if (pullsInOtherSources) {
    kind = 'wrapper_eval';
  } else {
    kind = 'direct_suite';
  }

  const wrapperEvalFile =
    kind === 'wrapper_eval' ? toRuntimeSourcePath(cwd, activeTestFiles[0]) : undefined;
  const configSource = buildRuntimeConfigSource({
    activeTestFiles,
    fileMetadata: params.fileMetadata,
    hasCliRuntimeConfig: params.hasCliRuntimeConfig,
  });

  return {
    schema_version: 'agentv.runtime_source.v1',
    kind,
    config_source: configSource,
    experiment_namespace: params.experimentNamespace,
    experiment_namespace_source: params.experimentNamespaceSource,
    eval_files: evalFiles,
    ...(wrapperEvalFile ? { wrapper_eval_file: wrapperEvalFile } : {}),
    ...(sourceEvalFiles.length > 0 ? { source_eval_files: sourceEvalFiles } : {}),
  };
}

/**
 * Experiment selection resolved for a single run.
 *
 * Carries only an optional `name`.
 * NOTE(review): presumably `name` is unset when no experiment was explicitly
 * chosen — confirm against the code that produces this value.
 */
type ResolvedExperimentForRun = {
readonly name?: string;
};
Expand Down Expand Up @@ -1453,10 +1611,19 @@ export async function runEvalCommand(
resolvedTestFiles.length === 1
? (primarySuite?.metadata?.name ?? fallbackResultGroupName)
: fallbackResultGroupName;
const experimentNamespaceSource: RunRuntimeSourceMetadata['experiment_namespace_source'] =
resolvedExperiment.name
? 'cli'
: resolvedTestFiles.length > 1
? 'multi_eval'
: primarySuite?.metadata?.name
? 'eval_metadata'
: 'eval_filename';
options = {
...options,
experiment: resolvedExperiment.name ?? resultGroupName,
};
const hasCliRuntimeConfig = hasCliRuntimeSource(input.rawOptions);

if (!process.env.AGENTV_EXPERIMENT) {
process.env.AGENTV_EXPERIMENT = normalizeExperimentName(options.experiment);
Expand Down Expand Up @@ -1885,9 +2052,21 @@ export async function runEvalCommand(

// Use only files that survived tag filtering.
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
const activeSourceTests = activeTestFiles.flatMap(
(activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],
);
const singleActiveFileMetadata =
activeTestFiles.length === 1 ? fileMetadata.get(activeTestFiles[0]) : undefined;
const runExperimentMetadata = singleActiveFileMetadata?.options.experimentMetadata;
const runtimeSourceMetadata = buildRuntimeSourceMetadata({
cwd,
activeTestFiles,
sourceTests: activeSourceTests,
fileMetadata,
experimentNamespace: normalizeExperimentName(options.experiment),
experimentNamespaceSource,
hasCliRuntimeConfig,
});
const hasPerFileRuntimeThresholds =
options.cliThreshold === undefined &&
activeTestFiles.some(
Expand Down Expand Up @@ -1932,6 +2111,7 @@ export async function runEvalCommand(
plannedTestCount: totalEvalCount,
experiment: normalizeExperimentName(options.experiment),
experimentMetadata: runExperimentMetadata,
runtimeSource: runtimeSourceMetadata,
});
}

Expand Down Expand Up @@ -2194,9 +2374,7 @@ export async function runEvalCommand(
// Write artifacts to the run directory (always, not conditional on flags)
if (allResults.length > 0) {
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
const sourceTests = activeTestFiles.flatMap(
(activeTestFile) => fileMetadata.get(activeTestFile)?.testCases ?? [],
);
const sourceTests = activeSourceTests;
const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata);
if (isResumeAppend) {
// Resume mode: write per-test artifacts for newly-run tests, then aggregate
Expand All @@ -2209,11 +2387,13 @@ export async function runEvalCommand(
repoRoot,
sourceTests,
taskBundleTargets,
runtimeSource: runtimeSourceMetadata,
});
const { summaryPath } = await aggregateRunDir(runDir, {
evalFile,
experiment: normalizeExperimentName(options.experiment),
experimentMetadata: runExperimentMetadata,
runtimeSource: runtimeSourceMetadata,
});
const indexPath = path.join(runDir, 'index.jsonl');
console.log(`Artifact workspace updated: ${runDir}`);
Expand All @@ -2233,6 +2413,7 @@ export async function runEvalCommand(
repoRoot,
sourceTests,
taskBundleTargets,
runtimeSource: runtimeSourceMetadata,
},
);
console.log(`Artifact workspace written to: ${runDir}`);
Expand Down
6 changes: 6 additions & 0 deletions apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
type EvaluationResult,
type ExternalTraceMetadataWire,
type ResultArtifactPointersWire,
type RunRuntimeSourceMetadata,
type TraceSummary,
buildTraceFromMessages,
fromTraceEnvelopeWire,
Expand Down Expand Up @@ -60,6 +61,7 @@ export interface ResultManifestRecord {
readonly metrics_path?: string;
readonly raw_provider_log_path?: string;
readonly artifact_pointers?: ResultArtifactPointersWire;
readonly runtime_source?: RunRuntimeSourceMetadata;
readonly external_trace?: ExternalTraceMetadataWire;
readonly response_path?: string;
readonly result_dir?: string;
Expand Down Expand Up @@ -304,6 +306,7 @@ export function loadManifestResults(

export interface LightweightResultRecord {
readonly testId: string;
readonly evalPath?: string;
readonly suite?: string;
readonly category?: string;
readonly target?: string;
Expand All @@ -314,13 +317,15 @@ export interface LightweightResultRecord {
readonly error?: string;
readonly costUsd?: number;
readonly timestamp?: string;
readonly runtimeSource?: RunRuntimeSourceMetadata;
}

export function loadLightweightResults(sourceFile: string): LightweightResultRecord[] {
const resolvedSourceFile = resolveRunManifestPath(sourceFile);
const content = readFileSync(resolvedSourceFile, 'utf8');
return parseResultManifest(content).map((record) => ({
testId: record.test_id ?? 'unknown',
evalPath: record.eval_path,
suite: record.suite,
category: record.category,
target: record.target,
Expand All @@ -331,5 +336,6 @@ export function loadLightweightResults(sourceFile: string): LightweightResultRec
error: record.error,
costUsd: record.cost_usd,
timestamp: record.timestamp,
runtimeSource: record.runtime_source,
}));
}
Loading
Loading