diff --git a/CONCEPTS.md b/CONCEPTS.md index 92c60acdd..f578b1569 100644 --- a/CONCEPTS.md +++ b/CONCEPTS.md @@ -18,7 +18,7 @@ Shared domain vocabulary for this project — entities, named processes, and sta **Raw case file** — YAML, JSONL, or directory case data imported with `tests: ./cases.yaml`, string shorthand, or `type: tests`. Raw cases are reusable data inputs; they do not carry imported suite context such as shared `workspace`, shared `input`, or shared `assertions`. -**Wrapper eval** — Eval YAML whose main job is to import task suites and bind runtime policy with an inline `experiment:` block. Wrapper evals may live under an `experiments/` directory, but that path is an optional user-owned convention and AgentV does not infer behavior from it. A wrapper that imports suites with `type: suite` does not define parent workspace fields such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`; imported suites own task environment. +**Wrapper eval** — Eval YAML whose main job is to import task suites and bind runtime policy with an inline `experiment:` block. Wrapper evals may live under an `experiments/` directory, but that path is an optional user-owned convention and AgentV does not infer behavior from it. A wrapper that imports suites with `type: suite` does not define parent `workspace`; imported suites own task environment. **Experiment** — The run-policy namespace for how evals are executed: target or target matrix, eval filters, repeat counts, timeouts, workers, budgets, thresholds, and related run knobs. In authored files it lives as inline `experiment:` inside eval YAML; CLI `--experiment` and `experiment.name` choose the result bucket. Lifecycle setup belongs in `workspace.hooks` or `targets[].hooks`, not in a separate experiment artifact. 
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 6e456d599..2612aeea4 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -439,17 +439,22 @@ function normalizeOptions( const cliOut = normalizeString(rawOptions.out); const configOutputDir = normalizeString(config?.output?.dir); const cliWorkspacePath = normalizeString(rawOptions.workspacePath); + const configWorkspacePath = normalizeString(yamlExecution?.workspace_path); const cliWorkspaceModeRaw = normalizeString(rawOptions.workspaceMode); const cliWorkspaceMode = normalizeWorkspaceMode(rawOptions.workspaceMode); if (cliWorkspacePath && cliWorkspaceModeRaw && cliWorkspaceMode !== 'static') { throw new Error('--workspace-path requires --workspace-mode=static (or omit --workspace-mode)'); } - - const yamlExecutionRecord = yamlExecution as Record | undefined; - const yamlWorkspaceMode = normalizeWorkspaceMode(yamlExecutionRecord?.workspace_mode); - const yamlWorkspacePath = normalizeString(yamlExecutionRecord?.workspace_path); - const workspacePath = cliWorkspacePath ?? yamlWorkspacePath; - const workspaceMode = cliWorkspacePath ? 'static' : (cliWorkspaceMode ?? yamlWorkspaceMode); + const configWorkspaceMode = normalizeWorkspaceMode(yamlExecution?.workspace_mode); + if (configWorkspacePath && configWorkspaceMode && configWorkspaceMode !== 'static') { + throw new Error( + 'execution.workspace_path requires execution.workspace_mode: static when both are provided', + ); + } + const useConfigWorkspacePath = cliWorkspaceMode === undefined || cliWorkspaceMode === 'static'; + const workspacePath = + cliWorkspacePath ?? (useConfigWorkspacePath ? configWorkspacePath : undefined); + const workspaceMode = workspacePath ? 'static' : (cliWorkspaceMode ?? 
configWorkspaceMode); const resultsRepo = normalizeString(rawOptions.resultsRepo); const resultsPush = normalizeBoolean(rawOptions.resultsPush); const resultsNoPush = normalizeBoolean(rawOptions.noResultsPush); @@ -776,17 +781,14 @@ function applyExperimentOptions( ? [experimentTarget] : options.cliTargets; - const workspaceMode = - options.workspaceMode ?? readExperimentWorkspaceMode(experiment.workspace?.mode); - const workspacePath = options.workspacePath ?? readExperimentWorkspacePath(experiment.workspace); return { ...options, target: options.target ?? (nextCliTargets.length === 1 ? nextCliTargets[0] : undefined), cliTargets: nextCliTargets, agentTimeoutSeconds: options.agentTimeoutSeconds ?? experiment.timeoutSeconds, workers: options.workers ?? experiment.workers, - workspaceMode: workspacePath ? 'static' : workspaceMode, - workspacePath, + workspaceMode: options.workspaceMode, + workspacePath: options.workspacePath, budgetUsd: options.budgetUsd ?? experiment.budgetUsd, threshold: options.threshold ?? experiment.threshold, experimentConfig: experiment, @@ -923,17 +925,6 @@ function groupTestsByRunPolicy(params: { return [...groups.values()]; } -function readExperimentWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' | undefined { - return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined; -} - -function readExperimentWorkspacePath( - workspace: Record | undefined, -): string | undefined { - const value = workspace?.path; - return typeof value === 'string' && value.trim().length > 0 ? value.trim() : undefined; -} - function matchesTestFilter(id: string, filter: string | readonly string[]): boolean { return typeof filter === 'string' ? 
micromatch.isMatch(id, filter) diff --git a/apps/cli/src/commands/eval/task-bundle.ts b/apps/cli/src/commands/eval/task-bundle.ts index ae81fb721..22a22f7a6 100644 --- a/apps/cli/src/commands/eval/task-bundle.ts +++ b/apps/cli/src/commands/eval/task-bundle.ts @@ -703,15 +703,8 @@ function serializeWorkspace( workspace: WorkspaceConfig, rewrites: ReadonlyMap, ): Record { - const { - workspaceFileDir: _workspaceFileDir, - path: _path, - mode, - ...portableWorkspace - } = workspace; - const withoutStaticMode = - mode === 'static' ? portableWorkspace : { ...portableWorkspace, ...(mode ? { mode } : {}) }; - return rewritePathsDeep(withoutStaticMode, rewrites) as Record; + const { workspaceFileDir: _workspaceFileDir, ...portableWorkspace } = workspace; + return rewritePathsDeep(portableWorkspace, rewrites) as Record; } function buildPortableEvalCase( @@ -827,12 +820,6 @@ async function collectWorkspaceReferences( continue; } - if (workspace.path || workspace.mode === 'static') { - errors.push( - `workspace.path for test "${test.id}" cannot be bundled because it points at an existing static workspace. 
Use workspace.template, workspace.repos, or workspace.hooks for portable bundles.`, - ); - } - if (workspace.template) { references.push({ kind: 'workspace_template', diff --git a/apps/cli/src/commands/prepare/index.ts b/apps/cli/src/commands/prepare/index.ts index 1240eb2cb..d1a3ff4e3 100644 --- a/apps/cli/src/commands/prepare/index.ts +++ b/apps/cli/src/commands/prepare/index.ts @@ -292,8 +292,7 @@ async function prepareAttempt(options: { evalCases: suite.tests, testId: options.testId, verbose: false, - ...(test.workspace?.path === undefined && - test.workspace?.mode !== 'static' && { workspaceMode: 'temp' }), + workspaceMode: 'temp', retainOnSuccess: 'keep', retainOnFailure: 'keep', }); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index 9499e85cb..b635bf335 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -530,6 +530,31 @@ describe('agentv eval CLI', () => { } }, 30_000); + it('uses config.local.yaml workspace_path as a static workspace override', async () => { + const fixture = await createFixture(); + try { + const workspacePath = path.join(fixture.baseDir, 'local-config-workspace'); + await mkdir(workspacePath, { recursive: true }); + await writeFile( + path.join(fixture.suiteDir, '.agentv', 'config.local.yaml'), + `execution:\n workspace_path: ${JSON.stringify(workspacePath)}\n`, + 'utf8', + ); + + const result = await runCli(fixture, ['eval', fixture.testFilePath]); + + expect(result.exitCode).toBe(0); + const diagnostics = await readDiagnostics(fixture); + expect(diagnostics).toMatchObject({ + workspaceMode: 'static', + workspacePath, + resultCount: 2, + }); + } finally { + await rm(fixture.baseDir, { recursive: true, force: true }); + } + }, 30_000); + it('passes run-level budget tracking through to the evaluator', async () => { const fixture = await createFixture(); try { diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx 
b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index f36db7494..5b91ebd1e 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -25,9 +25,9 @@ experiment format. - A **wrapper eval** is eval YAML that imports one or more suites with `type: suite` and binds runtime policy in its inline `experiment:` block. Wrapper evals can live anywhere in the repo. A wrapper that imports suites - with `type: suite` must not define parent workspace fields such as - `workspace`, `experiment.workspace`, or legacy `execution.workspace`; - imported suites own task environment. + with `type: suite` must not define parent `workspace`; imported suites own + task environment. Machine-local existing workspace paths belong in CLI flags + or `config.local.yaml`, not eval YAML. For example, a reusable task suite can keep the task contract in one file: @@ -78,11 +78,11 @@ The `experiments/` directory in that example is optional and user-owned. AgentV does not infer behavior from the path; the wrapper runs because it is eval YAML with an inline `experiment:` block. The wrapper owns runtime policy only. Put workspace setup in imported child suites. Parent workspace-affecting fields, -including `workspace`, `experiment.workspace`, and legacy -`execution.workspace`, are for parent-owned raw cases, including cases imported -with `type: tests`. `experiment.workspace` is only a runtime `mode`/`path` -override; repos, hooks, templates, Docker config, and isolation belong in -top-level or case-level `workspace`. +including top-level `workspace`, are for parent-owned raw cases, including +cases imported with `type: tests`. Runtime workspace path overrides belong in +CLI flags or `.agentv/config.local.yaml`; repos, hooks, templates, Docker +config, env checks, and isolation belong in top-level or case-level +`workspace`. 
## YAML Format diff --git a/apps/web/src/content/docs/docs/evaluation/experiments.mdx b/apps/web/src/content/docs/docs/evaluation/experiments.mdx index 88f5a8399..0ee1ac2b2 100644 --- a/apps/web/src/content/docs/docs/evaluation/experiments.mdx +++ b/apps/web/src/content/docs/docs/evaluation/experiments.mdx @@ -201,7 +201,7 @@ prepare files, dependencies, repos, or target-specific runner state. | Reset or apply per-case state | `workspace.hooks.before_each` / `workspace.hooks.after_each` | | Configure an agent runner or provider variant | `targets[].hooks` | | Choose targets, repeats, pass policy, budget, threshold | `experiment` | -| Override run workspace mode/path without changing task setup | `experiment.workspace.mode` / `experiment.workspace.path` | +| Bind an existing local workspace directory | `--workspace-path` or `.agentv/config.local.yaml` | ```yaml workspace: @@ -223,11 +223,11 @@ experiment: strategy: pass_at_k ``` -`experiment.workspace` is intentionally limited to `mode` and `path`, matching -the `--workspace-mode` and `--workspace-path` CLI flags. Put repos, templates, -hooks, Docker config, and isolation under top-level or case-level `workspace`. -Wrapper evals that import child evals with `type: suite` must not define -`experiment.workspace`; imported suites own the task workspace. +`experiment.workspace` is not an authored eval YAML field. Existing local +workspace paths are machine-local bindings: pass `--workspace-path` for a +one-off run or put `execution.workspace_path` in `.agentv/config.local.yaml`. +Put repos, templates, hooks, Docker config, env checks, and isolation under +top-level or case-level `workspace`. 
## Repeat Runs diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 018892bd3..b752aa4a2 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -297,14 +297,14 @@ This matches the standard model used by eval frameworks (promptfoo, deepeval, Op ### Workspace Modes and Finish Policy -Use workspace mode and finish policies instead of multiple conflicting booleans: +Use runtime workspace flags and finish policies instead of multiple conflicting booleans: ```bash # Mode: pooled | temp | static agentv eval evals/my-eval.yaml --workspace-mode pooled -# Static mode path -agentv eval evals/my-eval.yaml --workspace-mode static --workspace-path /path/to/workspace +# Existing local workspace path for this run +agentv eval evals/my-eval.yaml --workspace-path /path/to/workspace # Pooled reset policy override: standard | full (CLI override) agentv eval evals/my-eval.yaml --workspace-clean full @@ -313,12 +313,12 @@ agentv eval evals/my-eval.yaml --workspace-clean full agentv eval evals/my-eval.yaml --retain-on-success cleanup --retain-on-failure keep ``` -Equivalent eval YAML: +Portable eval YAML keeps workspace intent under templates, repos, hooks, env, +Docker, and folder isolation: ```yaml workspace: - mode: pooled # pooled | temp | static - path: null # workspace path for mode=static; auto-materialised when empty/missing + isolation: shared # shared | per_case hooks: enabled: true # set false to skip all hooks after_each: @@ -326,9 +326,9 @@ workspace: ``` Notes: -- Pooling is default for shared workspaces with repos when mode is not specified. -- `mode: static` (or `--workspace-mode static`) uses `path` / `--workspace-path`. When the path is empty or missing, the workspace is auto-materialised (template copied + repos cloned). Populated directories are reused as-is. 
-- Static mode is incompatible with `isolation: per_case`. +- Pooling is default for shared workspaces with repos. +- `--workspace-path` uses an existing machine-local directory as-is and implies static runtime mode. +- Runtime static mode is incompatible with `isolation: per_case`. - `hooks.enabled: false` skips all lifecycle hooks (setup, teardown, reset). - Pool slots are managed separately (`agentv workspace list|clean`). @@ -562,6 +562,9 @@ Example local overlay: ```yaml execution: keep_workspaces: true + # Machine-local existing workspace binding. Do not commit this file. + workspace_path: /home/user/workspaces/my-eval + workspace_mode: static eval_patterns: - "local-evals/**/*.eval.yaml" ``` @@ -570,6 +573,8 @@ eval_patterns: |-------|---------------|------|---------|-------------| | `verbose` | `--verbose` | boolean | `false` | Enable verbose logging | | `keep_workspaces` | `--keep-workspaces` | boolean | `false` | Always keep temp workspaces after eval | +| `workspace_path` | `--workspace-path` | string | none | Machine-local existing workspace directory | +| `workspace_mode` | `--workspace-mode` | `pooled` / `temp` / `static` | none | Machine-local workspace preparation override | | `otel_file` | `--otel-file` | string | none | Write OTLP JSON trace to file | ### TypeScript config (`agentv.config.ts`) diff --git a/apps/web/src/content/docs/docs/graders/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx index c2422a192..38ff33817 100644 --- a/apps/web/src/content/docs/docs/graders/code-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/code-graders.mdx @@ -366,7 +366,7 @@ Use `expected_output` for reference answers and `output` for the actual final an ## Workspace Access -When `workspace` is configured in the eval YAML (via `workspace.template`, `workspace.path`, or `workspace.repos`), code graders receive the workspace path in two ways: +When `workspace` is configured in the eval YAML (via `workspace.template`, 
`workspace.repos`, or lifecycle hooks), code graders receive the prepared workspace path in two ways: 1. **JSON payload**: `workspace_path` field in the stdin input 2. **Environment variable**: `AGENTV_WORKSPACE_PATH` diff --git a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx index 8ec466839..e6a195983 100644 --- a/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx +++ b/apps/web/src/content/docs/docs/guides/benchmark-provenance.mdx @@ -28,7 +28,7 @@ Use this split when deciding where a benchmark key belongs: | `workspace.repos[]` | Yes | Declares repo identity and checkout refs; AgentV resolves acquisition and materializes the checkout. | | `workspace.template` | Yes | Copies a workspace template into the run workspace. | | `workspace.hooks` | Yes | Runs lifecycle commands with workspace and case context on stdin. | -| `workspace.isolation`, `workspace.mode`, `workspace.path` | Yes | Controls workspace reuse and materialization. | +| `workspace.isolation` | Yes | Controls shared vs per-case folder isolation. Runtime workspace paths are machine-local config/CLI bindings, not benchmark provenance. | | `experiment` | Yes | Selects targets, thresholds, repeat policy, budgets, workers, and default grader behavior. | | `input`, `input_files`, `expected_output` | Yes | Builds the target prompt and passive reference answer. | | `assertions` | Yes | Runs deterministic, LLM, composite, or code graders. | @@ -208,8 +208,7 @@ When one eval references another eval, preserve the task/runtime split: - Child `experiment:` blocks are ignored by `type: suite` composition. There is no fallback to the child `experiment:` when the parent has no `experiment:`. - Child `workspace` setup is preserved for `type: suite` imports. 
A parent eval - that imports any `type: suite` entry must not define parent workspace fields - such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`. + that imports any `type: suite` entry must not define parent `workspace`. Parent workspace context is for parent-owned raw cases, including raw cases imported with `type: tests`. - A tests-only import can drop child workspace context only when the import mode diff --git a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx index 823a3dd70..98ba23054 100644 --- a/apps/web/src/content/docs/docs/guides/workspace-pool.mdx +++ b/apps/web/src/content/docs/docs/guides/workspace-pool.mdx @@ -43,18 +43,15 @@ Pooling is on by default. To disable it: agentv eval evals/my-eval.yaml --workspace-mode temp ``` -### YAML workspace mode +### Local config ```yaml -workspace: - mode: temp - repos: - - path: ./my-repo - repo: https://github.com/org/my-repo.git - commit: main +# .agentv/config.local.yaml +execution: + workspace_mode: temp ``` -`workspace.mode` controls materialization behavior directly (`pooled`, `temp`, or `static`). +`workspace_mode` is a machine-local runtime override. Do not commit it in eval YAML. ## Pool reset mode @@ -173,21 +170,25 @@ The path is resolved relative to the eval file's directory. Relative paths **ins This pattern is especially valuable with pooling: a single `workspace.yaml` guarantees all eval files that reference it produce the same fingerprint and share the same pool. 
-## Static workspaces (`mode: static`) +## Existing local workspaces -For workspaces you manage outside AgentV, use static mode: +For workspaces you manage outside AgentV, bind the existing directory at runtime: ```bash -agentv eval evals/my-eval.yaml --workspace-mode static --workspace-path /path/to/my-workspace +agentv eval evals/my-eval.yaml --workspace-path /path/to/my-workspace ``` -**Auto-materialisation:** When `workspace.path` points to an empty or missing directory, AgentV automatically copies the template and clones repos into it. If the directory already exists and is populated, AgentV checks each repo individually — existing repos are reused as-is, and only missing repos are cloned. This makes static mode convenient for both first-run bootstrap and incremental setup. +Or persist the machine-local binding outside committed eval YAML: -AgentV never deletes a user-provided workspace. Lifecycle hooks still execute (unless `hooks.enabled: false`). This is useful for local development where you already have repos checked out. +```yaml +# .agentv/config.local.yaml +execution: + workspace_path: /path/to/my-workspace +``` -**Note:** When using `--workspace-path` (CLI flag) instead of `workspace.path` (YAML), the directory is always used as-is with no auto-materialisation or repo cloning. +AgentV uses a runtime workspace path as-is. It does not auto-materialize repos into that directory; keep repo materialization intent in `workspace.repos[]` for portable runs, and use `workspace_path` only when the local directory already exists. -**Precedence:** `workspace.mode` / `--workspace-mode` first, then default pooled behavior for shared repo workspaces. +**Precedence:** CLI flags override project-local `.agentv/config.local.yaml`, which overrides committed `.agentv/config.yaml`. 
## Interaction with keep/cleanup flags @@ -195,20 +196,20 @@ CLI flags `--retain-on-success` / `--retain-on-failure` control temporary eval-r - In pooled mode, pool slots are retained for reuse regardless of retention settings. - Retention settings do not remove pool entries; use `agentv workspace clean` for pool cleanup. -- With `mode: static`, AgentV never deletes the user-provided directory. +- With `--workspace-path` or `execution.workspace_path`, AgentV never deletes the user-provided directory. ## Comparison of workspace modes | Mode | Setup cost | Persistent | Build artifacts preserved | Concurrent workers | |------|-----------|-----------|--------------------------|-------------------| | **Pooled** (default) | First run only; reset on reuse | Yes | Yes (`.gitignore`d files) | Yes (slot per worker) | -| **Temp** (`mode: temp`) | Full clone + checkout every run | No | No | Sequential only | -| **Static** (`mode: static`) | Per-repo: clones only missing repos; auto-materialises if empty | Yes | User-managed | Sequential only | +| **Temp** (`--workspace-mode temp`) | Full clone + checkout every run | No | No | Sequential only | +| **Existing path** (`--workspace-path` / `execution.workspace_path`) | Uses the supplied directory as-is | Yes | User-managed | Sequential only | ## When to disable pooling **Pooling is typically the right default.** Consider disabling it when: - You need guaranteed clean-slate isolation between runs - You're debugging workspace setup issues and want fresh clones each time -- You use `mode: static` with a pre-existing or auto-materialised directory (pooling is automatically skipped) +- You use `--workspace-path` with a pre-existing local directory (pooling is automatically skipped) - You need `isolation: per_case` (each test gets its own workspace copy; pooling is automatically skipped) diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx index 
48092b649..8a765b0bc 100644 --- a/apps/web/src/content/docs/docs/targets/configuration.mdx +++ b/apps/web/src/content/docs/docs/targets/configuration.mdx @@ -166,8 +166,6 @@ workspace: after_each: reset: fast # none | fast | strict isolation: shared # shared (default) | per_case - mode: pooled # pooled | temp | static - path: /tmp/my-ws # workspace path for mode=static ``` `repo` declares the repository identity. Acquisition is harness-owned: AgentV first looks for matching registered projects and configured mirrors, then uses its git cache, then falls back to remote clone. See [Workspace Architecture](/docs/guides/workspace-architecture/#repo-provenance-vs-acquisition) for the resolver order and `git_cache.mirrors` config. @@ -182,15 +180,13 @@ workspace: | `repos[].sparse` | Sparse checkout paths | | `hooks.after_each.reset` | Reset policy after each test: `none`, `fast`, `strict` | | `isolation` | `shared` reuses one workspace; `per_case` creates a fresh copy per test case | -| `mode` | Workspace mode: `pooled`, `temp`, `static` | -| `path` | Workspace path for `mode=static`. When empty or missing, the workspace is auto-materialised (template copied + repos cloned). Populated directories are reused as-is. | | `hooks.enabled` | Boolean (default: `true`). Set `false` to skip all lifecycle hooks. | `isolation: per_case` is the spelling for fresh workspace state per test case. -**Pooling:** `mode: pooled` (or default shared repo mode) reuses pool slots between runs. Use `mode: temp` to disable pooling for fresh clone/checkouts each run. +**Pooling:** shared workspaces with `repos` use pool slots by default. Use `--workspace-mode temp` or `execution.workspace_mode: temp` in `config.local.yaml` to disable pooling for a local run. -**Static auto-materialisation:** When `mode: static` and `path` points to an empty or missing directory, AgentV automatically copies the template and clones repos into it. If the directory already exists and is populated, it is reused as-is. 
+**Existing local workspaces:** do not commit local paths in eval YAML. Use `--workspace-path /path/to/workspace` for a one-off run, or put `execution.workspace_path` in `.agentv/config.local.yaml`. Pool management commands: - `agentv workspace list` — list all pool entries with size and repo info diff --git a/docs/adr/0006-separate-experiments-from-eval-definitions.md b/docs/adr/0006-separate-experiments-from-eval-definitions.md index 0cc4f8f46..e4ad436ad 100644 --- a/docs/adr/0006-separate-experiments-from-eval-definitions.md +++ b/docs/adr/0006-separate-experiments-from-eval-definitions.md @@ -222,14 +222,12 @@ fall back into the parent run. Scoped runtime overrides that the parent wants to apply to imported tests live in `tests[].run`. A parent eval that imports any child eval suite with `type: suite` must not -define parent workspace-affecting fields, including `workspace`, -`experiment.workspace`, or legacy `execution.workspace`. The wrapper owns -runtime policy, not task environment. Imported child suites keep their own -`workspace`, including `workspace.repos[]`, templates, hooks, and isolation. -`experiment.workspace` remains a narrow runtime override for `mode` and `path` -only; it is not a place for repos, hooks, templates, Docker config, or -isolation. If the parent should own workspace context, import raw cases with -`type: tests` or shorthand paths instead of importing an eval suite. +define parent `workspace`. The wrapper owns runtime policy, not task +environment. Imported child suites keep their own `workspace`, including +`workspace.repos[]`, templates, hooks, and isolation. Existing local workspace +paths are machine-local bindings supplied through CLI flags or +`config.local.yaml`. If the parent should own workspace context, import raw cases +with `type: tests` or shorthand paths instead of importing an eval suite. `type: tests` imports only raw test entries. 
It intentionally drops shared suite context such as workspace, shared input, and shared assertions. Use this diff --git a/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md b/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md index e6ac498f2..55f2e71b4 100644 --- a/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md +++ b/docs/adr/0009-keep-benchmark-schema-on-existing-primitives.md @@ -70,13 +70,13 @@ child `experiment:` block and uses the parent `experiment:` when one exists; it does not fall back to the child `experiment:`. Workspace follows task ownership, not runtime fallback: imported child tests keep the child suite workspace that was already expanded into those tests. Therefore a parent eval that imports any -child eval with `type: suite` must not define parent workspace-affecting fields -such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`. -Parent workspace context is valid for parent-owned raw cases only, including -raw cases imported with `type: tests` or shorthand paths. `experiment.workspace` -is limited to runtime `mode` and `path`; task environment fields remain in -top-level or case-level `workspace`. A "tests only" import mode may drop child -workspace context, but that must be opt-in. +child eval with `type: suite` must not define parent `workspace`. Parent +workspace context is valid for parent-owned raw cases only, including raw cases +imported with `type: tests` or shorthand paths. Machine-local existing workspace +paths are no longer authored in eval YAML; they belong in CLI flags or +`config.local.yaml`. Task environment fields remain in top-level or case-level +`workspace`. A "tests only" import mode may drop child workspace context, but +that must be opt-in. 
ADR 0006 defines the contract-layer model behind this rule: task data, task prompt, task environment, and scoring come from the imported child suite; run diff --git a/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md b/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md index f9ecab40e..2fe9ea744 100644 --- a/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md +++ b/docs/plans/2026-06-27-001-docs-agentv-schema-benchmark-research-plan.md @@ -69,8 +69,9 @@ with a special name. suites must not define parent workspace-affecting fields such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`; parent workspace applies only to parent-owned raw cases, including cases imported with - `type: tests`. `experiment.workspace` is only a runtime `mode`/`path` - override, not a task workspace definition. + `type: tests`. Existing local workspace paths are machine-local bindings + supplied through CLI flags or `config.local.yaml`, not eval YAML task + workspace definitions. ### Evidence Summary @@ -151,8 +152,8 @@ Research ambiguity: imported suite tests keep their child workspace. A parent eval that imports any suite with `type: suite` must not define parent workspace-affecting fields such as `workspace`, `experiment.workspace`, or legacy `execution.workspace`. - `experiment.workspace` is limited to runtime `mode` and `path`; task - workspace setup stays in `workspace`. + Existing local workspace paths are machine-local bindings supplied through + CLI flags or `config.local.yaml`; task workspace setup stays in `workspace`. - R14. Parent workspace applies to parent-owned raw cases only, including raw cases imported with `type: tests`. Any future parent workspace override/remap for imported suites should require explicit syntax. 
diff --git a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml index 138efb57c..d9ab18042 100644 --- a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml +++ b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml @@ -3,7 +3,6 @@ description: >- enabled by default for shared workspaces with repos. workspace: - mode: pooled repos: - path: ./repo repo: https://github.com/EntityProcess/agentv.git diff --git a/examples/features/workspace-shared-config/workspace.yaml b/examples/features/workspace-shared-config/workspace.yaml index 6dfba4301..55a8b443d 100644 --- a/examples/features/workspace-shared-config/workspace.yaml +++ b/examples/features/workspace-shared-config/workspace.yaml @@ -1,5 +1,4 @@ template: ./workspace-template -mode: pooled hooks: after_each: reset: fast diff --git a/packages/core/src/evaluation/experiment.ts b/packages/core/src/evaluation/experiment.ts index cb9d62307..9db03781f 100644 --- a/packages/core/src/evaluation/experiment.ts +++ b/packages/core/src/evaluation/experiment.ts @@ -33,11 +33,6 @@ export type ExperimentRepeat = { readonly costLimitUsd?: number; }; -export type ExperimentWorkspaceConfig = { - readonly mode?: 'pooled' | 'temp' | 'static'; - readonly path?: string; -}; - export type ExperimentConfigWire = { readonly name?: string; readonly agent?: string; @@ -53,7 +48,7 @@ export type ExperimentConfigWire = { readonly threshold?: number; readonly budget_usd?: number; readonly sandbox?: ExperimentSandbox; - readonly workspace?: ExperimentWorkspaceConfig; + readonly workspace?: never; }; export type ExperimentConfig = { @@ -71,7 +66,6 @@ export type ExperimentConfig = { readonly threshold?: number; readonly budgetUsd?: number; readonly sandbox?: ExperimentSandbox; - readonly workspace?: ExperimentWorkspaceConfig; readonly fingerprint?: string; }; @@ -142,7 +136,7 @@ export function normalizeExperimentConfig(rawConfig: unknown): ExperimentConfig 'budget_usd', 
); const sandbox = readOptionalSandbox(rawConfig.sandbox); - const workspace = readOptionalWorkspace(rawConfig.workspace); + rejectExperimentWorkspace(rawConfig.workspace); const configWithoutFingerprint: Omit = { ...(name !== undefined && { name }), @@ -159,7 +153,6 @@ export function normalizeExperimentConfig(rawConfig: unknown): ExperimentConfig ...(threshold !== undefined && { threshold }), ...(budgetUsd !== undefined && { budgetUsd }), ...(sandbox !== undefined && { sandbox }), - ...(workspace !== undefined && { workspace }), }; return { @@ -396,34 +389,13 @@ function readOptionalRecord(raw: unknown): Record | undefined { return raw; } -function readOptionalWorkspace(raw: unknown): ExperimentWorkspaceConfig | undefined { - const workspace = readOptionalRecord(raw); - if (workspace === undefined) { - return undefined; - } - - for (const key of Object.keys(workspace)) { - if (key !== 'mode' && key !== 'path') { - throw new Error( - `Experiment workspace.${key} is not supported. Experiment workspace supports only mode and path; put task setup in top-level workspace.`, - ); - } - } - - const mode = workspace.mode; - if (mode !== undefined && mode !== 'pooled' && mode !== 'temp' && mode !== 'static') { - throw new Error("Experiment workspace.mode must be 'pooled', 'temp', or 'static'."); - } - - const path = workspace.path; - if (path !== undefined && (typeof path !== 'string' || path.trim().length === 0)) { - throw new Error('Experiment workspace.path must be a non-empty string.'); +function rejectExperimentWorkspace(raw: unknown): void { + if (raw === undefined) { + return; } - - return { - ...(mode !== undefined && { mode }), - ...(path !== undefined && { path: path.trim() }), - }; + throw new Error( + 'Experiment workspace has been removed from eval YAML. Put machine-local workspace_path/workspace_mode in .agentv/config.local.yaml under execution, or pass --workspace-path/--workspace-mode. 
Keep portable task setup in top-level workspace.', + ); } function isRecord(value: unknown): value is Record { diff --git a/packages/core/src/evaluation/loaders/config-loader.ts b/packages/core/src/evaluation/loaders/config-loader.ts index 74bbd7b40..d31921a78 100644 --- a/packages/core/src/evaluation/loaders/config-loader.ts +++ b/packages/core/src/evaluation/loaders/config-loader.ts @@ -3,6 +3,8 @@ import path from 'node:path'; import { AGENTV_CONFIG_FILE_NAME, + AGENTV_LOCAL_CONFIG_FILE_NAME, + AGENTV_LOCAL_CONFIG_YML_FILE_NAME, getLocalConfigPath, isPlainConfigObject, mergeConfigObjects, @@ -32,6 +34,8 @@ export const DEFAULT_EVAL_PATTERNS: readonly string[] = [ export type ExecutionDefaults = { readonly verbose?: boolean; readonly keep_workspaces?: boolean; + readonly workspace_mode?: 'pooled' | 'temp' | 'static'; + readonly workspace_path?: string; readonly otel_file?: string; readonly export_otel?: boolean; readonly otel_backend?: string; @@ -139,13 +143,17 @@ async function readConfigObjectFile( } async function readConfigFilePair(configPath: string): Promise { - const base = await readConfigObjectFile(configPath); - const local = await readConfigObjectFile(getLocalConfigPath(configPath)); + const localConfigPath = getLocalConfigPath(configPath); + const base = stripLocalOnlyExecutionDefaults(await readConfigObjectFile(configPath), configPath); + const local = stripLocalOnlyExecutionDefaults( + await readConfigObjectFile(localConfigPath), + localConfigPath, + ); const rawMerged = base && local ? mergeConfigObjects(base, local) : (local ?? base); if (!rawMerged) { return null; } - return parseConfigObject(rawMerged, local ? getLocalConfigPath(configPath) : configPath); + return parseConfigObject(rawMerged, local ? 
localConfigPath : configPath); } function parseConfigObject( @@ -209,6 +217,57 @@ function parseConfigObject( } } +function isLocalConfigPath(configPath: string): boolean { + const basename = path.basename(configPath); + return ( + basename === AGENTV_LOCAL_CONFIG_FILE_NAME || basename === AGENTV_LOCAL_CONFIG_YML_FILE_NAME + ); +} + +function stripLocalOnlyExecutionDefaults( + rawConfig: Record | undefined, + configPath: string, +): Record | undefined { + if (!rawConfig || isLocalConfigPath(configPath)) { + return rawConfig; + } + + const execution = rawConfig.execution; + if (!isPlainConfigObject(execution)) { + return rawConfig; + } + + let stripped = false; + if ('workspace_path' in execution) { + stripped = true; + logWarning( + `execution.workspace_path in ${configPath} is machine-local and only supported in config.local.yaml; ignoring.`, + ); + } + if ('workspace_mode' in execution) { + stripped = true; + logWarning( + `execution.workspace_mode in ${configPath} is machine-local and only supported in config.local.yaml; ignoring.`, + ); + } + + if (!stripped) { + return rawConfig; + } + + const nextConfig = { ...rawConfig }; + const nextExecution = Object.fromEntries( + Object.entries(execution).filter( + ([key]) => key !== 'workspace_path' && key !== 'workspace_mode', + ), + ); + if (Object.keys(nextExecution).length === 0) { + return Object.fromEntries(Object.entries(nextConfig).filter(([key]) => key !== 'execution')); + } + nextConfig.execution = nextExecution; + return nextConfig; +} + function getSuiteRuntimeBlock(suite: JsonObject): Record | undefined { if (suite.experiment !== undefined && suite.execution !== undefined) { throw new Error("Use either top-level 'experiment' or legacy 'execution', not both."); @@ -529,6 +588,34 @@ export function parseExecutionDefaults( logWarning(`Invalid execution.keep_workspaces in ${configPath}, expected boolean`); } + const workspaceMode = obj.workspace_mode; + if (workspaceMode === 'pooled' || workspaceMode === 'temp' 
|| workspaceMode === 'static') { + if (isLocalConfigPath(configPath)) { + result.workspace_mode = workspaceMode; + } else { + logWarning( + `execution.workspace_mode in ${configPath} is machine-local and only supported in config.local.yaml; ignoring.`, + ); + } + } else if (workspaceMode !== undefined) { + logWarning( + `Invalid execution.workspace_mode in ${configPath}, expected 'pooled', 'temp', or 'static'`, + ); + } + + const workspacePath = obj.workspace_path; + if (typeof workspacePath === 'string' && workspacePath.trim().length > 0) { + if (isLocalConfigPath(configPath)) { + result.workspace_path = workspacePath.trim(); + } else { + logWarning( + `execution.workspace_path in ${configPath} is machine-local and only supported in config.local.yaml; ignoring.`, + ); + } + } else if (workspacePath !== undefined) { + logWarning(`Invalid execution.workspace_path in ${configPath}, expected non-empty string`); + } + const otelFile = obj.otel_file; if (typeof otelFile === 'string' && otelFile.trim().length > 0) { result.otel_file = otelFile.trim(); diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 91d2c8188..27c564f82 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -356,10 +356,6 @@ export type WorkspaceConfig = { readonly repos?: readonly RepoConfig[]; /** Workspace lifecycle hooks */ readonly hooks?: WorkspaceHooksConfig; - /** Workspace materialization mode */ - readonly mode?: 'pooled' | 'temp' | 'static'; - /** Required when mode=static: use this existing directory directly */ - readonly path?: string; /** Docker-based workspace: run grader commands inside a container */ readonly docker?: DockerWorkspaceConfig; /** Directory containing the workspace file when workspace is a file reference. 
diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 4c2db309b..d6aafbab3 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -325,8 +325,6 @@ const WorkspaceSchema = z isolation: z.enum(['shared', 'per_case']).optional(), repos: z.array(RepoSchema).optional(), hooks: WorkspaceHooksSchema.optional(), - mode: z.enum(['pooled', 'temp', 'static']).optional(), - path: z.string().optional(), docker: DockerWorkspaceSchema.optional(), env: WorkspaceEnvSchema.optional(), }) @@ -372,6 +370,7 @@ const ExecutionSchema = z.object({ fail_on_error: FailOnErrorSchema.optional(), failOnError: FailOnErrorSchema.optional(), threshold: z.number().min(0).max(1).optional(), + workspace: z.never().optional(), }); const ExperimentRepeatSchema = z @@ -392,13 +391,6 @@ const RunOverrideSchema = z }) .strict(); -const ExperimentWorkspaceSchema = z - .object({ - mode: z.enum(['pooled', 'temp', 'static']).optional(), - path: z.string().min(1).optional(), - }) - .strict(); - const ExperimentTargetRefSchema = z.union([ z.string().min(1), z @@ -422,7 +414,7 @@ const ExperimentRuntimeSchema = ExecutionSchema.extend({ timeout_seconds: z.number().gt(0).optional(), budget_usd: z.number().gt(0).optional(), sandbox: z.enum(['auto', 'docker', 'vercel']).optional(), - workspace: ExperimentWorkspaceSchema.optional(), + workspace: z.never().optional(), setup: z.never().optional(), }).refine((value) => value.repeat === undefined || value.runs === undefined, { message: 'Use repeat or runs, not both.', diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index 1b199230d..afa311471 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -372,6 +372,14 @@ export async function 
validateEvalFile(filePath: string): Promise { await validateWorkspaceConfig(parsed.workspace, absolutePath, errors, 'workspace'); if (isObject(parsed.experiment)) { - validateExperimentWorkspaceConfig( + rejectRuntimeWorkspaceConfig( parsed.experiment.workspace, absolutePath, errors, @@ -463,7 +471,7 @@ async function validateSuiteWorkspaceConfigs( ); } if (isObject(parsed.execution)) { - validateExperimentWorkspaceConfig( + rejectRuntimeWorkspaceConfig( parsed.execution.workspace, absolutePath, errors, @@ -472,7 +480,7 @@ async function validateSuiteWorkspaceConfigs( } } -function validateExperimentWorkspaceConfig( +function rejectRuntimeWorkspaceConfig( workspace: JsonValue | undefined, filePath: string, errors: ValidationError[], @@ -482,50 +490,12 @@ function validateExperimentWorkspaceConfig( return; } - if (!isObject(workspace)) { - errors.push({ - severity: 'error', - filePath, - location, - message: `${location} must be an object with mode and/or path.`, - }); - return; - } - - for (const key of Object.keys(workspace)) { - if (key === 'mode' || key === 'path') { - continue; - } - errors.push({ - severity: 'error', - filePath, - location: `${location}.${key}`, - message: `${location} supports only mode and path. 
Put task workspace setup in top-level workspace.`, - }); - } - - const mode = workspace.mode; - if (mode !== undefined && mode !== 'pooled' && mode !== 'temp' && mode !== 'static') { - errors.push({ - severity: 'error', - filePath, - location: `${location}.mode`, - message: `${location}.mode must be 'pooled', 'temp', or 'static'.`, - }); - } - - const workspacePath = workspace.path; - if ( - workspacePath !== undefined && - (typeof workspacePath !== 'string' || workspacePath.trim().length === 0) - ) { - errors.push({ - severity: 'error', - filePath, - location: `${location}.path`, - message: `${location}.path must be a non-empty string.`, - }); - } + errors.push({ + severity: 'error', + filePath, + location, + message: `${location} has been removed from eval YAML. Put machine-local workspace_path/workspace_mode in .agentv/config.local.yaml under execution, or pass --workspace-path/--workspace-mode. Keep portable task setup in top-level workspace.`, + }); } async function validateCompositionDiagnostics( @@ -923,6 +893,26 @@ function validateWorkspaceRepoConfig( const docker = workspace.docker; + if ('mode' in workspace) { + errors.push({ + severity: 'error', + filePath, + location: `${location}.mode`, + message: + 'workspace.mode has been removed from eval YAML. Use workspace.isolation: shared|per_case for folder isolation; use --workspace-mode or config.local.yaml execution.workspace_mode only for machine-local runtime overrides.', + }); + } + + if ('path' in workspace) { + errors.push({ + severity: 'error', + filePath, + location: `${location}.path`, + message: + 'workspace.path has been removed from eval YAML. 
Put existing workspace paths in .agentv/config.local.yaml execution.workspace_path or pass --workspace-path.', + }); + } + if (isolation !== undefined && isolation !== 'shared' && isolation !== 'per_case') { errors.push({ severity: 'error', diff --git a/packages/core/src/evaluation/workspace/setup.ts b/packages/core/src/evaluation/workspace/setup.ts index 38c630d2c..660db6a84 100644 --- a/packages/core/src/evaluation/workspace/setup.ts +++ b/packages/core/src/evaluation/workspace/setup.ts @@ -13,7 +13,7 @@ import { execFile } from 'node:child_process'; import { existsSync } from 'node:fs'; -import { copyFile, mkdir, readdir, stat } from 'node:fs/promises'; +import { copyFile, mkdir, stat } from 'node:fs/promises'; import path from 'node:path'; import { promisify } from 'node:util'; @@ -208,8 +208,6 @@ function workspaceNeedsSharedSetup( return false; } return !!( - workspace.path || - workspace.mode === 'static' || workspace.template || workspace.hooks || workspace.repos?.length || @@ -421,21 +419,24 @@ export async function prepareSharedWorkspaceSetup( const cliWorkspacePath = workspacePath ?? legacyWorkspacePath; const sharedWorkspaceAppliesToAllCases = !!cliWorkspacePath; - const yamlWorkspacePath = suiteWorkspace?.path; if (cliWorkspacePath && workspaceMode && workspaceMode !== 'static') { throw new Error('--workspace-path requires --workspace-mode static when both are provided'); } let configuredMode: WorkspaceSetupMode = cliWorkspacePath ? 'static' - : (workspaceMode ?? suiteWorkspace?.mode ?? (yamlWorkspacePath ? 'static' : 'pooled')); - const configuredStaticPath = cliWorkspacePath ?? yamlWorkspacePath; + : (workspaceMode ?? 
'pooled'); + const configuredStaticPath = cliWorkspacePath; if (configuredMode === 'static' && !configuredStaticPath) { if (!suiteWorkspace?.repos?.length) { - setupLog('workspace.mode=static with no path and no repos — falling back to temp mode'); + setupLog( + 'runtime workspaceMode=static with no path and no repos — falling back to temp mode', + ); configuredMode = 'temp'; } else { - throw new Error('workspace.mode=static requires workspace.path or --workspace-path'); + throw new Error( + 'runtime workspaceMode=static requires --workspace-path or execution.workspace_path in config.local.yaml', + ); } } @@ -446,10 +447,6 @@ export async function prepareSharedWorkspaceSetup( 'static workspace mode is incompatible with isolation: per_case. Use isolation: shared (default).', ); } - if (configuredMode !== 'static' && configuredStaticPath) { - throw new Error('workspace.path requires workspace.mode=static'); - } - const hasSharedWorkspace = !!( useStaticWorkspace || (!isPerCaseWorkspace && @@ -493,31 +490,11 @@ export async function prepareSharedWorkspaceSetup( const hookExecutions: WorkspaceSetupHookExecution[] = []; const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50); - let staticMaterialised = false; - const isYamlConfiguredPath = !cliWorkspacePath && !!yamlWorkspacePath; let repoManager: RepoManager | undefined; try { if (useStaticWorkspace && configuredStaticPath) { - const dirExists = await stat(configuredStaticPath).then( - (s) => s.isDirectory(), - () => false, - ); - const isEmpty = dirExists ? 
(await readdir(configuredStaticPath)).length === 0 : false; - - if (isYamlConfiguredPath && (!dirExists || isEmpty)) { - if (!dirExists) { - await mkdir(configuredStaticPath, { recursive: true }); - } - if (workspaceTemplate) { - await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath); - setupLog(`copied template into static workspace: ${configuredStaticPath}`); - } - staticMaterialised = true; - setupLog(`materialised static workspace at: ${configuredStaticPath}`); - } else { - setupLog(`reusing existing static workspace: ${configuredStaticPath}`); - } + setupLog(`reusing existing static workspace: ${configuredStaticPath}`); sharedWorkspacePath = configuredStaticPath; } else if (!isPerCaseWorkspace && usePool && suiteWorkspace?.repos) { const slotsNeeded = workers; @@ -581,40 +558,19 @@ export async function prepareSharedWorkspaceSetup( const hasReposToMaterialize = !!suiteWorkspace?.repos?.length && !usePool && !isPerCaseWorkspace; - const needsRepoMaterialisation = - hasReposToMaterialize && (!useStaticWorkspace || staticMaterialised); - const needsPerRepoCheck = - hasReposToMaterialize && useStaticWorkspace && !staticMaterialised && isYamlConfiguredPath; + const needsRepoMaterialisation = hasReposToMaterialize && !useStaticWorkspace; repoManager = repoManager ?? - (needsRepoMaterialisation || needsPerRepoCheck + (needsRepoMaterialisation ? 
new RepoManager(verbose, { projectConfigDir: evalDir }) : undefined); - if ( - (needsRepoMaterialisation || needsPerRepoCheck) && - repoManager && - sharedWorkspacePath && - suiteWorkspace?.repos - ) { + if (needsRepoMaterialisation && repoManager && sharedWorkspacePath && suiteWorkspace?.repos) { try { - if (needsPerRepoCheck) { - for (const repo of suiteWorkspace.repos) { - if (!repo.path || !repo.repo) continue; - const targetDir = path.join(sharedWorkspacePath, repo.path); - if (existsSync(targetDir)) { - setupLog(`reusing existing repo at: ${targetDir}`); - continue; - } - setupLog(`materializing missing repo: ${repo.path}`); - await repoManager.materialize(repo, sharedWorkspacePath); - } - } else { - setupLog( - `materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`, - ); - await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath); - } + setupLog( + `materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`, + ); + await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath); setupLog('shared repo materialization complete'); } catch (error) { const message = error instanceof Error ? error.message : String(error); diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index bb912ad86..e7bd89634 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -332,8 +332,6 @@ export type EvalSuiteResult = { readonly threshold?: number; /** Top-level runtime block from `experiment:` or legacy `execution:`. */ readonly experimentConfig?: ExperimentConfig; - /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */ - readonly workspacePath?: string; /** Inline target definition from a TS eval config. */ readonly inlineTarget?: import('./providers/types.js').TargetDefinition; /** Custom provider factory from a TS eval config task(). 
*/ @@ -360,12 +358,8 @@ export async function loadTestSuite( const { loadTsEvalSuite } = await import('./loaders/ts-eval-loader.js'); return loadTsEvalSuite(evalFilePath, resolveToAbsolutePath(repoRoot), options); } - const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml( - evalFilePath, - repoRoot, - options, - ); - return buildEvalSuiteResult(parsed, tests, suiteWorkspacePath); + const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options); + return buildEvalSuiteResult(parsed, tests); } /** @deprecated Use `loadTestSuite` instead */ @@ -377,14 +371,14 @@ export async function loadTestSuiteFromYamlObject( repoRoot: URL | string, options?: LoadOptions, ): Promise { - const { tests, parsed, suiteWorkspacePath } = await loadTestsFromParsedYamlValue( + const { tests, parsed } = await loadTestsFromParsedYamlValue( suiteObject, evalFilePath, repoRoot, options, ); - return buildEvalSuiteResult(parsed, tests, suiteWorkspacePath); + return buildEvalSuiteResult(parsed, tests); } export async function loadTests( @@ -416,7 +410,7 @@ async function loadTestsFromYaml( evalFilePath: string, repoRoot: URL | string, options?: LoadOptions, -): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> { +): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject }> { const absoluteTestPath = path.resolve(evalFilePath); const currentImport: SuiteImportStackEntry = { identity: await canonicalEvalFileIdentity(absoluteTestPath), @@ -441,7 +435,7 @@ async function loadTestsFromParsedYamlValue( evalFilePath: string, repoRoot: URL | string, options?: LoadOptions, -): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject; suiteWorkspacePath?: string }> { +): Promise<{ tests: readonly EvalTest[]; parsed: JsonObject }> { const verbose = options?.verbose ?? 
false; const filterPattern = options?.filter; const absoluteTestPath = path.resolve(evalFilePath); @@ -557,6 +551,11 @@ async function loadTestsFromParsedYamlValue( // Extract per-case execution config early (reused below for skip_defaults) const caseExecution = isJsonObject(renderedCase.execution) ? renderedCase.execution : undefined; + if (caseExecution?.workspace !== undefined) { + throw new Error( + `test '${id ?? 'unknown'}'.execution.workspace has been removed from eval YAML. Put machine-local workspace_path/workspace_mode in .agentv/config.local.yaml under execution, or pass --workspace-path/--workspace-mode. Keep portable task setup in test workspace or suite workspace.`, + ); + } const skipDefaults = caseExecution?.skip_defaults === true; const caseThreshold = typeof caseExecution?.threshold === 'number' && @@ -823,15 +822,10 @@ async function loadTestsFromParsedYamlValue( return { tests: [...importedSuiteTests, ...results], parsed: suite, - suiteWorkspacePath: suiteWorkspace?.path, }; } -function buildEvalSuiteResult( - parsed: JsonObject, - tests: readonly EvalTest[], - suiteWorkspacePath?: string, -): EvalSuiteResult { +function buildEvalSuiteResult(parsed: JsonObject, tests: readonly EvalTest[]): EvalSuiteResult { const metadata = parseMetadata(parsed); const failOnError = extractFailOnError(parsed); const threshold = extractThreshold(parsed); @@ -848,7 +842,6 @@ function buildEvalSuiteResult( ...(failOnError !== undefined && { failOnError }), ...(threshold !== undefined && { threshold }), ...(experimentConfig !== undefined && { experimentConfig }), - ...(suiteWorkspacePath !== undefined && { workspacePath: suiteWorkspacePath }), }; } @@ -1725,14 +1718,28 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi const obj = raw as Record; if ('static_path' in obj) { throw new Error( - 'workspace.static_path has been removed. 
Use workspace.path with workspace.mode=static.', + 'workspace.static_path has been removed from eval YAML. Put existing workspace paths in .agentv/config.local.yaml execution.workspace_path or pass --workspace-path.', ); } if ('pool' in obj) { - throw new Error("workspace.pool has been removed. Use workspace.mode='pooled' or 'temp'."); + throw new Error( + 'workspace.pool has been removed from eval YAML. Shared repo workspaces are pooled by default; use --workspace-mode or config.local.yaml execution.workspace_mode for machine-local runtime overrides.', + ); } if ('static' in obj) { - throw new Error("workspace.static has been removed. Use workspace.mode='static'."); + throw new Error( + 'workspace.static has been removed from eval YAML. Put existing workspace paths in .agentv/config.local.yaml execution.workspace_path or pass --workspace-path.', + ); + } + if ('mode' in obj) { + throw new Error( + 'workspace.mode has been removed from eval YAML. Use workspace.isolation: shared|per_case for folder isolation; use --workspace-mode or config.local.yaml execution.workspace_mode only for machine-local runtime overrides.', + ); + } + if ('path' in obj) { + throw new Error( + 'workspace.path has been removed from eval YAML. Put existing workspace paths in .agentv/config.local.yaml execution.workspace_path or pass --workspace-path.', + ); } let template = typeof obj.template === 'string' ? obj.template : undefined; @@ -1753,24 +1760,17 @@ function parseWorkspaceConfig(raw: unknown, evalFileDir: string): WorkspaceConfi : undefined; const hooks = parseWorkspaceHooksConfig(obj.hooks, evalFileDir); - const explicitMode = - obj.mode === 'pooled' || obj.mode === 'temp' || obj.mode === 'static' ? obj.mode : undefined; - const workspacePath = typeof obj.path === 'string' ? obj.path : undefined; - const mode = explicitMode ?? (workspacePath ? 
'static' : undefined); const docker = parseDockerWorkspaceConfig(obj.docker); const env = parseWorkspaceEnvConfig(obj.env); - if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env) - return undefined; + if (!template && !isolation && !repos && !hooks && !docker && !env) return undefined; return { ...(template !== undefined && { template }), ...(isolation !== undefined && { isolation }), ...(repos !== undefined && { repos }), ...(hooks !== undefined && { hooks }), - ...(mode !== undefined && { mode }), - ...(workspacePath !== undefined && { path: workspacePath }), ...(docker !== undefined && { docker }), ...(env !== undefined && { env }), }; @@ -1850,8 +1850,6 @@ function mergeWorkspaceConfigs( isolation: caseLevel.isolation ?? suiteLevel.isolation, repos: caseLevel.repos ?? suiteLevel.repos, ...(hasHooks && { hooks: mergedHooks as WorkspaceHooksConfig }), - mode: caseLevel.mode ?? suiteLevel.mode, - path: caseLevel.path ?? suiteLevel.path, docker: caseLevel.docker ?? suiteLevel.docker, env: caseLevel.env ?? suiteLevel.env, workspaceFileDir: caseLevel.workspaceFileDir ?? 
suiteLevel.workspaceFileDir, diff --git a/packages/core/test/evaluation/eval-inline-experiment.test.ts b/packages/core/test/evaluation/eval-inline-experiment.test.ts index 17ce9604a..99477ed06 100644 --- a/packages/core/test/evaluation/eval-inline-experiment.test.ts +++ b/packages/core/test/evaluation/eval-inline-experiment.test.ts @@ -90,6 +90,28 @@ describe('eval.yaml inline experiment and tests imports', () => { await expect(loadTestSuite(conflictPath, tempDir)).rejects.toThrow(/experiment.*execution/); }); + it('rejects per-test execution workspace blocks', async () => { + const evalPath = path.join(tempDir, 'test-execution-workspace.eval.yaml'); + await writeFile( + evalPath, + [ + 'tests:', + ' - id: one', + ' input: hello', + ' criteria: ok', + ' execution:', + ' workspace:', + ' mode: static', + ' path: /tmp/ws', + '', + ].join('\n'), + ); + + await expect(loadTestSuite(evalPath, tempDir)).rejects.toThrow( + /execution\.workspace has been removed from eval YAML/, + ); + }); + it('globs raw case files through tests[].include with deterministic ordering and select filters', async () => { const casesDir = path.join(tempDir, 'cases'); await mkdir(casesDir, { recursive: true }); @@ -396,7 +418,7 @@ describe('eval.yaml inline experiment and tests imports', () => { ' timeout_seconds: 10', ' budget_usd: 0.5', 'workspace:', - ' path: ./child-workspace', + ' template: ./child-workspace', 'input: child shared input', 'assertions:', ' - type: contains', @@ -441,7 +463,7 @@ describe('eval.yaml inline experiment and tests imports', () => { expect(suite.experimentConfig?.repeat).toMatchObject({ count: 3, strategy: 'pass_at_k' }); expect(test.run).toBeUndefined(); expect(test.suite).toBe('child-suite'); - expect(test.workspace?.path).toBe('./child-workspace'); + expect(test.workspace?.template).toBe(path.join(tempDir, 'child-workspace')); expect(test.input.map((message) => message.content)).toEqual([ 'child shared input', 'child case input', @@ -470,7 +492,7 @@ 
describe('eval.yaml inline experiment and tests imports', () => { [ 'name: parent-suite', 'workspace:', - ' path: ./parent-workspace', + ' template: ./parent-workspace', 'tests:', ' - include: child.eval.yaml', ' type: suite', @@ -511,7 +533,7 @@ describe('eval.yaml inline experiment and tests imports', () => { ); await expect(loadTestSuite(parentPath, tempDir)).rejects.toThrow( - /Parent workspace is not allowed.*experiment\.workspace/, + /Experiment workspace has been removed from eval YAML/, ); }); @@ -543,7 +565,7 @@ describe('eval.yaml inline experiment and tests imports', () => { ); await expect(loadTestSuite(parentPath, tempDir)).rejects.toThrow( - /Parent workspace is not allowed.*execution\.workspace/, + /Experiment workspace has been removed from eval YAML/, ); }); @@ -754,7 +776,7 @@ describe('eval.yaml inline experiment and tests imports', () => { [ 'name: parent-suite', 'workspace:', - ' path: ./parent-workspace', + ' template: ./parent-workspace', 'input: parent shared input', 'assertions:', ' - type: contains', @@ -774,7 +796,7 @@ describe('eval.yaml inline experiment and tests imports', () => { 'parent shared input', 'raw case input', ]); - expect(test.workspace?.path).toBe('./parent-workspace'); + expect(test.workspace?.template).toBe(path.join(tempDir, 'parent-workspace')); expect(test.assertions?.[0]).toMatchObject({ type: 'contains', value: 'parent' }); }); }); diff --git a/packages/core/test/evaluation/experiment.test.ts b/packages/core/test/evaluation/experiment.test.ts index ff7244ffc..e4d4f1ea8 100644 --- a/packages/core/test/evaluation/experiment.test.ts +++ b/packages/core/test/evaluation/experiment.test.ts @@ -21,7 +21,6 @@ describe('inline experiment config', () => { threshold: 0.8, budget_usd: 1.25, sandbox: 'auto', - workspace: { mode: 'static', path: './workspace' }, }); expect(config).toMatchObject({ @@ -37,7 +36,6 @@ describe('inline experiment config', () => { workers: 4, budgetUsd: 1.25, sandbox: 'auto', - workspace: { mode: 
'static', path: './workspace' }, }); expect(config.fingerprint).toMatch(/^[a-f0-9]{64}$/); }); @@ -93,11 +91,11 @@ describe('inline experiment config', () => { /scripts are not supported/, ); expect(() => normalizeExperimentConfig({ workspace: { isolation: 'per_test' } })).toThrow( - /supports only mode and path/, + /Experiment workspace has been removed from eval YAML/, ); expect(() => normalizeExperimentConfig({ workspace: { repos: [{ repo: 'acme/support-app' }] } }), - ).toThrow(/supports only mode and path/); + ).toThrow(/Experiment workspace has been removed from eval YAML/); }); it('builds safe snake_case artifact metadata without agent options', () => { diff --git a/packages/core/test/evaluation/loaders/config-loader.test.ts b/packages/core/test/evaluation/loaders/config-loader.test.ts index adb5a0c62..3fdd76e39 100644 --- a/packages/core/test/evaluation/loaders/config-loader.test.ts +++ b/packages/core/test/evaluation/loaders/config-loader.test.ts @@ -122,6 +122,7 @@ describe('loadConfig', () => { ' - "**/*.local.eval.yaml"', 'execution:', ' keep_workspaces: true', + ' workspace_path: /tmp/agentv-local-workspace', 'results:', ' repo:', ' branch: local-results', @@ -135,6 +136,7 @@ describe('loadConfig', () => { expect(config?.execution).toEqual({ verbose: true, keep_workspaces: true, + workspace_path: '/tmp/agentv-local-workspace', pool_slots: 2, }); expect(config?.results).toEqual({ @@ -147,6 +149,44 @@ describe('loadConfig', () => { } }); + it('ignores workspace runtime bindings in committed config.yaml before applying local overlays', async () => { + const tempDir = mkdtempSync(path.join(os.tmpdir(), 'agentv-local-only-workspace-')); + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const projectDir = path.join(tempDir, 'project'); + const evalDir = path.join(projectDir, 'evals'); + const localConfigDir = path.join(projectDir, '.agentv'); + mkdirSync(evalDir, { recursive: true }); + mkdirSync(localConfigDir, { recursive: 
true }); + writeFileSync( + path.join(localConfigDir, 'config.yaml'), + [ + 'execution:', + ' keep_workspaces: true', + ' workspace_mode: static', + ' workspace_path: /tmp/committed-workspace', + '', + ].join('\n'), + ); + writeFileSync( + path.join(localConfigDir, 'config.local.yaml'), + ['execution:', ' verbose: true', ''].join('\n'), + ); + + const config = await loadConfig(path.join(evalDir, 'suite.eval.yaml'), projectDir); + + expect(config?.execution).toEqual({ + keep_workspaces: true, + verbose: true, + }); + expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('execution.workspace_mode')); + expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('execution.workspace_path')); + } finally { + warnSpy.mockRestore(); + rmSync(tempDir, { recursive: true, force: true }); + } + }); + it('treats project-local config.local.yaml alone as configured and does not fall back global', async () => { const tempDir = mkdtempSync(path.join(os.tmpdir(), 'agentv-local-only-overlay-')); try { @@ -934,6 +974,37 @@ describe('parseExecutionDefaults', () => { expect(result?.keep_workspaces).toBe(true); }); + it('parses workspace runtime bindings', () => { + const result = parseExecutionDefaults( + { + workspace_mode: 'static', + workspace_path: ' /tmp/agentv-workspace ', + }, + '/test/config.local.yaml', + ); + expect(result?.workspace_mode).toBe('static'); + expect(result?.workspace_path).toBe('/tmp/agentv-workspace'); + }); + + it('ignores workspace runtime bindings outside local config', () => { + const warnSpy = spyOn(console, 'warn').mockImplementation(() => {}); + try { + const result = parseExecutionDefaults( + { + verbose: true, + workspace_mode: 'static', + workspace_path: '/tmp/agentv-workspace', + }, + '/test/config.yaml', + ); + expect(result).toEqual({ verbose: true }); + expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('execution.workspace_mode')); + expect(warnSpy).toHaveBeenCalledWith(expect.stringContaining('execution.workspace_path')); + } 
finally { + warnSpy.mockRestore(); + } + }); + it('parses otel_file string', () => { const result = parseExecutionDefaults( { otel_file: '.agentv/results/otel.json' }, @@ -947,13 +1018,15 @@ describe('parseExecutionDefaults', () => { { verbose: true, keep_workspaces: false, + workspace_mode: 'temp', otel_file: 'otel.json', }, - '/test/config.yaml', + '/test/config.local.yaml', ); expect(result).toEqual({ verbose: true, keep_workspaces: false, + workspace_mode: 'temp', otel_file: 'otel.json', }); }); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 79cfd34b0..3e7e81e7c 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -3481,7 +3481,7 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id expect(results[0].error).toBeUndefined(); }); - it('materializes only missing repos in YAML-configured static workspace', async () => { + it('uses runtime workspacePath as an existing workspace without materializing repos', async () => { const { mkdtemp, mkdir: fsMkdir, @@ -3501,15 +3501,11 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id const missingRepoBSource = path.join(testDir, 'missing-repo-b-source'); - // Use YAML workspace.path (not CLI --workspace) with mixed repo states. - // repo-a exists → should be reused. repo-b points to a missing file:// repo and fails - // during materialization. This proves the per-repo existence check skips repo-a without - // depending on network timeouts from cloning fake remotes. + // Runtime workspacePath points at an existing machine-local workspace. It is used + // as-is and does not materialize workspace.repos from the portable eval contract. 
const evalCase: EvalTest = { ...baseTestCase, workspace: { - mode: 'static', - path: testDir, repos: [ { path: 'repo-a', @@ -3524,79 +3520,30 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id }, }; - // repo-b materialization fails, which proves repo-a was skipped and only repo-b was attempted. const savedAgentvHome = process.env.AGENTV_HOME; const savedAgentvDataDir = process.env.AGENTV_DATA_DIR; process.env.AGENTV_HOME = path.join(testDir, 'agentv-home'); process.env.AGENTV_DATA_DIR = path.join(testDir, 'agentv-data'); try { - await expect( - runEvaluation({ - testFilePath: 'in-memory.yaml', - repoRoot: 'in-memory', - target: baseTarget, - providerFactory: () => provider, - evaluators: evaluatorRegistry, - evalCases: [evalCase], - keepWorkspaces: true, - }), - ).rejects.toThrow('Failed to materialize repos'); + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: 'in-memory', + target: baseTarget, + providerFactory: () => provider, + evaluators: evaluatorRegistry, + evalCases: [evalCase], + workspacePath: testDir, + keepWorkspaces: true, + }); + expect(results).toHaveLength(1); + expect(results[0].error).toBeUndefined(); } finally { process.env.AGENTV_HOME = savedAgentvHome; process.env.AGENTV_DATA_DIR = savedAgentvDataDir; } - // repo-a marker should still exist (not deleted by static workspace cleanup) await fsAccess(path.join(repoADir, 'marker.txt')); - }); - - it('skips all repos when all exist in YAML-configured static workspace', async () => { - const { mkdtemp, mkdir: fsMkdir, writeFile } = await import('node:fs/promises'); - testDir = await mkdtemp(path.join(tmpdir(), 'agentv-ws-static-')); - - // Pre-create both repos - await fsMkdir(path.join(testDir, 'repo-a'), { recursive: true }); - await writeFile(path.join(testDir, 'repo-a', 'file.txt'), 'a'); - await fsMkdir(path.join(testDir, 'repo-b'), { recursive: true }); - await writeFile(path.join(testDir, 'repo-b', 'file.txt'), 'b'); - - const 
provider = new SequenceProvider('mock', { - responses: [{ output: [{ role: 'assistant', content: [{ type: 'text', text: 'answer' }] }] }], - }); - - // Both repos exist → no clone attempts → should succeed without network - const evalCase: EvalTest = { - ...baseTestCase, - workspace: { - mode: 'static', - path: testDir, - repos: [ - { - path: 'repo-a', - repo: 'https://github.com/example/repo-a.git', - commit: 'main', - }, - { - path: 'repo-b', - repo: 'https://github.com/example/repo-b.git', - commit: 'main', - }, - ], - }, - }; - - const results = await runEvaluation({ - testFilePath: 'in-memory.yaml', - repoRoot: 'in-memory', - target: baseTarget, - providerFactory: () => provider, - evaluators: evaluatorRegistry, - evalCases: [evalCase], - keepWorkspaces: true, - }); - - expect(results).toHaveLength(1); - expect(results[0].error).toBeUndefined(); + await expect(fsAccess(path.join(testDir, 'repo-b'))).rejects.toThrow(); }); it('falls back to temp mode when workspaceMode is static with no path and no repos', async () => { @@ -3640,7 +3587,9 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id evalCases: [evalCase], workspaceMode: 'static', }), - ).rejects.toThrow('workspace.mode=static requires workspace.path or --workspace-path'); + ).rejects.toThrow( + 'runtime workspaceMode=static requires --workspace-path or execution.workspace_path in config.local.yaml', + ); }); it('errors when workspace path is combined with non-static workspaceMode', async () => { diff --git a/packages/core/test/evaluation/repo-schema-validation.test.ts b/packages/core/test/evaluation/repo-schema-validation.test.ts index 86a3abdaf..27f55cc48 100644 --- a/packages/core/test/evaluation/repo-schema-validation.test.ts +++ b/packages/core/test/evaluation/repo-schema-validation.test.ts @@ -183,7 +183,7 @@ describe('repo lifecycle schema validation', () => { expect(result.success).toBe(false); }); - it('rejects removed experiment workspace isolation per_test value', 
() => { + it('rejects experiment workspace blocks', () => { const result = EvalFileSchema.safeParse({ ...baseEval, experiment: { @@ -195,7 +195,7 @@ describe('repo lifecycle schema validation', () => { expect(result.success).toBe(false); }); - it('accepts experiment workspace runtime override fields', () => { + it('rejects experiment workspace runtime override fields', () => { const result = EvalFileSchema.safeParse({ ...baseEval, experiment: { @@ -205,7 +205,25 @@ describe('repo lifecycle schema validation', () => { }, }, }); - expect(result.success).toBe(true); + expect(result.success).toBe(false); + }); + + it('rejects test execution workspace blocks', () => { + const result = EvalFileSchema.safeParse({ + ...baseEval, + tests: [ + { + ...baseEval.tests[0], + execution: { + workspace: { + mode: 'static', + path: '/tmp/my-workspace', + }, + }, + }, + ], + }); + expect(result.success).toBe(false); }); it('rejects task workspace fields in experiment workspace', () => { @@ -225,25 +243,24 @@ describe('repo lifecycle schema validation', () => { expect(result.success).toBe(false); }); - it('accepts workspace.mode=temp', () => { + it('rejects removed workspace.mode', () => { const result = EvalFileSchema.safeParse({ ...baseEval, workspace: { mode: 'temp', }, }); - expect(result.success).toBe(true); + expect(result.success).toBe(false); }); - it('accepts workspace.path for static mode', () => { + it('rejects removed workspace.path', () => { const result = EvalFileSchema.safeParse({ ...baseEval, workspace: { - mode: 'static', path: '/tmp/my-workspace', }, }); - expect(result.success).toBe(true); + expect(result.success).toBe(false); }); it('rejects removed workspace.static_path field', () => { diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index b3c3f059a..f0a18c0e7 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ 
b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -191,7 +191,7 @@ tests: ).toBe(true); }); - it('rejects removed isolation values in experiment workspace', async () => { + it('rejects experiment workspace blocks', async () => { const filePath = path.join(tempDir, 'experiment-workspace-legacy-isolation.eval.yaml'); await writeFile( filePath, @@ -212,8 +212,8 @@ tests: result.errors.some( (error) => error.severity === 'error' && - error.location === 'experiment.workspace.isolation' && - error.message.includes('supports only mode and path'), + error.location === 'experiment.workspace' && + error.message.includes('has been removed from eval YAML'), ), ).toBe(true); }); @@ -241,13 +241,13 @@ tests: result.errors.some( (error) => error.severity === 'error' && - error.location === 'experiment.workspace.repos' && - error.message.includes('supports only mode and path'), + error.location === 'experiment.workspace' && + error.message.includes('has been removed from eval YAML'), ), ).toBe(true); }); - it('accepts runtime workspace overrides in experiment workspace', async () => { + it('rejects runtime workspace overrides in experiment workspace', async () => { const filePath = path.join(tempDir, 'experiment-workspace-runtime.eval.yaml'); await writeFile( filePath, @@ -264,7 +264,43 @@ tests: const result = await validateEvalFile(filePath); - expect(result.valid).toBe(true); + expect(result.valid).toBe(false); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'experiment.workspace' && + error.message.includes('has been removed from eval YAML'), + ), + ).toBe(true); + }); + + it('rejects test execution workspace blocks', async () => { + const filePath = path.join(tempDir, 'test-execution-workspace.eval.yaml'); + await writeFile( + filePath, + `tests: + - id: test-1 + criteria: Goal + input: Query + execution: + workspace: + mode: static + path: /tmp/my-workspace +`, + ); + + const result = await 
validateEvalFile(filePath); + + expect(result.valid).toBe(false); + expect( + result.errors.some( + (error) => + error.severity === 'error' && + error.location === 'tests[0].execution.workspace' && + error.message.includes('has been removed from eval YAML'), + ), + ).toBe(true); }); it('warns that imported child experiments are ignored by wrapper composition', async () => { @@ -307,7 +343,7 @@ tests: await writeFile( path.join(tempDir, 'composition-child-tests-import.eval.yaml'), `workspace: - path: ./child-workspace + template: ./child-workspace input: child suite input assertions: - type: contains @@ -322,7 +358,7 @@ tests: await writeFile( filePath, `workspace: - path: ./parent-workspace + template: ./parent-workspace tests: - include: composition-child-tests-import.eval.yaml type: tests @@ -1170,8 +1206,8 @@ tests: "./cases-shorthand-workspace.yaml" result.errors.some( (error) => error.severity === 'error' && - error.location === 'experiment.workspace.isolation' && - error.message.includes('supports only mode and path'), + error.location === 'experiment.workspace' && + error.message.includes('has been removed from eval YAML'), ), ).toBe(true); }); diff --git a/packages/core/test/evaluation/workspace-config-parsing.test.ts b/packages/core/test/evaluation/workspace-config-parsing.test.ts index d104b3f0c..c10317ebb 100644 --- a/packages/core/test/evaluation/workspace-config-parsing.test.ts +++ b/packages/core/test/evaluation/workspace-config-parsing.test.ts @@ -390,8 +390,8 @@ tests: ); }); - it('infers workspace.mode=static when workspace.path is provided without mode', async () => { - const evalFile = path.join(testDir, 'workspace-path-implies-static.yaml'); + it('rejects removed workspace.path', async () => { + const evalFile = path.join(testDir, 'workspace-path-removed.yaml'); await writeFile( evalFile, ` @@ -405,10 +405,25 @@ tests: `, ); - const cases = await loadTests(evalFile, testDir); - expect(cases).toHaveLength(1); - 
expect(cases[0].workspace?.mode).toBe('static'); - expect(cases[0].workspace?.path).toBe('/tmp/shared-workspace'); + await expect(loadTests(evalFile, testDir)).rejects.toThrow(/workspace\.path has been removed/i); + }); + + it('rejects removed workspace.mode', async () => { + const evalFile = path.join(testDir, 'workspace-mode-removed.yaml'); + await writeFile( + evalFile, + ` +workspace: + mode: temp + +tests: + - id: case-1 + input: "Hello" + criteria: "Should parse" +`, + ); + + await expect(loadTests(evalFile, testDir)).rejects.toThrow(/workspace\.mode has been removed/i); }); it('rejects removed workspace.static_path field', async () => { diff --git a/plugins/agentv-dev/skills/agentv-dev/SKILL.md b/plugins/agentv-dev/skills/agentv-dev/SKILL.md index 92c33628f..de071a95b 100644 --- a/plugins/agentv-dev/skills/agentv-dev/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-dev/SKILL.md @@ -22,6 +22,7 @@ agentv skills get |-------|---------|----------| | agentv-bench | `agentv skills get agentv-bench` | Run evals, benchmark agents, optimize against evals, compare targets, autoresearch | | agentv-eval-writer | `agentv skills get agentv-eval-writer` | Write, edit, or validate eval YAML files | +| agentv-eval-migrations | `agentv skills get agentv-eval-migrations` | Migrate eval YAML across breaking schema changes | | agentv-eval-review | `agentv skills get agentv-eval-review` | Review, lint, or check eval quality before committing | | agentv-governance | `agentv skills get agentv-governance` | Author or lint governance blocks (OWASP, MITRE, EU AI Act, ISO 42001) | | agentv-trace-analyst | `agentv skills get agentv-trace-analyst` | Analyze eval traces, find regressions, inspect tool trajectories | diff --git a/skills-data/agentv-eval-migrations/SKILL.md b/skills-data/agentv-eval-migrations/SKILL.md new file mode 100644 index 000000000..037f42dc0 --- /dev/null +++ b/skills-data/agentv-eval-migrations/SKILL.md @@ -0,0 +1,28 @@ +--- +name: agentv-eval-migrations 
+description: >- + Migrate AgentV eval YAML across breaking schema changes, especially workspace + contract updates and portable-vs-local runtime binding cleanup. +--- + +# AgentV Eval Migrations + +Use this skill when updating existing AgentV eval YAML, examples, docs, or +generated eval authoring guidance after a schema-breaking change. + +Before editing, read `references/breaking-changes.md` and compare the eval file +against the current portable contract: + +- Keep committed eval YAML portable: prompts, cases, assertions, workspace + templates, repos, hooks, env checks, Docker preflight/container config, and + `workspace.isolation`. +- Do not put machine-local existing workspace directories in eval YAML. Use + `--workspace-path` for one-off runs or `.agentv/config.local.yaml` with + `execution.workspace_path` for persistent local binding. +- Use `workspace.isolation: shared | per_case` for folder isolation. Docker + config is not a replacement for workspace folder isolation. +- Keep wire-format fields in `snake_case` and TypeScript internals in + `camelCase`. + +After migration, validate with `agentv eval --dry-run` when possible, or +run the repo's parser/schema tests for generated examples and fixtures. diff --git a/skills-data/agentv-eval-migrations/references/breaking-changes.md b/skills-data/agentv-eval-migrations/references/breaking-changes.md new file mode 100644 index 000000000..e9c0da31d --- /dev/null +++ b/skills-data/agentv-eval-migrations/references/breaking-changes.md @@ -0,0 +1,110 @@ +# AgentV Eval YAML Breaking Changes + +This reference tracks schema changes that eval authors and migration agents +should apply when modernizing AgentV eval files. + +## Workspace Isolation Spelling + +Old: + +```yaml +workspace: + isolation: per_test +``` + +New: + +```yaml +workspace: + isolation: per_case +``` + +Use `per_case` for a fresh workspace folder per eval case. Use `shared` when +cases share one prepared workspace. 
+ +## Suite Wrapper Workspace Ownership + +Eval files that import child eval suites with `type: suite` cannot define a +parent `workspace`. Imported suites own their task environment, including repos, +templates, hooks, Docker config, env checks, and isolation. + +If the parent should own workspace context, import raw cases with `type: tests` +or direct path shorthand instead of importing suites. + +## Runtime Workspace Blocks Removed From Eval YAML + +Do not author these blocks in eval YAML: + +```yaml +experiment: + workspace: + mode: static + path: /path/to/local/workspace + +execution: + workspace: + mode: static + path: /path/to/local/workspace +``` + +Existing local workspace directories are machine-local runtime bindings. Use one +of these instead: + +```bash +agentv eval evals/my-eval.yaml --workspace-path /path/to/local/workspace +``` + +```yaml +# .agentv/config.local.yaml +execution: + workspace_path: /path/to/local/workspace + workspace_mode: static +``` + +Keep portable task setup under top-level or case-level `workspace`. + +## Workspace Mode and Path Removed From Eval YAML + +Do not author `workspace.mode`, `workspace.path`, `workspace.static_path`, +`workspace.static`, or `workspace.pool` in eval YAML. + +Old: + +```yaml +workspace: + mode: static + path: /path/to/local/workspace +``` + +New portable eval YAML: + +```yaml +workspace: + repos: + - path: ./repo + repo: org/repo + commit: main + isolation: shared +``` + +Optional local binding: + +```yaml +# .agentv/config.local.yaml +execution: + workspace_path: /path/to/local/workspace +``` + +Shared repo workspaces are pooled by default. Use +`--workspace-mode temp` or `execution.workspace_mode: temp` in local config to +force fresh temporary materialization for a local run. Use +`--workspace-path` or `execution.workspace_path` when an existing directory +should be used as-is. + +## Docker Is Not Folder Isolation + +`workspace.docker` describes environment, preflight, or container bindings. 
It +does not replace `workspace.isolation`. + +Use `workspace.isolation: shared | per_case` for workspace folder reuse versus +per-case folders, regardless of whether Docker is configured. diff --git a/skills-data/agentv-eval-writer/SKILL.md b/skills-data/agentv-eval-writer/SKILL.md index 1cc0ea8f6..f38a40253 100644 --- a/skills-data/agentv-eval-writer/SKILL.md +++ b/skills-data/agentv-eval-writer/SKILL.md @@ -384,7 +384,6 @@ workspace: after_each: reset: fast # none | fast | strict isolation: shared # shared | per_case - mode: pooled # pooled | temp | static ``` - `repo`: full clone URL or GitHub `org/name` shorthand @@ -393,8 +392,9 @@ workspace: - `ancestor`: walk N commits back from the checked-out ref - `sparse`: sparse checkout paths array - Do not use legacy `source`, `type`, `checkout`, `resolve`, or `clone` fields under `workspace.repos[]` -- `mode`: `pooled` (default for shared repos), `temp`, or `static` -- `path`: workspace path used when `mode: static`; when empty/missing the workspace is auto-materialised (template copied + repos cloned); populated dirs are reused as-is +- Do not author `workspace.mode`, `workspace.path`, `experiment.workspace`, or `execution.workspace` in eval YAML +- Shared repo workspaces are pooled by default; use `--workspace-mode temp` or `.agentv/config.local.yaml` with `execution.workspace_mode: temp` for a local fresh-clone run +- Existing local workspace directories are machine-local bindings; use `--workspace-path` or `.agentv/config.local.yaml` with `execution.workspace_path` - `hooks.enabled`: boolean (default `true`); set `false` to skip all lifecycle hooks - Pool reset defaults to `fast` (`git clean -fd`); use `--workspace-clean full` for strict reset (`git clean -fdx`) - Pool entries are managed separately via `agentv workspace list` and `agentv workspace clean` diff --git a/skills-data/agentv-eval-writer/references/config-schema.json b/skills-data/agentv-eval-writer/references/config-schema.json index 
3a75a1ea4..6633383be 100644 --- a/skills-data/agentv-eval-writer/references/config-schema.json +++ b/skills-data/agentv-eval-writer/references/config-schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "AgentV Config Schema", - "description": "Schema for .agentv/config.yaml configuration files", + "description": "Schema for .agentv/config.yaml and .agentv/config.local.yaml configuration files", "type": "object", "properties": { "$schema": { @@ -37,6 +37,16 @@ "description": "Always keep temp workspaces after eval (equivalent to --keep-workspaces)", "default": false }, + "workspace_path": { + "type": "string", + "description": "Machine-local existing workspace directory for eval runs (equivalent to --workspace-path). Only honored in .agentv/config.local.yaml; do not commit local paths.", + "examples": ["/home/user/workspaces/my-eval"] + }, + "workspace_mode": { + "type": "string", + "description": "Machine-local workspace preparation override (equivalent to --workspace-mode). Only honored in .agentv/config.local.yaml. Use static with workspace_path, temp for fresh materialization, or pooled for default shared repo pooling.", + "enum": ["pooled", "temp", "static"] + }, "otel_file": { "type": "string", "description": "Write OTLP JSON trace to this path (equivalent to --otel-file). 
Supports {timestamp} placeholder.", diff --git a/skills-data/agentv-eval-writer/references/eval-schema.json b/skills-data/agentv-eval-writer/references/eval-schema.json index 0080ca074..86f227700 100644 --- a/skills-data/agentv-eval-writer/references/eval-schema.json +++ b/skills-data/agentv-eval-writer/references/eval-schema.json @@ -5178,6 +5178,9 @@ "type": "number", "minimum": 0, "maximum": 1 + }, + "workspace": { + "not": {} } }, "additionalProperties": false @@ -5459,13 +5462,6 @@ }, "additionalProperties": false }, - "mode": { - "type": "string", - "enum": ["pooled", "temp", "static"] - }, - "path": { - "type": "string" - }, "docker": { "type": "object", "properties": { @@ -11985,6 +11981,9 @@ "type": "number", "minimum": 0, "maximum": 1 + }, + "workspace": { + "not": {} } }, "additionalProperties": false @@ -12266,13 +12265,6 @@ }, "additionalProperties": false }, - "mode": { - "type": "string", - "enum": ["pooled", "temp", "static"] - }, - "path": { - "type": "string" - }, "docker": { "type": "object", "properties": { @@ -16132,6 +16124,9 @@ "minimum": 0, "maximum": 1 }, + "workspace": { + "not": {} + }, "agent": { "type": "string", "minLength": 1 @@ -16187,20 +16182,6 @@ "type": "string", "enum": ["auto", "docker", "vercel"] }, - "workspace": { - "type": "object", - "properties": { - "mode": { - "type": "string", - "enum": ["pooled", "temp", "static"] - }, - "path": { - "type": "string", - "minLength": 1 - } - }, - "additionalProperties": false - }, "setup": { "not": {} } @@ -18573,6 +18554,9 @@ "minimum": 0, "maximum": 1 }, + "workspace": { + "not": {} + }, "agent": { "type": "string", "minLength": 1 @@ -18628,20 +18612,6 @@ "type": "string", "enum": ["auto", "docker", "vercel"] }, - "workspace": { - "type": "object", - "properties": { - "mode": { - "type": "string", - "enum": ["pooled", "temp", "static"] - }, - "path": { - "type": "string", - "minLength": 1 - } - }, - "additionalProperties": false - }, "setup": { "not": {} } @@ -20057,13 +20027,6 @@ }, 
"additionalProperties": false }, - "mode": { - "type": "string", - "enum": ["pooled", "temp", "static"] - }, - "path": { - "type": "string" - }, "docker": { "type": "object", "properties": {